1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* 27 * Just in case we're not in a build environment, make sure that 28 * TEXT_DOMAIN gets set to something. 
29 */ 30 #if !defined(TEXT_DOMAIN) 31 #define TEXT_DOMAIN "SYS_TEST" 32 #endif 33 34 /* 35 * Metadevice diskset interfaces 36 */ 37 38 #include "meta_set_prv.h" 39 #include <meta.h> 40 #include <metad.h> 41 #include <mdmn_changelog.h> 42 #include <sys/lvm/md_crc.h> 43 #include <sys/utsname.h> 44 #include <sdssc.h> 45 46 #include <sys/sysevent/eventdefs.h> 47 #include <sys/sysevent/svm.h> 48 extern char *blkname(char *); 49 50 static md_drive_desc * 51 dr2drivedesc( 52 mdsetname_t *sp, 53 side_t sideno, 54 int flags, 55 md_error_t *ep 56 ) 57 { 58 md_set_record *sr; 59 md_drive_record *dr; 60 mddrivename_t *dnp; 61 md_drive_desc *dd_head = NULL; 62 md_set_desc *sd; 63 64 if (flags & MD_BYPASS_DAEMON) { 65 if ((sr = metad_getsetbynum(sp->setno, ep)) == NULL) 66 return (NULL); 67 sd = metaget_setdesc(sp, ep); 68 sideno = getnodeside(mynode(), sd); 69 sp = metafakesetname(sp->setno, sr->sr_setname); 70 } else { 71 if ((sr = getsetbyname(sp->setname, ep)) == NULL) 72 return (NULL); 73 } 74 75 assert(sideno != MD_SIDEWILD); 76 77 /* 78 * WARNING: 79 * The act of getting the dnp from the namespace means that we 80 * will get the devid of the disk as recorded in the namespace. 81 * This devid has the potential to be stale if the disk is being 82 * replaced via a rebind, this means that any code that relies 83 * on any of the dnp information should take the appropriate action 84 * to preserve that information. For example in the rebind code the 85 * devid of the new disk is saved off and then copied back in once 86 * the code that has called this function has completed. 
87 */ 88 for (dr = sr->sr_drivechain; dr != NULL; dr = dr->dr_next) { 89 if ((dnp = metadrivename_withdrkey(sp, sideno, dr->dr_key, 90 flags, ep)) == NULL) { 91 if (!(flags & MD_BYPASS_DAEMON)) 92 free_sr(sr); 93 metafreedrivedesc(&dd_head); 94 return (NULL); 95 } 96 97 (void) metadrivedesc_append(&dd_head, dnp, dr->dr_dbcnt, 98 dr->dr_dbsize, dr->dr_ctime, dr->dr_genid, dr->dr_flags); 99 } 100 101 if (!(flags & MD_BYPASS_DAEMON)) { 102 free_sr(sr); 103 } 104 return (dd_head); 105 } 106 107 static int 108 get_sidenmlist( 109 mdsetname_t *sp, 110 mddrivename_t *dnp, 111 md_error_t *ep 112 ) 113 { 114 md_set_desc *sd; 115 mdsidenames_t *sn, **sn_next; 116 int i; 117 118 if ((sd = metaget_setdesc(sp, ep)) == NULL) 119 return (-1); 120 121 metaflushsidenames(dnp); 122 sn_next = &dnp->side_names; 123 if (MD_MNSET_DESC(sd)) { 124 /* 125 * Only get sidenames for this node since 126 * that is the only side information stored in 127 * the local mddb for a multi-node diskset. 128 */ 129 if (sd->sd_mn_mynode) { 130 sn = Zalloc(sizeof (*sn)); 131 sn->sideno = sd->sd_mn_mynode->nd_nodeid; 132 if ((sn->cname = meta_getnmentbykey(MD_LOCAL_SET, 133 sn->sideno, dnp->side_names_key, &sn->dname, 134 &sn->mnum, NULL, ep)) == NULL) { 135 if (sn->dname != NULL) 136 Free(sn->dname); 137 Free(sn); 138 return (-1); 139 } 140 141 /* Add to the end of the linked list */ 142 assert(*sn_next == NULL); 143 *sn_next = sn; 144 sn_next = &sn->next; 145 } 146 } else { 147 for (i = 0; i < MD_MAXSIDES; i++) { 148 /* Skip empty slots */ 149 if (sd->sd_nodes[i][0] == '\0') 150 continue; 151 152 sn = Zalloc(sizeof (*sn)); 153 sn->sideno = i; 154 if ((sn->cname = meta_getnmentbykey(MD_LOCAL_SET, 155 i+SKEW, dnp->side_names_key, &sn->dname, 156 &sn->mnum, NULL, ep)) == NULL) { 157 /* 158 * It is possible that during the add of a 159 * host to have a 'missing' side as the side 160 * for this disk will be added later. So ignore 161 * the error. 
The 'missing' side will be added 162 * once the addhosts process has completed. 163 */ 164 if (mdissyserror(ep, ENOENT)) { 165 mdclrerror(ep); 166 Free(sn); 167 continue; 168 } 169 170 if (sn->dname != NULL) 171 Free(sn->dname); 172 Free(sn); 173 return (-1); 174 } 175 176 /* Add to the end of the linked list */ 177 assert(*sn_next == NULL); 178 *sn_next = sn; 179 sn_next = &sn->next; 180 } 181 } 182 183 return (0); 184 } 185 186 static md_drive_desc * 187 rl_to_dd( 188 mdsetname_t *sp, 189 md_replicalist_t *rlp, 190 md_error_t *ep 191 ) 192 { 193 md_replicalist_t *rl; 194 md_replica_t *r; 195 md_drive_desc *dd = NULL; 196 md_drive_desc *d; 197 int found; 198 md_set_desc *sd; 199 daddr_t nblks = 0; 200 201 if ((sd = metaget_setdesc(sp, ep)) == NULL) 202 return (NULL); 203 204 /* find the smallest existing replica */ 205 for (rl = rlp; rl != NULL; rl = rl->rl_next) { 206 r = rl->rl_repp; 207 nblks = ((nblks == 0) ? r->r_nblk : min(r->r_nblk, nblks)); 208 } 209 210 if (nblks <= 0) 211 nblks = (MD_MNSET_DESC(sd)) ? MD_MN_DBSIZE : MD_DBSIZE; 212 213 for (rl = rlp; rl != NULL; rl = rl->rl_next) { 214 r = rl->rl_repp; 215 216 found = 0; 217 for (d = dd; d != NULL; d = d->dd_next) { 218 if (strcmp(r->r_namep->drivenamep->cname, 219 d->dd_dnp->cname) == 0) { 220 found = 1; 221 dd->dd_dbcnt++; 222 break; 223 } 224 } 225 226 if (! 
found) 227 (void) metadrivedesc_append(&dd, r->r_namep->drivenamep, 228 1, nblks, sd->sd_ctime, sd->sd_genid, MD_DR_OK); 229 } 230 231 return (dd); 232 } 233 234 /* 235 * Exported Entry Points 236 */ 237 238 set_t 239 get_max_sets(md_error_t *ep) 240 { 241 242 static set_t max_sets = 0; 243 244 if (max_sets == 0) 245 if (metaioctl(MD_IOCGETNSET, &max_sets, ep, NULL) != 0) 246 return (0); 247 248 return (max_sets); 249 } 250 251 int 252 get_max_meds(md_error_t *ep) 253 { 254 static int max_meds = 0; 255 256 if (max_meds == 0) 257 if (metaioctl(MD_MED_GET_NMED, &max_meds, ep, NULL) != 0) 258 return (0); 259 260 return (max_meds); 261 } 262 263 side_t 264 getmyside(mdsetname_t *sp, md_error_t *ep) 265 { 266 md_set_desc *sd; 267 char *node = NULL; 268 side_t sideno; 269 270 if (sp->setno == 0) 271 return (0); 272 273 if ((sd = metaget_setdesc(sp, ep)) == NULL) 274 return (MD_SIDEWILD); 275 276 node = mynode(); 277 278 assert(node != NULL); 279 280 sideno = getnodeside(node, sd); 281 282 if (sideno != MD_SIDEWILD) 283 return (sideno); 284 285 return (mddserror(ep, MDE_DS_HOSTNOSIDE, sp->setno, node, NULL, node)); 286 } 287 288 /* 289 * get set info from name 290 */ 291 md_set_record * 292 getsetbyname(char *setname, md_error_t *ep) 293 { 294 md_set_record *sr = NULL; 295 md_mnset_record *mnsr = NULL; 296 char *p; 297 size_t len; 298 299 /* get set info from daemon */ 300 if (clnt_getset(mynode(), setname, MD_SET_BAD, &sr, ep) == -1) 301 return (NULL); 302 if (sr != NULL) { 303 /* 304 * Returned record could be for a multi-node set or a 305 * non-multi-node set. 306 */ 307 if (MD_MNSET_REC(sr)) { 308 /* 309 * Record is for a multi-node set. Reissue call 310 * to get mnset information. Need to free 311 * record as if a non-multi-node set record since 312 * that is what clnt_getset gave us. If in 313 * the daemon, don't free since this is a pointer 314 * into the setrecords array. 315 */ 316 if (! 
md_in_daemon) { 317 sr->sr_flags &= ~MD_SR_MN; 318 free_sr(sr); 319 } 320 if (clnt_mngetset(mynode(), setname, MD_SET_BAD, &mnsr, 321 ep) == -1) 322 return (NULL); 323 if (mnsr != NULL) 324 return ((struct md_set_record *)mnsr); 325 } else { 326 return (sr); 327 } 328 } 329 330 /* no such set */ 331 len = strlen(setname) + 30; 332 p = Malloc(len); 333 (void) snprintf(p, len, "setname \"%s\"", setname); 334 (void) mderror(ep, MDE_NO_SET, p); 335 Free(p); 336 return (NULL); 337 } 338 339 /* 340 * get set info from number 341 */ 342 md_set_record * 343 getsetbynum(set_t setno, md_error_t *ep) 344 { 345 md_set_record *sr; 346 md_mnset_record *mnsr = NULL; 347 char buf[100]; 348 349 if (clnt_getset(mynode(), NULL, setno, &sr, ep) == -1) 350 return (NULL); 351 352 if (sr != NULL) { 353 /* 354 * Record is for a multi-node set. Reissue call 355 * to get mnset information. Need to free 356 * record as if a non-multi-node set record since 357 * that is what clnt_getset gave us. If in 358 * the daemon, don't free since this is a pointer 359 * into the setrecords array. 360 */ 361 if (MD_MNSET_REC(sr)) { 362 /* 363 * Record is for a multi-node set. Reissue call 364 * to get mnset information. 365 */ 366 if (! 
md_in_daemon) { 367 sr->sr_flags &= ~MD_SR_MN; 368 free_sr(sr); 369 } 370 if (clnt_mngetset(mynode(), NULL, setno, &mnsr, 371 ep) == -1) 372 return (NULL); 373 if (mnsr != NULL) 374 return ((struct md_set_record *)mnsr); 375 } else { 376 return (sr); 377 } 378 } 379 380 (void) sprintf(buf, "setno %u", setno); 381 (void) mderror(ep, MDE_NO_SET, buf); 382 return (NULL); 383 } 384 385 int 386 meta_check_drive_inuse( 387 mdsetname_t *sp, 388 mddrivename_t *dnp, 389 int check_db, 390 md_error_t *ep 391 ) 392 { 393 mdnamelist_t *nlp = NULL; 394 mdnamelist_t *p; 395 int rval = 0; 396 397 /* get all underlying partitions */ 398 if (meta_getalldevs(sp, &nlp, check_db, ep) != 0) 399 return (-1); 400 401 /* search for drive */ 402 for (p = nlp; (p != NULL); p = p->next) { 403 mdname_t *np = p->namep; 404 405 if (strcmp(dnp->cname, np->drivenamep->cname) == 0) { 406 rval = (mddserror(ep, MDE_DS_DRIVEINUSE, sp->setno, 407 NULL, dnp->cname, sp->setname)); 408 break; 409 } 410 } 411 412 /* cleanup, return success */ 413 metafreenamelist(nlp); 414 return (rval); 415 } 416 417 /* 418 * simple check for ownership 419 */ 420 int 421 meta_check_ownership(mdsetname_t *sp, md_error_t *ep) 422 { 423 int ownset; 424 md_set_desc *sd; 425 md_drive_desc *dd; 426 md_replicalist_t *rlp = NULL; 427 md_error_t xep = mdnullerror; 428 429 if (metaislocalset(sp)) 430 return (0); 431 432 ownset = own_set(sp, NULL, TRUE, ep); 433 if (! mdisok(ep)) 434 return (-1); 435 436 if ((sd = metaget_setdesc(sp, ep)) == NULL) 437 return (-1); 438 439 dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST), ep); 440 if (! 
mdisok(ep)) 441 return (-1); 442 443 /* If we have no drive descriptors, check for no ownership */ 444 if (dd == NULL) { 445 if (ownset == MD_SETOWNER_NONE) 446 return (0); 447 448 /* If ownership somehow has come to exist, we must clean up */ 449 450 if (metareplicalist(sp, (MD_BASICNAME_OK | PRINT_FAST), &rlp, 451 &xep) < 0) 452 mdclrerror(&xep); 453 454 if ((dd = rl_to_dd(sp, rlp, &xep)) == NULL) 455 if (! mdisok(&xep)) 456 mdclrerror(&xep); 457 458 if (!(MD_MNSET_DESC(sd)) && !MD_ATSET_DESC(sd)) { 459 if (rel_own_bydd(sp, dd, TRUE, &xep)) 460 mdclrerror(&xep); 461 } 462 463 if (halt_set(sp, &xep)) 464 mdclrerror(&xep); 465 466 metafreereplicalist(rlp); 467 468 metafreedrivedesc(&dd); 469 470 return (0); 471 } 472 473 metafreedrivedesc(&sd->sd_drvs); 474 475 if (ownset == MD_SETOWNER_YES) 476 return (0); 477 478 return (mddserror(ep, MDE_DS_NOOWNER, sp->setno, NULL, NULL, 479 sp->setname)); 480 } 481 482 /* 483 * simple check for ownership 484 */ 485 int 486 meta_check_ownership_on_host(mdsetname_t *sp, char *hostname, md_error_t *ep) 487 { 488 md_set_desc *sd; 489 md_drive_desc *dd; 490 int bool; 491 492 if (metaislocalset(sp)) 493 return (0); 494 495 if ((sd = metaget_setdesc(sp, ep)) == NULL) 496 return (-1); 497 498 if (getnodeside(hostname, sd) == MD_SIDEWILD) 499 return (mddserror(ep, MDE_DS_NODENOTINSET, sp->setno, 500 hostname, NULL, sp->setname)); 501 502 dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST), ep); 503 if (! mdisok(ep)) 504 return (-1); 505 506 if (clnt_ownset(hostname, sp, &bool, ep) == -1) 507 return (-1); 508 509 if (dd == NULL) 510 return (0); 511 512 metafreedrivedesc(&sd->sd_drvs); 513 514 if (bool == TRUE) 515 return (0); 516 517 return (mddserror(ep, MDE_DS_NODEISNOTOWNER, sp->setno, hostname, NULL, 518 sp->setname)); 519 } 520 521 /* 522 * Function that determines if a node is in the multinode diskset 523 * membership list. Calling node passes in node to be checked and 524 * the nodelist as returned from meta_read_nodelist. 
This routine 525 * anticipates being called many times using the same diskset membership 526 * list which is why the alloc and free of the diskset membership list 527 * is left to the calling routine. 528 * Returns: 529 * 1 - if a member 530 * 0 - not a member 531 */ 532 int 533 meta_is_member( 534 char *node_name, 535 md_mn_nodeid_t node_id, 536 mndiskset_membershiplist_t *nl 537 ) 538 { 539 mndiskset_membershiplist_t *nl2; 540 int flag_check_name; 541 542 if (node_id != 0) 543 flag_check_name = 0; 544 else if (node_name != NULL) 545 flag_check_name = 1; 546 else 547 return (0); 548 549 nl2 = nl; 550 while (nl2) { 551 if (flag_check_name) { 552 /* Compare given name against name in member list */ 553 if (strcmp(nl2->msl_node_name, node_name) == 0) 554 break; 555 } else { 556 /* Compare given nodeid against nodeid in member list */ 557 if (nl2->msl_node_id == node_id) 558 break; 559 } 560 nl2 = nl2->next; 561 } 562 /* No match found in member list */ 563 if (nl2 == NULL) { 564 return (0); 565 } 566 /* Return 1 if node is in member list */ 567 return (1); 568 } 569 570 /* 571 * meta_getnext_devinfo should go to the host that 572 * has the device, to return the device name, driver name, minor num. 573 * We can take the big cheat for now, since it is a requirement 574 * that the device names and device numbers are the same, and 575 * just get the info locally. 576 * 577 * This routine is very similar to meta_getnextside_devinfo except 578 * that the specific side to be used is being passed in. 
579 * 580 * Exit status: 581 * 0 - No more side info to return 582 * 1 - More side info's to return 583 * -1 - An error has been detected 584 */ 585 /*ARGSUSED*/ 586 int 587 meta_getside_devinfo( 588 mdsetname_t *sp, /* for this set */ 589 char *bname, /* local block name (myside) */ 590 side_t sideno, /* sideno */ 591 char **ret_bname, /* block device name of returned side */ 592 char **ret_dname, /* driver name of returned side */ 593 minor_t *ret_mnum, /* minor number of returned side */ 594 md_error_t *ep 595 ) 596 { 597 mdname_t *np; 598 599 if (ret_bname != NULL) 600 *ret_bname = NULL; 601 if (ret_dname != NULL) 602 *ret_dname = NULL; 603 if (ret_mnum != NULL) 604 *ret_mnum = NODEV32; 605 606 607 if ((np = metaname(&sp, bname, LOGICAL_DEVICE, ep)) == NULL) 608 return (-1); 609 610 /* 611 * NOTE (future) - There will be more work here once devids are integrated 612 * into disksets. Then the side should be used to find the correct 613 * host and the b/d names should be gotten from that host. 614 */ 615 616 /* 617 * Return the side info. 618 */ 619 if (ret_bname != NULL) 620 *ret_bname = Strdup(np->bname); 621 622 if (ret_dname != NULL) { 623 mdcinfo_t *cinfo; 624 625 if ((cinfo = metagetcinfo(np, ep)) == NULL) 626 return (-1); 627 628 *ret_dname = Strdup(cinfo->dname); 629 } 630 631 if (ret_mnum != NULL) 632 *ret_mnum = meta_getminor(np->dev); 633 634 return (1); 635 } 636 637 /* 638 * Get the information on the device from the remote node using the devid 639 * of the disk. 
640 * 641 * Exit status: 642 * 0 - No more side info to return 643 * 1 - More side info's to return 644 * -1 - An error has been detected 645 */ 646 int 647 meta_getnextside_devinfo( 648 mdsetname_t *sp, /* for this set */ 649 char *bname, /* local block name (myside) */ 650 side_t *sideno, /* previous sideno & returned sideno */ 651 char **ret_bname, /* block device name of returned side */ 652 char **ret_dname, /* driver name of returned side */ 653 minor_t *ret_mnum, /* minor number of returned side */ 654 md_error_t *ep 655 ) 656 { 657 md_set_desc *sd; 658 int i; 659 mdname_t *np; 660 mddrivename_t *dnp; 661 char *devidstr = NULL; 662 int devidstrlen; 663 md_dev64_t retdev = NODEV64; 664 char *ret_devname = NULL; 665 char *ret_blkdevname = NULL; 666 char *ret_driver = NULL; 667 char *nodename; 668 int fd; 669 int ret = -1; 670 char *minor_name = NULL; 671 md_mnnode_desc *nd; 672 673 674 if (ret_bname != NULL) 675 *ret_bname = NULL; 676 if (ret_dname != NULL) 677 *ret_dname = NULL; 678 if (ret_mnum != NULL) 679 *ret_mnum = NODEV32; 680 681 if (metaislocalset(sp)) { 682 /* no more sides - we are done */ 683 if (*sideno != MD_SIDEWILD) 684 return (0); 685 686 /* First time through - set up return sideno */ 687 *sideno = 0; 688 } else { 689 690 /* 691 * Find the next sideno, starting after the one given. 692 */ 693 if ((sd = metaget_setdesc(sp, ep)) == NULL) 694 return (-1); 695 696 if (MD_MNSET_DESC(sd)) { 697 nd = sd->sd_nodelist; 698 if ((*sideno == MD_SIDEWILD) && 699 (nd != (struct md_mnnode_desc *)NULL)) { 700 *sideno = nd->nd_nodeid; 701 } else { 702 while (nd) { 703 /* 704 * Found given sideno, now find 705 * next sideno, if there is one. 
706 */ 707 if ((*sideno == nd->nd_nodeid) && 708 (nd->nd_next != 709 (struct md_mnnode_desc *)NULL)) { 710 *sideno = 711 nd->nd_next->nd_nodeid; 712 break; 713 } 714 nd = nd->nd_next; 715 } 716 if (nd == NULL) { 717 return (0); 718 } 719 } 720 if (*sideno == MD_SIDEWILD) 721 return (0); 722 } else { 723 for (i = (*sideno)+1; i < MD_MAXSIDES; i++) 724 /* Find next full slot */ 725 if (sd->sd_nodes[i][0] != '\0') 726 break; 727 728 /* No more sides - we are done */ 729 if (i == MD_MAXSIDES) 730 return (0); 731 732 /* Set up the return sideno */ 733 *sideno = i; 734 nodename = (char *)sd->sd_nodes[i]; 735 } 736 } 737 738 /* 739 * Need to pass the node the devid of the disk and get it to 740 * send back the details of the disk from that side. 741 */ 742 if ((np = metaname(&sp, bname, UNKNOWN, ep)) == NULL) 743 return (-1); 744 745 dnp = np->drivenamep; 746 747 /* 748 * By default, set up the parameters so that they are copied out. 749 */ 750 if (ret_bname != NULL) 751 *ret_bname = Strdup(np->bname); 752 753 if (ret_dname != NULL) { 754 mdcinfo_t *cinfo; 755 756 if ((cinfo = metagetcinfo(np, ep)) == NULL) 757 return (-1); 758 759 *ret_dname = Strdup(cinfo->dname); 760 } 761 762 if (ret_mnum != NULL) 763 *ret_mnum = meta_getminor(np->dev); 764 765 /* 766 * Try some optimization. If this is the local set or the device 767 * is a metadevice then just copy the information. If the device 768 * does not have a devid (due to not having a minor name) then 769 * fall back to the pre-devid behaviour of copying the information 770 * on the device: this is okay because the sanity checks before this 771 * call would have found any issues with the device. If it's a 772 * multi-node diskset also just return ie. copy. 773 */ 774 if (metaislocalset(sp) || metaismeta(np) || (dnp->devid == NULL) || 775 (MD_MNSET_DESC(sd))) 776 return (1); 777 778 if (np->minor_name == (char *)NULL) { 779 /* 780 * Have to get the minor name then. 
The slice should exist 781 * on the disk because it will have already been repartitioned 782 * up prior to getting to this point. 783 */ 784 if ((fd = open(np->bname, (O_RDONLY|O_NDELAY), 0)) < 0) { 785 (void) mdsyserror(ep, errno, np->bname); 786 return (-1); 787 } 788 (void) devid_get_minor_name(fd, &minor_name); 789 np->minor_name = Strdup(minor_name); 790 devid_str_free(minor_name); 791 (void) close(fd); 792 } 793 794 /* allocate extra space for "/" and NULL hence +2 */ 795 devidstrlen = strlen(dnp->devid) + strlen(np->minor_name) + 2; 796 devidstr = (char *)Malloc(devidstrlen); 797 798 /* 799 * As a minor name is supplied then the ret_devname will be 800 * appropriate to that minor_name and in this case it will be 801 * a block device ie /dev/dsk. 802 */ 803 (void) snprintf(devidstr, devidstrlen, 804 "%s/%s", dnp->devid, np->minor_name); 805 806 ret = clnt_devinfo_by_devid(nodename, sp, devidstr, &retdev, 807 np->bname, &ret_devname, &ret_driver, ep); 808 809 Free(devidstr); 810 811 /* 812 * If the other side is not running device id in disksets, 813 * 'ret' is set to ENOTSUP in which case we fallback to 814 * the existing behaviour 815 */ 816 if (ret == ENOTSUP) 817 return (1); 818 else if (ret == -1) 819 return (-1); 820 821 /* 822 * ret_devname comes from the rpc call and is a 823 * raw device name. We need to make this into a 824 * block device via blkname for further processing. 825 * Unfortunately, when our device id isn't found in 826 * the system, the rpc call will return a " " in 827 * ret_devname in which case we need to fill that in 828 * as ret_blkname because blkname of " " returns NULL. 
829 */ 830 if (ret_bname != NULL && ret_devname != NULL) { 831 ret_blkdevname = blkname(ret_devname); 832 if (ret_blkdevname == NULL) 833 *ret_bname = Strdup(ret_devname); 834 else 835 *ret_bname = Strdup(ret_blkdevname); 836 } 837 838 if (ret_dname != NULL && ret_driver != NULL) 839 *ret_dname = Strdup(ret_driver); 840 841 if (ret_mnum != NULL) 842 *ret_mnum = meta_getminor(retdev); 843 844 return (1); 845 } 846 847 int 848 meta_is_drive_in_anyset( 849 mddrivename_t *dnp, 850 mdsetname_t **spp, 851 int bypass_daemon, 852 md_error_t *ep 853 ) 854 { 855 set_t setno; 856 mdsetname_t *this_sp; 857 int is_it; 858 set_t max_sets; 859 860 if ((max_sets = get_max_sets(ep)) == 0) 861 return (-1); 862 863 assert(spp != NULL); 864 *spp = NULL; 865 866 for (setno = 1; setno < max_sets; setno++) { 867 if (!bypass_daemon) { 868 if ((this_sp = metasetnosetname(setno, ep)) == NULL) { 869 if (mdismddberror(ep, MDE_DB_NODB)) { 870 mdclrerror(ep); 871 return (0); 872 } 873 if (mdiserror(ep, MDE_NO_SET)) { 874 mdclrerror(ep); 875 continue; 876 } 877 return (-1); 878 } 879 } else 880 this_sp = metafakesetname(setno, NULL); 881 882 if ((is_it = meta_is_drive_in_thisset(this_sp, dnp, 883 bypass_daemon, ep)) == -1) { 884 if (mdiserror(ep, MDE_NO_SET)) { 885 mdclrerror(ep); 886 continue; 887 } 888 return (-1); 889 } 890 if (is_it) { 891 *spp = this_sp; 892 return (0); 893 } 894 } 895 return (0); 896 } 897 898 int 899 meta_is_drive_in_thisset( 900 mdsetname_t *sp, 901 mddrivename_t *dnp, 902 int bypass_daemon, 903 md_error_t *ep 904 ) 905 { 906 md_drive_desc *dd, *p; 907 908 if (bypass_daemon) 909 dd = dr2drivedesc(sp, MD_SIDEWILD, 910 (MD_BASICNAME_OK | MD_BYPASS_DAEMON), ep); 911 else 912 dd = metaget_drivedesc(sp, MD_BASICNAME_OK, ep); 913 914 if (dd == NULL) { 915 if (! 
mdisok(ep)) 916 return (-1); 917 return (0); 918 } 919 920 921 for (p = dd; p != NULL; p = p->dd_next) 922 if (strcmp(p->dd_dnp->cname, dnp->cname) == 0) 923 return (1); 924 return (0); 925 } 926 927 /* 928 * Check to see if devid is in use in any diskset. 929 * This is used in the case when a partial diskset is being imported 930 * to make sure that the unvailable drive isn't already in use in an 931 * already imported partial diskset. Can't check on the cname since the 932 * unavailable disk's cname is from the previous system and may collide 933 * with a cname on this system. 934 * Return values: 935 * 1: devid has been found in a diskset 936 * 0: devid not found in any diskset 937 */ 938 int 939 meta_is_devid_in_anyset( 940 void *devid, 941 mdsetname_t **spp, 942 md_error_t *ep 943 ) 944 { 945 set_t setno; 946 mdsetname_t *this_sp; 947 int is_it; 948 set_t max_sets; 949 950 if ((max_sets = get_max_sets(ep)) == 0) 951 return (-1); 952 953 assert(spp != NULL); 954 *spp = NULL; 955 956 for (setno = 1; setno < max_sets; setno++) { 957 if ((this_sp = metasetnosetname(setno, ep)) == NULL) { 958 if (mdismddberror(ep, MDE_DB_NODB)) { 959 mdclrerror(ep); 960 return (0); 961 } 962 if (mdiserror(ep, MDE_NO_SET)) { 963 mdclrerror(ep); 964 continue; 965 } 966 return (-1); 967 } 968 969 if ((is_it = meta_is_devid_in_thisset(this_sp, 970 devid, ep)) == -1) { 971 if (mdiserror(ep, MDE_NO_SET)) { 972 mdclrerror(ep); 973 continue; 974 } 975 return (-1); 976 } 977 if (is_it) { 978 *spp = this_sp; 979 return (0); 980 } 981 } 982 return (0); 983 } 984 985 int 986 meta_is_devid_in_thisset( 987 mdsetname_t *sp, 988 void *devid, 989 md_error_t *ep 990 ) 991 { 992 md_drive_desc *dd, *p; 993 ddi_devid_t dd_devid; 994 995 dd = metaget_drivedesc(sp, MD_BASICNAME_OK, ep); 996 if (dd == NULL) { 997 if (! 
mdisok(ep)) 998 return (-1); 999 return (0); 1000 } 1001 1002 for (p = dd; p != NULL; p = p->dd_next) { 1003 if (p->dd_dnp->devid == NULL) 1004 continue; 1005 (void) devid_str_decode(p->dd_dnp->devid, 1006 &dd_devid, NULL); 1007 if (dd_devid == NULL) 1008 continue; 1009 if (devid_compare(devid, dd_devid) == 0) { 1010 devid_free(dd_devid); 1011 return (1); 1012 } 1013 devid_free(dd_devid); 1014 } 1015 return (0); 1016 } 1017 1018 int 1019 meta_set_balance( 1020 mdsetname_t *sp, 1021 md_error_t *ep 1022 ) 1023 { 1024 md_set_desc *sd; 1025 md_drive_desc *dd, *curdd; 1026 daddr_t dbsize; 1027 daddr_t nblks; 1028 int i; 1029 int rval = 0; 1030 sigset_t oldsigs; 1031 md_setkey_t *cl_sk; 1032 md_error_t xep = mdnullerror; 1033 md_mnnode_desc *nd; 1034 int suspend1_flag = 0; 1035 1036 if ((sd = metaget_setdesc(sp, ep)) == NULL) 1037 return (-1); 1038 1039 dbsize = (MD_MNSET_DESC(sd)) ? MD_MN_DBSIZE : MD_DBSIZE; 1040 1041 /* Make sure we own the set */ 1042 if (meta_check_ownership(sp, ep) != 0) 1043 return (-1); 1044 1045 /* END CHECK CODE */ 1046 1047 /* 1048 * Get drive descriptors for the drives that are currently in the set. 1049 */ 1050 curdd = metaget_drivedesc(sp, MD_FULLNAME_ONLY, ep); 1051 1052 if (! mdisok(ep)) 1053 return (-1); 1054 1055 /* Find the minimum replica size in use is or use the default */ 1056 if ((nblks = meta_db_minreplica(sp, ep)) < 0) 1057 mdclrerror(ep); 1058 else 1059 dbsize = nblks; /* adjust replica size */ 1060 1061 /* Make sure we are blocking all signals */ 1062 if (procsigs(TRUE, &oldsigs, &xep) < 0) 1063 mdclrerror(&xep); 1064 1065 /* 1066 * Lock the set on current set members. 1067 * For MN diskset lock_set and SUSPEND are used to protect against 1068 * other meta* commands running on the other nodes. 
1069 */ 1070 if (MD_MNSET_DESC(sd)) { 1071 nd = sd->sd_nodelist; 1072 while (nd) { 1073 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 1074 nd = nd->nd_next; 1075 continue; 1076 } 1077 if (clnt_lock_set(nd->nd_nodename, sp, ep)) { 1078 rval = -1; 1079 goto out; 1080 } 1081 nd = nd->nd_next; 1082 } 1083 /* 1084 * Lock out other meta* commands by suspending 1085 * class 1 messages across the diskset. 1086 */ 1087 nd = sd->sd_nodelist; 1088 while (nd) { 1089 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 1090 nd = nd->nd_next; 1091 continue; 1092 } 1093 if (clnt_mdcommdctl(nd->nd_nodename, 1094 COMMDCTL_SUSPEND, sp, MD_MSG_CLASS1, 1095 MD_MSCF_NO_FLAGS, ep)) { 1096 rval = -1; 1097 goto out; 1098 } 1099 suspend1_flag = 1; 1100 nd = nd->nd_next; 1101 } 1102 } else { 1103 for (i = 0; i < MD_MAXSIDES; i++) { 1104 /* Skip empty slots */ 1105 if (sd->sd_nodes[i][0] == '\0') continue; 1106 1107 if (clnt_lock_set(sd->sd_nodes[i], sp, ep)) { 1108 rval = -1; 1109 goto out; 1110 } 1111 } 1112 } 1113 1114 /* We are not adding or deleting any drives, just balancing */ 1115 dd = NULL; 1116 1117 /* 1118 * Balance the DB's according to the list of existing drives and the 1119 * list of added drives. 1120 */ 1121 if ((rval = meta_db_balance(sp, dd, curdd, dbsize, ep)) == -1) 1122 goto out; 1123 1124 out: 1125 /* 1126 * Unlock diskset by resuming class 1 messages across the diskset. 1127 * Just resume all classes so that resume is the same whether 1128 * just one class was locked or all classes were locked. 1129 */ 1130 if (suspend1_flag) { 1131 nd = sd->sd_nodelist; 1132 while (nd) { 1133 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 1134 nd = nd->nd_next; 1135 continue; 1136 } 1137 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME, 1138 sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) { 1139 /* 1140 * We are here because we failed to resume 1141 * rpc.mdcommd. However we potentially have 1142 * an error from the previous call 1143 * (meta_db_balance). 
If the previous call 1144 * did fail, we capture that error and 1145 * generate a perror withthe string, 1146 * "Unable to resume...". 1147 * Setting rval to -1 ensures that in the 1148 * next iteration of the loop, ep is not 1149 * clobbered. 1150 */ 1151 if (rval == 0) 1152 (void) mdstealerror(ep, &xep); 1153 else 1154 mdclrerror(&xep); 1155 rval = -1; 1156 mde_perror(ep, dgettext(TEXT_DOMAIN, 1157 "Unable to resume rpc.mdcommd.")); 1158 } 1159 nd = nd->nd_next; 1160 } 1161 } 1162 1163 /* Unlock the set */ 1164 cl_sk = cl_get_setkey(sp->setno, sp->setname); 1165 if (MD_MNSET_DESC(sd)) { 1166 nd = sd->sd_nodelist; 1167 while (nd) { 1168 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 1169 nd = nd->nd_next; 1170 continue; 1171 } 1172 if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) { 1173 if (rval == 0) 1174 (void) mdstealerror(ep, &xep); 1175 else 1176 mdclrerror(&xep); 1177 rval = -1; 1178 } 1179 nd = nd->nd_next; 1180 } 1181 } else { 1182 for (i = 0; i < MD_MAXSIDES; i++) { 1183 /* Skip empty slots */ 1184 if (sd->sd_nodes[i][0] == '\0') 1185 continue; 1186 1187 if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, &xep)) { 1188 if (rval == 0) 1189 (void) mdstealerror(ep, &xep); 1190 rval = -1; 1191 } 1192 } 1193 } 1194 1195 /* release signals back to what they were on entry */ 1196 if (procsigs(FALSE, &oldsigs, &xep) < 0) 1197 mdclrerror(&xep); 1198 1199 cl_set_setkey(NULL); 1200 1201 metaflushsetname(sp); 1202 1203 return (rval); 1204 } 1205 1206 int 1207 meta_set_destroy( 1208 mdsetname_t *sp, 1209 int lock_set, 1210 md_error_t *ep 1211 ) 1212 { 1213 int i; 1214 med_rec_t medr; 1215 md_set_desc *sd; 1216 md_drive_desc *dd, *p, *p1; 1217 mddrivename_t *dnp; 1218 mdname_t *np; 1219 mdnamelist_t *nlp = NULL; 1220 int num_users = 0; 1221 int has_set; 1222 side_t mysideno; 1223 sigset_t oldsigs; 1224 md_error_t xep = mdnullerror; 1225 md_setkey_t *cl_sk; 1226 int rval = 0; 1227 int delete_end = 1; 1228 1229 /* Make sure we are blocking all signals */ 1230 if (procsigs(TRUE, 
&oldsigs, ep) < 0) 1231 return (-1); 1232 1233 if ((sd = metaget_setdesc(sp, ep)) == NULL) { 1234 if (! mdisok(ep)) 1235 rval = -1; 1236 goto out; 1237 } 1238 1239 /* 1240 * meta_set_destroy should not be called for a MN diskset. 1241 * This routine destroys a set without communicating this information 1242 * to the other nodes which would lead to an inconsistency in 1243 * the MN diskset. 1244 */ 1245 if (MD_MNSET_DESC(sd)) { 1246 rval = -1; 1247 goto out; 1248 } 1249 1250 /* Continue if a traditional diskset */ 1251 1252 /* 1253 * Check to see who has the set. If we are not the last user of the 1254 * set, we will not touch the replicas. 1255 */ 1256 for (i = 0; i < MD_MAXSIDES; i++) { 1257 /* Skip empty slots */ 1258 if (sd->sd_nodes[i][0] == '\0') 1259 continue; 1260 1261 has_set = nodehasset(sp, sd->sd_nodes[i], NHS_NST_EQ, 1262 ep); 1263 1264 if (has_set < 0) { 1265 mdclrerror(ep); 1266 } else 1267 num_users++; 1268 } 1269 1270 if ((dd = metaget_drivedesc(sp, MD_BASICNAME_OK, ep)) == NULL) { 1271 if (! mdisok(ep)) { 1272 rval = -1; 1273 goto out; 1274 } 1275 } 1276 1277 if (setup_db_bydd(sp, dd, TRUE, ep) == -1) { 1278 rval = -1; 1279 goto out; 1280 } 1281 1282 if (lock_set == TRUE) { 1283 /* Lock the set on our side */ 1284 if (clnt_lock_set(mynode(), sp, ep)) { 1285 rval = -1; 1286 goto out; 1287 } 1288 } 1289 1290 /* 1291 * A traditional diskset has no diskset stale information to send 1292 * since there can only be one owner node at a time. 1293 */ 1294 if (snarf_set(sp, FALSE, ep)) 1295 mdclrerror(ep); 1296 1297 if (dd != NULL) { 1298 /* 1299 * Make sure that no drives are in use as parts of metadrives 1300 * or hot spare pools, this is one of the few error conditions 1301 * that will stop this routine, unless the environment has 1302 * META_DESTROY_SET_OK set, in which case, the operation will 1303 * proceed. 
1304 */ 1305 if (getenv("META_DESTROY_SET_OK") == NULL) { 1306 for (p = dd; p != NULL; p = p->dd_next) { 1307 dnp = p->dd_dnp; 1308 1309 i = meta_check_drive_inuse(sp, dnp, FALSE, ep); 1310 if (i == -1) { 1311 /* need xep - wire calls clear error */ 1312 i = metaget_setownership(sp, &xep); 1313 if (i == -1) { 1314 rval = -1; 1315 goto out; 1316 } 1317 1318 mysideno = getmyside(sp, &xep); 1319 1320 if (mysideno == MD_SIDEWILD) { 1321 rval = -1; 1322 goto out; 1323 } 1324 1325 if (sd->sd_isown[mysideno] == FALSE) 1326 if (halt_set(sp, &xep)) { 1327 rval = -1; 1328 goto out; 1329 } 1330 1331 rval = -1; 1332 goto out; 1333 } 1334 } 1335 } 1336 1337 for (i = 0; i < MD_MAXSIDES; i++) { 1338 /* Skip empty slots */ 1339 if (sd->sd_nodes[i][0] == '\0') 1340 continue; 1341 1342 /* Skip non local nodes */ 1343 if (strcmp(mynode(), sd->sd_nodes[i]) != 0) 1344 continue; 1345 1346 if (clnt_deldrvs(sd->sd_nodes[i], sp, dd, ep)) 1347 mdclrerror(ep); 1348 } 1349 1350 /* 1351 * Go thru each drive and individually delete the replicas. 1352 * This way we can ignore individual errors. 1353 */ 1354 for (p = dd; p != NULL; p = p->dd_next) { 1355 uint_t rep_slice; 1356 1357 dnp = p->dd_dnp; 1358 if ((meta_replicaslice(dnp, &rep_slice, ep) != 0) || 1359 (((np = metaslicename(dnp, rep_slice, ep)) 1360 == NULL) && 1361 ((np = metaslicename(dnp, MD_SLICE0, ep)) 1362 == NULL))) { 1363 rval = -1; 1364 goto out; 1365 } 1366 1367 if ((np = metaslicename(dnp, 1368 rep_slice, ep)) == NULL) { 1369 if ((np = metaslicename(dnp, 1370 MD_SLICE0, ep)) == NULL) { 1371 rval = -1; 1372 goto out; 1373 } 1374 mdclrerror(ep); 1375 } 1376 1377 /* Yes this is UGLY!!! 
*/ 1378 p1 = p->dd_next; 1379 p->dd_next = NULL; 1380 if (rel_own_bydd(sp, p, FALSE, ep)) 1381 mdclrerror(ep); 1382 p->dd_next = p1; 1383 1384 if (p->dd_dbcnt == 0) 1385 continue; 1386 1387 /* 1388 * Skip the replica removal if we are not the last user 1389 */ 1390 if (num_users != 1) 1391 continue; 1392 1393 nlp = NULL; 1394 (void) metanamelist_append(&nlp, np); 1395 if (meta_db_detach(sp, nlp, 1396 (MDFORCE_DS | MDFORCE_SET_LOCKED), NULL, ep)) 1397 mdclrerror(ep); 1398 metafreenamelist(nlp); 1399 } 1400 } 1401 1402 if (halt_set(sp, ep)) { 1403 rval = -1; 1404 goto out; 1405 } 1406 1407 /* Setup the mediator record */ 1408 (void) memset(&medr, '\0', sizeof (med_rec_t)); 1409 medr.med_rec_mag = MED_REC_MAGIC; 1410 medr.med_rec_rev = MED_REC_REV; 1411 medr.med_rec_fl = 0; 1412 medr.med_rec_sn = sp->setno; 1413 (void) strcpy(medr.med_rec_snm, sp->setname); 1414 medr.med_rec_meds = sd->sd_med; /* structure assigment */ 1415 (void) memset(&medr.med_rec_data, '\0', sizeof (med_data_t)); 1416 medr.med_rec_foff = 0; 1417 1418 /* 1419 * If we are the last remaining user, then remove the mediator hosts 1420 */ 1421 if (num_users == 1) { 1422 for (i = 0; i < MED_MAX_HOSTS; i++) { 1423 if (medr.med_rec_meds.n_lst[i].a_cnt != 0) 1424 SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_REMOVE, 1425 SVM_TAG_MEDIATOR, sp->setno, i); 1426 (void) memset(&medr.med_rec_meds.n_lst[i], '\0', 1427 sizeof (md_h_t)); 1428 } 1429 medr.med_rec_meds.n_cnt = 0; 1430 } else { /* Remove this host from the mediator node list. 
*/ 1431 for (i = 0; i < MD_MAXSIDES; i++) { 1432 /* Skip empty slots */ 1433 if (sd->sd_nodes[i][0] == '\0') 1434 continue; 1435 1436 /* Copy non local node */ 1437 if (strcmp(mynode(), sd->sd_nodes[i]) != 0) { 1438 (void) strcpy(medr.med_rec_nodes[i], 1439 sd->sd_nodes[i]); 1440 continue; 1441 } 1442 1443 /* Clear local node */ 1444 (void) memset(&medr.med_rec_nodes[i], '\0', 1445 sizeof (md_node_nm_t)); 1446 } 1447 } 1448 1449 crcgen(&medr, &medr.med_rec_cks, sizeof (med_rec_t), NULL); 1450 1451 /* 1452 * If the client is part of a cluster put the DCS service 1453 * into a deleteing state. 1454 */ 1455 if (sdssc_delete_begin(sp->setname) == SDSSC_ERROR) { 1456 if (metad_isautotakebyname(sp->setname)) { 1457 delete_end = 0; 1458 } else { 1459 mdclrerror(ep); 1460 goto out; 1461 } 1462 } 1463 1464 /* Inform the mediator hosts of the new information */ 1465 for (i = 0; i < MED_MAX_HOSTS; i++) { 1466 if (sd->sd_med.n_lst[i].a_cnt == 0) 1467 continue; 1468 1469 if (clnt_med_upd_rec(&sd->sd_med.n_lst[i], sp, &medr, ep)) 1470 mdclrerror(ep); 1471 } 1472 1473 /* Delete the set locally */ 1474 for (i = 0; i < MD_MAXSIDES; i++) { 1475 /* Skip empty slots */ 1476 if (sd->sd_nodes[i][0] == '\0') 1477 continue; 1478 1479 /* Skip non local nodes */ 1480 if (strcmp(mynode(), sd->sd_nodes[i]) != 0) 1481 continue; 1482 1483 if (clnt_delset(sd->sd_nodes[i], sp, ep) == -1) 1484 mdclrerror(ep); 1485 } 1486 if (delete_end && 1487 sdssc_delete_end(sp->setname, SDSSC_COMMIT) == SDSSC_ERROR) 1488 rval = -1; 1489 1490 out: 1491 /* release signals back to what they were on entry */ 1492 if (procsigs(FALSE, &oldsigs, &xep) < 0) { 1493 if (rval == 0) 1494 (void) mdstealerror(ep, &xep); 1495 rval = -1; 1496 } 1497 1498 if (lock_set == TRUE) { 1499 cl_sk = cl_get_setkey(sp->setno, sp->setname); 1500 if (clnt_unlock_set(mynode(), cl_sk, &xep)) { 1501 if (rval == 0) 1502 (void) mdstealerror(ep, &xep); 1503 rval = -1; 1504 } 1505 cl_set_setkey(NULL); 1506 } 1507 1508 metaflushsetname(sp); 1509 
return (rval); 1510 } 1511 1512 int 1513 meta_set_purge( 1514 mdsetname_t *sp, 1515 int bypass_cluster, 1516 int forceflg, 1517 md_error_t *ep 1518 ) 1519 { 1520 char *thishost = mynode(); 1521 md_set_desc *sd; 1522 md_setkey_t *cl_sk; 1523 md_error_t xep = mdnullerror; 1524 int rval = 0; 1525 int i, num_hosts = 0; 1526 int has_set = 0; 1527 int max_node = 0; 1528 int delete_end = 1; 1529 md_mnnode_desc *nd; 1530 1531 if ((sd = metaget_setdesc(sp, ep)) == NULL) { 1532 /* unable to find set description */ 1533 rval = 1; 1534 return (rval); 1535 } 1536 1537 if (MD_MNSET_DESC(sd)) { 1538 /* 1539 * Get a count of the hosts in the set and also lock the set 1540 * on those hosts that know about it. 1541 */ 1542 nd = sd->sd_nodelist; 1543 while (nd) { 1544 /* 1545 * Only deal with those nodes that are members of 1546 * the set (MD_MN_NODE_ALIVE) or the node on which 1547 * the purge is being run. We must lock the set 1548 * on the purging node because the delset call 1549 * requires the lock to be set. 1550 */ 1551 if (!(nd->nd_flags & MD_MN_NODE_ALIVE) && 1552 nd->nd_nodeid != sd->sd_mn_mynode->nd_nodeid) { 1553 nd = nd->nd_next; 1554 continue; 1555 } 1556 has_set = nodehasset(sp, nd->nd_nodename, 1557 NHS_NST_EQ, ep); 1558 1559 /* 1560 * The host is not aware of this set (has_set < 0) or 1561 * the set does not match (has_set == 0). This check 1562 * prevents the code getting confused by an apparent 1563 * inconsistancy in the set's state, this is in the 1564 * purge code so something is broken in any case and 1565 * this is just trying to fix the brokeness. 1566 */ 1567 if (has_set <= 0) { 1568 mdclrerror(ep); 1569 nd->nd_flags |= MD_MN_NODE_NOSET; 1570 } else { 1571 num_hosts++; 1572 if (clnt_lock_set(nd->nd_nodename, sp, ep)) { 1573 /* 1574 * If the force flag is set then 1575 * ignore any RPC failures because we 1576 * are only really interested with 1577 * the set on local node. 
1578 */ 1579 if (forceflg && mdanyrpcerror(ep)) { 1580 mdclrerror(ep); 1581 } else { 1582 /* 1583 * set max_node so that in the 1584 * unlock code nodes in the 1585 * set that have not been 1586 * locked are not unlocked. 1587 */ 1588 max_node = nd->nd_nodeid; 1589 rval = 2; 1590 goto out1; 1591 } 1592 } 1593 1594 } 1595 nd = nd->nd_next; 1596 } 1597 max_node = 0; 1598 } else { 1599 /* 1600 * Get a count of the hosts in the set and also lock the set 1601 * on those hosts that know about it. 1602 */ 1603 for (i = 0; i < MD_MAXSIDES; i++) { 1604 /* Skip empty slots */ 1605 if (sd->sd_nodes[i][0] == '\0') 1606 continue; 1607 1608 has_set = nodehasset(sp, sd->sd_nodes[i], 1609 NHS_NST_EQ, ep); 1610 1611 /* 1612 * The host is not aware of this set (has_set < 0) or 1613 * the set does not match (has_set == 0). This check 1614 * prevents the code getting confused by an apparent 1615 * inconsistancy in the set's state, this is in the 1616 * purge code so something is broken in any case and 1617 * this is just trying to fix the brokeness. 1618 */ 1619 if (has_set <= 0) { 1620 mdclrerror(ep); 1621 /* 1622 * set the node to NULL to prevent further 1623 * requests to this unresponsive node. 1624 */ 1625 sd->sd_nodes[i][0] = '\0'; 1626 } else { 1627 num_hosts++; 1628 if (clnt_lock_set(sd->sd_nodes[i], sp, ep)) { 1629 /* 1630 * If the force flag is set then 1631 * ignore any RPC failures because we 1632 * are only really interested with 1633 * the set on local node. 1634 */ 1635 if (forceflg && mdanyrpcerror(ep)) { 1636 mdclrerror(ep); 1637 } else { 1638 rval = 2; 1639 /* 1640 * set max_node so that in the 1641 * unlock code nodes in the 1642 * set that have not been 1643 * locked are not unlocked. 1644 */ 1645 max_node = i; 1646 goto out1; 1647 } 1648 } 1649 } 1650 } 1651 max_node = i; /* now MD_MAXSIDES */ 1652 } 1653 if (!bypass_cluster) { 1654 /* 1655 * If there is only one host associated with the 1656 * set then remove the set from the cluster. 
1657 */ 1658 if (num_hosts == 1) { 1659 if (sdssc_delete_begin(sp->setname) == SDSSC_ERROR) { 1660 if (metad_isautotakebyname(sp->setname)) { 1661 delete_end = 0; 1662 } else { 1663 mdclrerror(ep); 1664 rval = 3; 1665 goto out1; 1666 } 1667 } 1668 } 1669 } 1670 1671 if (MD_MNSET_DESC(sd)) { 1672 nd = sd->sd_nodelist; 1673 while (nd) { 1674 if (nd->nd_nodeid == sd->sd_mn_mynode->nd_nodeid) { 1675 /* 1676 * This is the node on which the purge is 1677 * being run. We do not care if it is 1678 * alive or not, just want to get rid of 1679 * the set. 1680 */ 1681 if (clnt_delset(nd->nd_nodename, sp, 1682 ep) == -1) { 1683 md_perror(dgettext(TEXT_DOMAIN, 1684 "delset")); 1685 if (!bypass_cluster && num_hosts == 1) 1686 (void) sdssc_delete_end( 1687 sp->setname, SDSSC_CLEANUP); 1688 mdclrerror(ep); 1689 goto out1; 1690 } 1691 nd = nd->nd_next; 1692 continue; 1693 } 1694 1695 /* 1696 * Only contact those nodes that are members of 1697 * the set. 1698 */ 1699 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 1700 nd = nd->nd_next; 1701 continue; 1702 } 1703 1704 /* 1705 * Tell the remote node to remove this node 1706 */ 1707 if (clnt_delhosts(nd->nd_nodename, sp, 1, &thishost, 1708 ep) == -1) { 1709 /* 1710 * If we fail to delete ourselves 1711 * from the remote host it does not 1712 * really matter because the set is 1713 * being "purged" from this node. The 1714 * set can be purged from the other 1715 * node at a later time. 1716 */ 1717 mdclrerror(ep); 1718 } 1719 nd = nd->nd_next; 1720 } 1721 } else { 1722 for (i = 0; i < MD_MAXSIDES; i++) { 1723 /* Skip empty slots */ 1724 if (sd->sd_nodes[i][0] == '\0') 1725 continue; 1726 if (strcmp(thishost, sd->sd_nodes[i]) != 0) { 1727 /* 1728 * Tell the remote node to remove this node 1729 */ 1730 if (clnt_delhosts(sd->sd_nodes[i], sp, 1, 1731 &thishost, ep) == -1) { 1732 /* 1733 * If we fail to delete ourselves 1734 * from the remote host it does not 1735 * really matter because the set is 1736 * being "purged" from this node. 
The 1737 * set can be purged from the other 1738 * node at a later time. 1739 */ 1740 mdclrerror(ep); 1741 } 1742 continue; 1743 } 1744 1745 /* remove the set from this host */ 1746 if (clnt_delset(sd->sd_nodes[i], sp, ep) == -1) { 1747 md_perror(dgettext(TEXT_DOMAIN, "delset")); 1748 if (!bypass_cluster && num_hosts == 1) 1749 (void) sdssc_delete_end(sp->setname, 1750 SDSSC_CLEANUP); 1751 mdclrerror(ep); 1752 goto out1; 1753 } 1754 } 1755 } 1756 1757 if (!bypass_cluster && num_hosts == 1) { 1758 if (delete_end && sdssc_delete_end(sp->setname, SDSSC_COMMIT) == 1759 SDSSC_ERROR) { 1760 rval = 4; 1761 } 1762 } 1763 1764 out1: 1765 1766 cl_sk = cl_get_setkey(sp->setno, sp->setname); 1767 1768 /* 1769 * Remove the set lock on those nodes that had the set locked 1770 * max_node will either be MD_MAXSIDES or array index of the last 1771 * node contacted (or rather failed to contact) for traditional 1772 * diskset. For a MN diskset, max_node is the node_id of the node 1773 * that failed the lock. 
1774 */ 1775 if (MD_MNSET_DESC(sd)) { 1776 nd = sd->sd_nodelist; 1777 while (nd) { 1778 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 1779 nd = nd->nd_next; 1780 continue; 1781 } 1782 if (nd->nd_nodeid == max_node) 1783 break; 1784 if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) { 1785 if (forceflg && mdanyrpcerror(&xep)) { 1786 mdclrerror(&xep); 1787 nd = nd->nd_next; 1788 continue; 1789 } 1790 if (rval == 0) 1791 (void) mdstealerror(ep, &xep); 1792 rval = 5; 1793 } 1794 nd = nd->nd_next; 1795 } 1796 } else { 1797 for (i = 0; i < max_node; i++) { 1798 /* Skip empty slots */ 1799 if (sd->sd_nodes[i][0] == '\0') 1800 continue; 1801 1802 if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, &xep)) { 1803 if (forceflg && mdanyrpcerror(&xep)) { 1804 mdclrerror(&xep); 1805 continue; 1806 } 1807 if (rval == 0) 1808 (void) mdstealerror(ep, &xep); 1809 rval = 5; 1810 } 1811 } 1812 } 1813 1814 cl_set_setkey(NULL); 1815 1816 return (rval); 1817 } 1818 1819 int 1820 meta_set_query( 1821 mdsetname_t *sp, 1822 mddb_dtag_lst_t **dtlpp, 1823 md_error_t *ep 1824 ) 1825 { 1826 mddb_dtag_get_parm_t dtgp; 1827 1828 (void) memset(&dtgp, '\0', sizeof (mddb_dtag_get_parm_t)); 1829 dtgp.dtgp_setno = sp->setno; 1830 1831 /*CONSTCOND*/ 1832 while (1) { 1833 if (metaioctl(MD_MED_GET_TAG, &dtgp, &dtgp.dtgp_mde, NULL) != 0) 1834 if (! 
mdismddberror(&dtgp.dtgp_mde, MDE_DB_NOTAG) || 1835 *dtlpp == NULL) 1836 return (mdstealerror(ep, &dtgp.dtgp_mde)); 1837 else 1838 break; 1839 1840 /* 1841 * Run to the end of the list 1842 */ 1843 for (/* void */; (*dtlpp != NULL); dtlpp = &(*dtlpp)->dtl_nx) 1844 /* void */; 1845 1846 *dtlpp = Zalloc(sizeof (mddb_dtag_lst_t)); 1847 1848 (void) memmove(&(*dtlpp)->dtl_dt, &dtgp.dtgp_dt, 1849 sizeof (mddb_dtag_t)); 1850 1851 dtgp.dtgp_dt.dt_id++; 1852 } 1853 return (0); 1854 } 1855 1856 /* 1857 * return drivename get by key 1858 */ 1859 mddrivename_t * 1860 metadrivename_withdrkey( 1861 mdsetname_t *sp, 1862 side_t sideno, 1863 mdkey_t key, 1864 int flags, 1865 md_error_t *ep 1866 ) 1867 { 1868 char *nm; 1869 mdname_t *np; 1870 mddrivename_t *dnp; 1871 ddi_devid_t devidp; 1872 md_set_desc *sd; 1873 1874 if ((sd = metaget_setdesc(sp, ep)) == NULL) { 1875 return (NULL); 1876 } 1877 1878 /* 1879 * Get the devid associated with the key. 1880 * 1881 * If a devid was returned, it MUST be valid even in 1882 * the case where a device id has been "updated". The 1883 * "update" of the device id may have occured due to 1884 * a firmware upgrade. 1885 */ 1886 if ((devidp = meta_getdidbykey(MD_LOCAL_SET, sideno+SKEW, key, ep)) 1887 != NULL) { 1888 /* 1889 * Look for the correct dnp using the devid for comparison. 1890 */ 1891 dnp = meta_getdnp_bydevid(sp, sideno, devidp, key, ep); 1892 free(devidp); 1893 1894 /* dnp could be NULL if the devid could not be decoded. */ 1895 if (dnp == NULL) { 1896 return (NULL); 1897 } 1898 dnp->side_names_key = key; 1899 } else { 1900 /* 1901 * We didn't get a devid. We'll try for a dnp using the 1902 * name. If we have a MN diskset or if the dnp is a did 1903 * device, we're done because then we don't have devids. 1904 * Otherwise we'll try to set the devid 1905 * and get the dnp via devid again. 1906 * We also need to clear the ep structure. When the 1907 * above call to meta_getdidbykey returned a null, it 1908 * also put an error code into ep. 
In this case, the null 1909 * return is actually OK and any errors can be ignored. The 1910 * reason it is OK is because this could be a MN set or 1911 * we could be running without devids (ex cluster). 1912 */ 1913 mdclrerror(ep); 1914 1915 if ((nm = meta_getnmbykey(MD_LOCAL_SET, sideno, key, 1916 ep)) == NULL) 1917 return (NULL); 1918 /* get device name */ 1919 if (flags & PRINT_FAST) { 1920 if ((np = metaname_fast(&sp, nm, 1921 LOGICAL_DEVICE, ep)) == NULL) { 1922 Free(nm); 1923 return (NULL); 1924 } 1925 } else { 1926 if ((np = metaname(&sp, nm, LOGICAL_DEVICE, 1927 ep)) == NULL) { 1928 Free(nm); 1929 return (NULL); 1930 } 1931 } 1932 Free(nm); 1933 /* make sure it's OK */ 1934 if ((! (flags & MD_BASICNAME_OK)) && (metachkcomp(np, 1935 ep) != 0)) 1936 return (NULL); 1937 1938 /* get drivename */ 1939 dnp = np->drivenamep; 1940 dnp->side_names_key = key; 1941 /* 1942 * Skip the devid set/check for the following cases: 1943 * 1) If MN diskset, there are no devid's 1944 * 2) if dnp is did device 1945 * The device id is disabled for did device due to the 1946 * lack of minor name support in the did driver. The following 1947 * devid code path can set and propagate the error and 1948 * eventually prevent did disks from being added to the 1949 * diskset under SunCluster systems 1950 * 1951 * Note that this code can be called through rpc.mdcommd. 1952 * sdssc_version cannot be used because the library won't 1953 * be bound. 1954 */ 1955 if ((strncmp(dnp->rname, "/dev/did/", strlen("/dev/did/")) 1956 == 0) || (MD_MNSET_DESC(sd))) 1957 goto out; 1958 1959 /* 1960 * It is okay if replica is not in devid mode 1961 */ 1962 if (mdissyserror(ep, MDDB_F_NODEVID)) { 1963 mdclrerror(ep); 1964 goto out; 1965 } 1966 1967 /* 1968 * We're not MN or did devices but 1969 * devid is missing so this means that we have 1970 * just upgraded from a configuration where 1971 * devid's were not used so try to add in 1972 * the devid and requery. 
If the devid still isn't there, 1973 * that's OK. dnp->devid will be null as it is in any 1974 * configuration with no devids. 1975 */ 1976 if (meta_setdid(MD_LOCAL_SET, sideno + SKEW, key, ep) < 0) 1977 return (NULL); 1978 if ((devidp = (ddi_devid_t)meta_getdidbykey(MD_LOCAL_SET, 1979 sideno+SKEW, key, ep)) != NULL) { 1980 /* 1981 * Found a devid so look for the dnp using the 1982 * devid as the search mechanism. 1983 */ 1984 dnp = meta_getdnp_bydevid(sp, sideno, devidp, key, ep); 1985 free(devidp); 1986 if (dnp == NULL) { 1987 return (NULL); 1988 } 1989 dnp->side_names_key = key; 1990 } 1991 } 1992 1993 1994 1995 out: 1996 if (flags & MD_BYPASS_DAEMON) 1997 return (dnp); 1998 1999 if (get_sidenmlist(sp, dnp, ep)) 2000 return (NULL); 2001 2002 /* return success */ 2003 return (dnp); 2004 } 2005 2006 void 2007 metafreedrivedesc(md_drive_desc **dd) 2008 { 2009 md_drive_desc *p, *next = NULL; 2010 2011 for (p = *dd; p != NULL; p = next) { 2012 next = p->dd_next; 2013 Free(p); 2014 } 2015 *dd = NULL; 2016 } 2017 2018 md_drive_desc * 2019 metaget_drivedesc( 2020 mdsetname_t *sp, 2021 int flags, 2022 md_error_t *ep 2023 ) 2024 { 2025 side_t sideno = MD_SIDEWILD; 2026 2027 assert(! 
(flags & MD_BYPASS_DAEMON)); 2028 2029 if ((sideno = getmyside(sp, ep)) == MD_SIDEWILD) 2030 return (NULL); 2031 2032 return (metaget_drivedesc_sideno(sp, sideno, flags, ep)); 2033 } 2034 2035 md_drive_desc * 2036 metaget_drivedesc_fromnamelist( 2037 mdsetname_t *sp, 2038 mdnamelist_t *nlp, 2039 md_error_t *ep 2040 ) 2041 { 2042 md_set_desc *sd; 2043 mdnamelist_t *p; 2044 md_drive_desc *dd = NULL; 2045 2046 if ((sd = metaget_setdesc(sp, ep)) == NULL) 2047 return (NULL); 2048 2049 for (p = nlp; p != NULL; p = p->next) 2050 (void) metadrivedesc_append(&dd, p->namep->drivenamep, 0, 0, 2051 sd->sd_ctime, sd->sd_genid, MD_DR_ADD); 2052 2053 return (dd); 2054 } 2055 2056 md_drive_desc * 2057 metaget_drivedesc_sideno( 2058 mdsetname_t *sp, 2059 side_t sideno, 2060 int flags, 2061 md_error_t *ep 2062 ) 2063 { 2064 md_set_desc *sd = NULL; 2065 2066 assert(! (flags & MD_BYPASS_DAEMON)); 2067 2068 if ((sd = metaget_setdesc(sp, ep)) == NULL) 2069 return (NULL); 2070 2071 if (sd->sd_drvs) 2072 return (sd->sd_drvs); 2073 2074 if ((sd->sd_drvs = dr2drivedesc(sp, sideno, flags, ep)) == NULL) 2075 return (NULL); 2076 2077 return (sd->sd_drvs); 2078 } 2079 2080 int 2081 metaget_setownership( 2082 mdsetname_t *sp, 2083 md_error_t *ep 2084 ) 2085 { 2086 md_set_desc *sd; 2087 int bool; 2088 int i; 2089 md_mnnode_desc *nd; 2090 2091 if ((sd = metaget_setdesc(sp, ep)) == NULL) 2092 return (-1); 2093 2094 if (MD_MNSET_DESC(sd)) { 2095 nd = sd->sd_nodelist; 2096 while (nd) { 2097 /* If node isn't alive, can't own diskset */ 2098 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 2099 nd->nd_flags &= ~MD_MN_NODE_OWN; 2100 nd = nd->nd_next; 2101 continue; 2102 } 2103 /* 2104 * If can't communicate with rpc.metad, then mark 2105 * this node as not an owner. That node may 2106 * in fact, be an owner, but without rpc.metad running 2107 * that node can't do much. 
2108 */ 2109 if (clnt_ownset(nd->nd_nodename, sp, &bool, ep) == -1) { 2110 nd->nd_flags &= ~MD_MN_NODE_OWN; 2111 } else if (bool == TRUE) { 2112 nd->nd_flags |= MD_MN_NODE_OWN; 2113 } else { 2114 nd->nd_flags &= ~MD_MN_NODE_OWN; 2115 } 2116 nd = nd->nd_next; 2117 } 2118 return (0); 2119 } 2120 2121 /* Rest of code handles traditional disksets */ 2122 2123 for (i = 0; i < MD_MAXSIDES; i++) 2124 sd->sd_isown[i] = 0; 2125 2126 if (clnt_ownset(mynode(), sp, &bool, ep) == -1) 2127 return (-1); 2128 2129 if (bool == TRUE) 2130 sd->sd_isown[getmyside(sp, ep)] = 1; 2131 2132 return (0); 2133 } 2134 2135 char * 2136 mynode(void) 2137 { 2138 static struct utsname myuname; 2139 static int done = 0; 2140 2141 if (! done) { 2142 if (uname(&myuname) == -1) { 2143 md_perror(dgettext(TEXT_DOMAIN, "uname")); 2144 assert(0); 2145 } 2146 done = 1; 2147 } 2148 return (myuname.nodename); 2149 } 2150 2151 int 2152 strinlst(char *str, int cnt, char **lst) 2153 { 2154 int i; 2155 2156 for (i = 0; i < cnt; i++) 2157 if (strcmp(lst[i], str) == 0) 2158 return (TRUE); 2159 2160 return (FALSE); 2161 } 2162 2163 /* 2164 * meta_get_reserved_names 2165 * returns an mdnamelist_t of reserved slices 2166 * reserved slices are those that are used but don't necessarily 2167 * show up as metadevices (ex. 
reserved slice for db in sets, logs) 2168 */ 2169 2170 /*ARGSUSED*/ 2171 int 2172 meta_get_reserved_names( 2173 mdsetname_t *sp, 2174 mdnamelist_t **nlpp, 2175 int options, 2176 md_error_t *ep) 2177 { 2178 int count = 0; 2179 mdname_t *np = NULL; 2180 mdnamelist_t *transnlp = NULL; 2181 mdnamelist_t **tailpp = nlpp; 2182 mdnamelist_t *nlp; 2183 md_drive_desc *dd, *di; 2184 2185 if (metaislocalset(sp)) 2186 goto out; 2187 2188 if (!(dd = metaget_drivedesc(sp, MD_BASICNAME_OK, ep)) && !mdisok(ep)) { 2189 count = -1; 2190 goto out; 2191 } 2192 2193 /* db in for sets on reserved slice */ 2194 for (di = dd; di && count >= 0; di = di->dd_next) { 2195 uint_t rep_slice; 2196 2197 /* 2198 * Add the name struct to the end of the 2199 * namelist but keep a pointer to the last 2200 * element so that we don't incur the overhead 2201 * of traversing the list each time 2202 */ 2203 if (di->dd_dnp && 2204 (meta_replicaslice(di->dd_dnp, &rep_slice, ep) == 0) && 2205 (np = metaslicename(di->dd_dnp, rep_slice, ep)) && 2206 (tailpp = meta_namelist_append_wrapper(tailpp, np))) 2207 count++; 2208 else 2209 count = -1; 2210 } 2211 2212 /* now find logs */ 2213 if (meta_get_trans_names(sp, &transnlp, 0, ep) < 0) { 2214 count = -1; 2215 goto out; 2216 } 2217 2218 for (nlp = transnlp; (nlp != NULL); nlp = nlp->next) { 2219 mdname_t *transnp = nlp->namep; 2220 md_trans_t *transp; 2221 2222 if ((transp = meta_get_trans(sp, transnp, ep)) == NULL) { 2223 count = -1; 2224 goto out; 2225 } 2226 if (transp->lognamep) { 2227 /* 2228 * Add the name struct to the end of the 2229 * namelist but keep a pointer to the last 2230 * element so that we don't incur the overhead 2231 * of traversing the list each time 2232 */ 2233 tailpp = meta_namelist_append_wrapper( 2234 tailpp, transp->lognamep); 2235 } 2236 } 2237 out: 2238 metafreenamelist(transnlp); 2239 return (count); 2240 } 2241 2242 /* 2243 * Entry point to join a node to MultiNode diskset. 2244 * 2245 * Validate host in diskset. 
2246 * - Should be in membership list from API 2247 * - Should not already be joined into diskset. 2248 * - Set must have drives 2249 * Assume valid configuration is stored in the set/drive/node records 2250 * in the local mddb since no node or drive can be added to the MNset 2251 * unless all drives and nodes are available. Reconfig steps will 2252 * resync all ALIVE nodes in case of panic in critical areas. 2253 * 2254 * Lock down the set. 2255 * Verify host is a member of this diskset. 2256 * If drives exist in the configuration, load the mddbs. 2257 * Set this node to active by notifying master if one exists. 2258 * If this is the first node active in the diskset, this node 2259 * becomes the master. 2260 * Unlock the set. 2261 * 2262 * Mirror Resync: 2263 * If this node is the last node to join the set and clustering 2264 * isn't running, then start the 'metasync -r' type resync 2265 * on all mirrors in this diskset. 2266 * If clustering is running, this resync operation will 2267 * be handled by the reconfig steps and should NOT 2268 * be handled during a join operation. 2269 * 2270 * There are multiple return values in order to assist 2271 * the join operation of all sets in the metaset command. 2272 * 2273 * Return values: 2274 * 0 - Node successfully joined to set. 2275 * -1 - Join attempted but failed 2276 * - any failure from libmeta calls 2277 * - node not in the member list 2278 * -2 - Join not attempted since 2279 * - this set had no drives in set 2280 * - this node already joined to set 2281 * - set is not a multinode set 2282 * -3 - Node joined to STALE set. 
2283 */ 2284 extern int 2285 meta_set_join( 2286 mdsetname_t *sp, 2287 md_error_t *ep 2288 ) 2289 { 2290 md_set_desc *sd; 2291 md_drive_desc *dd; 2292 md_mnnode_desc *nd, *nd2, my_nd; 2293 int rval = 0; 2294 md_setkey_t *cl_sk; 2295 md_error_t xep = mdnullerror; 2296 md_error_t ep_snarf = mdnullerror; 2297 int master_flag = 0; 2298 md_mnset_record *mas_mnsr = NULL; 2299 int clear_nr_flags = 0; 2300 md_mnnode_record *nr; 2301 int stale_set = 0; 2302 int rb_flags = 0; 2303 int stale_bool = FALSE; 2304 int suspendall_flag = 0; 2305 int suspend1_flag = 0; 2306 sigset_t oldsigs; 2307 int send_reinit = 0; 2308 2309 if ((sd = metaget_setdesc(sp, ep)) == NULL) { 2310 return (-1); 2311 } 2312 2313 /* Must be a multinode diskset */ 2314 if (!MD_MNSET_DESC(sd)) { 2315 (void) mderror(ep, MDE_NOT_MN, sp->setname); 2316 return (-2); 2317 } 2318 2319 /* Verify that the node is ALIVE (i.e. is in the API membership list) */ 2320 if (!(sd->sd_mn_mynode->nd_flags & MD_MN_NODE_ALIVE)) { 2321 (void) mddserror(ep, MDE_DS_NOTINMEMBERLIST, sp->setno, 2322 sd->sd_mn_mynode->nd_nodename, NULL, sp->setname); 2323 return (-1); 2324 } 2325 2326 /* Make sure we are blocking all signals */ 2327 if (procsigs(TRUE, &oldsigs, &xep) < 0) 2328 mdclrerror(&xep); 2329 2330 /* 2331 * Lock the set on current set members. 2332 * For MN diskset lock_set and SUSPEND are used to protect against 2333 * other meta* commands running on the other nodes. 2334 */ 2335 nd = sd->sd_nodelist; 2336 while (nd) { 2337 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 2338 nd = nd->nd_next; 2339 continue; 2340 } 2341 if (clnt_lock_set(nd->nd_nodename, sp, ep)) { 2342 rval = -1; 2343 goto out; 2344 } 2345 nd = nd->nd_next; 2346 } 2347 2348 /* 2349 * Lock out other meta* commands by suspending 2350 * class 1 messages across the diskset. 
2351 */ 2352 nd = sd->sd_nodelist; 2353 while (nd) { 2354 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 2355 nd = nd->nd_next; 2356 continue; 2357 } 2358 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND, 2359 sp, MD_MSG_CLASS1, MD_MSCF_NO_FLAGS, ep)) { 2360 rval = -1; 2361 goto out; 2362 } 2363 suspend1_flag = 1; 2364 nd = nd->nd_next; 2365 } 2366 2367 /* 2368 * Verify that this host is a member (in the host list) of the set. 2369 */ 2370 nd = sd->sd_nodelist; 2371 while (nd) { 2372 if (strcmp(mynode(), nd->nd_nodename) == 0) { 2373 break; 2374 } 2375 nd = nd->nd_next; 2376 } 2377 if (!nd) { 2378 (void) mddserror(ep, MDE_DS_NODENOTINSET, sp->setno, 2379 sd->sd_mn_mynode->nd_nodename, NULL, 2380 sp->setname); 2381 rval = -1; 2382 goto out; 2383 } 2384 2385 /* 2386 * Need to return failure if host is already 'joined' 2387 * into the set. This is done so that if later the user 2388 * issues a command to join all sets and a failure is 2389 * encountered - that the resulting cleanup effort 2390 * (withdrawing from all sets that were joined 2391 * during that command) won't withdraw from this set. 2392 */ 2393 if (nd->nd_flags & MD_MN_NODE_OWN) { 2394 rval = -2; 2395 goto out2; 2396 } 2397 2398 /* 2399 * Call metaget_setownership that calls each node in diskset and 2400 * marks in set descriptor if node is an owner of the set or not. 2401 * metaget_setownership checks to see if a node is an owner by 2402 * checking to see if that node's kernel has the mddb loaded. 2403 * If a node had panic'd during a reconfig or an 2404 * add/delete/join/withdraw operation, the other nodes' node 2405 * records may not reflect the current state of the diskset, 2406 * so calling metaget_setownership is the safest thing to do. 2407 */ 2408 if (metaget_setownership(sp, ep) == -1) { 2409 rval = -1; 2410 goto out; 2411 } 2412 2413 /* If first active member of diskset, become the master. 
*/ 2414 nd = sd->sd_nodelist; 2415 while (nd) { 2416 if (nd->nd_flags & MD_MN_NODE_OWN) 2417 break; 2418 nd = nd->nd_next; 2419 } 2420 if (nd == NULL) 2421 master_flag = 1; 2422 2423 /* 2424 * If not first active member of diskset, then get the 2425 * master information from a node that is already joined 2426 * and set the master information for this node. Be sure 2427 * that this node (the already joined node) has its own 2428 * join flag set. If not, then this diskset isn't currently 2429 * consistent and shouldn't allow a node to join. This diskset 2430 * inconsistency should only occur when a node has panic'd in 2431 * the set while doing a metaset operation and the sysadmin is 2432 * attempting to join a node into the set. This inconsistency 2433 * will be fixed during a reconfig cycle which should be occurring 2434 * soon since a node panic'd. 2435 * 2436 * If unable to get this information from an owning node, then 2437 * this diskset isn't currently consistent and shouldn't 2438 * allow a node to join. 2439 */ 2440 if (!master_flag) { 2441 /* get master information from an owner (joined) node */ 2442 if (clnt_mngetset(nd->nd_nodename, sp->setname, 2443 sp->setno, &mas_mnsr, ep) == -1) { 2444 rval = -1; 2445 goto out; 2446 } 2447 2448 /* Verify that owner (joined) node has its own JOIN flag set */ 2449 nr = mas_mnsr->sr_nodechain; 2450 while (nr) { 2451 if ((nd->nd_nodeid == nr->nr_nodeid) && 2452 ((nr->nr_flags & MD_MN_NODE_OWN) == NULL)) { 2453 (void) mddserror(ep, MDE_DS_NODENOSET, 2454 sp->setno, nd->nd_nodename, NULL, 2455 nd->nd_nodename); 2456 free_sr((md_set_record *)mas_mnsr); 2457 rval = -1; 2458 goto out; 2459 } 2460 nr = nr->nr_next; 2461 } 2462 2463 /* 2464 * Does master have set marked as STALE? 2465 * If so, need to pass this down to kernel when 2466 * this node snarfs the set. 
2467 */ 2468 if (clnt_mn_is_stale(nd->nd_nodename, sp, 2469 &stale_bool, ep) == -1) { 2470 rval = -1; 2471 goto out; 2472 } 2473 2474 /* set master information in my rpc.metad's set record */ 2475 if (clnt_mnsetmaster(mynode(), sp, mas_mnsr->sr_master_nodenm, 2476 mas_mnsr->sr_master_nodeid, ep)) { 2477 free_sr((md_set_record *)mas_mnsr); 2478 rval = -1; 2479 goto out; 2480 } 2481 2482 /* set master information in my cached set desc */ 2483 (void) strcpy(sd->sd_mn_master_nodenm, 2484 mas_mnsr->sr_master_nodenm); 2485 sd->sd_mn_master_nodeid = mas_mnsr->sr_master_nodeid; 2486 nd2 = sd->sd_nodelist; 2487 while (nd2) { 2488 if (nd2->nd_nodeid == mas_mnsr->sr_master_nodeid) { 2489 sd->sd_mn_masternode = nd2; 2490 break; 2491 } 2492 nd2 = nd2->nd_next; 2493 } 2494 free_sr((md_set_record *)mas_mnsr); 2495 2496 /* 2497 * Set the node flags in mynode's rpc.metad node records for 2498 * the nodes that are in the diskset. Can use my sd 2499 * since earlier call to metaget_setownership set the 2500 * owner flags based on whether that node had snarfed 2501 * the MN diskset mddb. Reconfig steps guarantee that 2502 * return of metaget_setownership will match the owning 2503 * node's owner list except in the case where a node 2504 * has just panic'd and in this case, a reconfig will 2505 * be starting immediately and the owner lists will 2506 * be sync'd up by the reconfig. 2507 * 2508 * Flag of SET means to take no action except to 2509 * set the node flags as given in the nodelist linked list. 2510 */ 2511 if (clnt_upd_nr_flags(mynode(), sp, sd->sd_nodelist, 2512 MD_NR_SET, NULL, ep)) { 2513 rval = -1; 2514 goto out; 2515 } 2516 } 2517 2518 /* 2519 * Read in the mddb if there are drives in the set. 2520 */ 2521 if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST), 2522 ep)) == NULL) { 2523 /* No drives in list */ 2524 if (! 
mdisok(ep)) { 2525 rval = -1; 2526 goto out; 2527 } 2528 rval = -2; 2529 goto out; 2530 } 2531 2532 /* 2533 * Notify rpc.mdcommd on all nodes of a nodelist change. 2534 * Start by suspending rpc.mdcommd (which drains it of all messages), 2535 * then change the nodelist followed by a reinit and resume. 2536 */ 2537 nd = sd->sd_nodelist; 2538 while (nd) { 2539 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 2540 nd = nd->nd_next; 2541 continue; 2542 } 2543 2544 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND, sp, 2545 MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, ep)) { 2546 rval = -1; 2547 goto out; 2548 } 2549 suspendall_flag = 1; 2550 nd = nd->nd_next; 2551 } 2552 2553 /* Set master in my set record in rpc.metad */ 2554 if (master_flag) { 2555 if (clnt_mnsetmaster(mynode(), sp, 2556 sd->sd_mn_mynode->nd_nodename, 2557 sd->sd_mn_mynode->nd_nodeid, ep)) { 2558 rval = -1; 2559 goto out; 2560 } 2561 } 2562 /* 2563 * Causes mddbs to be loaded into the kernel. 2564 * Set the force flag so that replica locations can be 2565 * loaded into the kernel even if a mediator node was 2566 * unavailable. This allows a node to join an MO 2567 * diskset when there are sufficient replicas available, 2568 * but a mediator node in unavailable. 2569 */ 2570 if (setup_db_bydd(sp, dd, TRUE, ep) == -1) { 2571 mde_perror(ep, dgettext(TEXT_DOMAIN, 2572 "Host not able to start diskset.")); 2573 rval = -1; 2574 goto out; 2575 } 2576 2577 if (! mdisok(ep)) { 2578 rval = -1; 2579 goto out; 2580 } 2581 2582 /* 2583 * Set rollback flags to 1 so that halt_set is called if a failure 2584 * is seen after this point. If snarf_set fails, still need to 2585 * call halt_set to cleanup the diskset. 2586 */ 2587 rb_flags = 1; 2588 2589 /* Starts the set */ 2590 if (snarf_set(sp, stale_bool, ep) != 0) { 2591 if (mdismddberror(ep, MDE_DB_STALE)) { 2592 /* 2593 * Don't fail join, STALE means that set has 2594 * < 50% mddbs. 
2595 */ 2596 (void) mdstealerror(&ep_snarf, ep); 2597 stale_set = 1; 2598 } else if (mdisok(ep)) { 2599 /* If snarf failed, but no error was set - set it */ 2600 (void) mdmddberror(ep, MDE_DB_NOTNOW, (minor_t)NODEV64, 2601 sp->setno, 0, NULL); 2602 rval = -1; 2603 goto out; 2604 } else if (!(mdismddberror(ep, MDE_DB_ACCOK))) { 2605 /* 2606 * Don't fail join if ACCOK; ACCOK means that mediator 2607 * provided extra vote. 2608 */ 2609 rval = -1; 2610 goto out; 2611 } 2612 } 2613 2614 /* Did set really get snarfed? */ 2615 if (own_set(sp, NULL, TRUE, ep) == MD_SETOWNER_NO) { 2616 if (mdisok(ep)) { 2617 /* If snarf failed, but no error was set - set it */ 2618 (void) mdmddberror(ep, MDE_DB_NOTNOW, (minor_t)NODEV64, 2619 sp->setno, 0, NULL); 2620 } 2621 mde_perror(ep, dgettext(TEXT_DOMAIN, 2622 "Host not able to start diskset.")); 2623 rval = -1; 2624 goto out; 2625 } 2626 2627 /* Change to nodelist so need to send reinit to rpc.mdcommd */ 2628 send_reinit = 1; 2629 2630 /* If first node to enter set, setup master and clear change log */ 2631 if (master_flag) { 2632 /* Set master in my locally cached set descriptor */ 2633 (void) strcpy(sd->sd_mn_master_nodenm, 2634 sd->sd_mn_mynode->nd_nodename); 2635 sd->sd_mn_master_nodeid = sd->sd_mn_mynode->nd_nodeid; 2636 sd->sd_mn_am_i_master = 1; 2637 2638 /* 2639 * If first node to join set, then clear out change log 2640 * entries. Change log entries are only needed when a 2641 * change of master is occurring in a diskset that has 2642 * multiple owners. Since this node is the first owner 2643 * of the diskset, clear the entries. 2644 * 2645 * Only do this if we are in a single node non-SC3.x 2646 * situation. 
2647 */ 2648 if (meta_mn_singlenode() && 2649 mdmn_reset_changelog(sp, ep, MDMN_CLF_RESETLOG) != 0) { 2650 mde_perror(ep, dgettext(TEXT_DOMAIN, 2651 "Unable to reset changelog.")); 2652 rval = -1; 2653 goto out; 2654 } 2655 } 2656 2657 /* Set my locally cached flag */ 2658 sd->sd_mn_mynode->nd_flags |= MD_MN_NODE_OWN; 2659 2660 /* 2661 * Set this node's own flag on all joined nodes in the set 2662 * (including my node). 2663 */ 2664 clear_nr_flags = 1; 2665 2666 my_nd = *(sd->sd_mn_mynode); 2667 my_nd.nd_next = NULL; 2668 nd = sd->sd_nodelist; 2669 while (nd) { 2670 if (!(nd->nd_flags & MD_MN_NODE_OWN)) { 2671 nd = nd->nd_next; 2672 continue; 2673 } 2674 if (clnt_upd_nr_flags(nd->nd_nodename, sp, &my_nd, 2675 MD_NR_JOIN, NULL, ep)) { 2676 rval = -1; 2677 goto out; 2678 } 2679 nd = nd->nd_next; 2680 } 2681 2682 out: 2683 if (rval != NULL) { 2684 /* 2685 * If rollback flag is 1, then node was joined to set. 2686 * Since an error occurred, withdraw node from set in 2687 * order to rollback to before command was run. 2688 * Need to preserve ep so that calling function can 2689 * get error information. 2690 */ 2691 if (rb_flags == 1) { 2692 if (halt_set(sp, &xep)) { 2693 mdclrerror(&xep); 2694 } 2695 } 2696 2697 /* 2698 * If error, reset master to INVALID. 2699 * Ignore error since (next) first node to successfully join 2700 * will set master on all nodes. 2701 */ 2702 (void) clnt_mnsetmaster(mynode(), sp, "", 2703 MD_MN_INVALID_NID, &xep); 2704 mdclrerror(&xep); 2705 /* Reset master in my locally cached set descriptor */ 2706 sd->sd_mn_master_nodeid = MD_MN_INVALID_NID; 2707 sd->sd_mn_am_i_master = 0; 2708 2709 /* 2710 * If nr flags set on other nodes, reset them. 
2711 */ 2712 if (clear_nr_flags) { 2713 nd = sd->sd_nodelist; 2714 while (nd) { 2715 if (!(nd->nd_flags & MD_MN_NODE_OWN)) { 2716 nd = nd->nd_next; 2717 continue; 2718 } 2719 (void) clnt_upd_nr_flags(nd->nd_nodename, sp, 2720 &my_nd, MD_NR_WITHDRAW, NULL, &xep); 2721 mdclrerror(&xep); 2722 nd = nd->nd_next; 2723 } 2724 /* Reset my locally cached flag */ 2725 sd->sd_mn_mynode->nd_flags &= ~MD_MN_NODE_OWN; 2726 } 2727 } 2728 2729 /* 2730 * Notify rpc.mdcommd on all nodes of a nodelist change. 2731 * Send reinit command to mdcommd which forces it to get 2732 * fresh set description. 2733 */ 2734 if (send_reinit) { 2735 /* Send reinit */ 2736 nd = sd->sd_nodelist; 2737 while (nd) { 2738 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 2739 nd = nd->nd_next; 2740 continue; 2741 } 2742 2743 /* Class is ignored for REINIT */ 2744 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT, 2745 sp, NULL, MD_MSCF_NO_FLAGS, &xep)) { 2746 /* 2747 * We are here because we failed to resume 2748 * rpc.mdcommd. However we potentially have 2749 * an error from the previous call 2750 * If the previous call did fail, we capture 2751 * that error and generate a perror with 2752 * the string, "Unable to resume...". 2753 * Setting rval to -1 ensures that in the 2754 * next iteration of the loop, ep is not 2755 * clobbered. 2756 */ 2757 if (rval == 0) 2758 (void) mdstealerror(ep, &xep); 2759 else 2760 mdclrerror(&xep); 2761 rval = -1; 2762 mde_perror(ep, dgettext(TEXT_DOMAIN, 2763 "Unable to reinit rpc.mdcommd.")); 2764 } 2765 nd = nd->nd_next; 2766 } 2767 2768 } 2769 2770 out2: 2771 /* 2772 * Unlock diskset by resuming messages across the diskset. 2773 * Just resume all classes so that resume is the same whether 2774 * just one class was locked or all classes were locked. 
2775 */ 2776 if ((suspend1_flag) || (suspendall_flag)) { 2777 nd = sd->sd_nodelist; 2778 while (nd) { 2779 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 2780 nd = nd->nd_next; 2781 continue; 2782 } 2783 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME, 2784 sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) { 2785 /* 2786 * We are here because we failed to resume 2787 * rpc.mdcommd. However we potentially have 2788 * an error from the previous call 2789 * If the previous call did fail, we capture 2790 * that error and generate a perror with 2791 * the string, "Unable to resume...". 2792 * Setting rval to -1 ensures that in the 2793 * next iteration of the loop, ep is not 2794 * clobbered. 2795 */ 2796 if (rval == 0) 2797 (void) mdstealerror(ep, &xep); 2798 else 2799 mdclrerror(&xep); 2800 rval = -1; 2801 mde_perror(ep, dgettext(TEXT_DOMAIN, 2802 "Unable to resume rpc.mdcommd.")); 2803 } 2804 nd = nd->nd_next; 2805 } 2806 meta_ping_mnset(sp->setno); 2807 } 2808 2809 /* 2810 * Unlock set. This flushes the caches on the servers. 2811 */ 2812 cl_sk = cl_get_setkey(sp->setno, sp->setname); 2813 nd = sd->sd_nodelist; 2814 while (nd) { 2815 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 2816 nd = nd->nd_next; 2817 continue; 2818 } 2819 if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) { 2820 if (rval == 0) 2821 (void) mdstealerror(ep, &xep); 2822 else 2823 mdclrerror(&xep); 2824 rval = -1; 2825 } 2826 nd = nd->nd_next; 2827 } 2828 2829 /* 2830 * If this node is the last to join the diskset and clustering isn't 2831 * running, then resync the mirrors in the diskset. We have to wait 2832 * until all nodes are joined so that the status gets propagated to 2833 * all of the members of the set. 2834 * Ignore any error from the resync as the join function shouldn't fail 2835 * because the mirror resync had a problem. 2836 * 2837 * Don't start resync if set is stale. 
2838 */ 2839 if ((rval == 0) && (sdssc_bind_library() != SDSSC_OKAY) && 2840 (stale_set != 1)) { 2841 nd = sd->sd_nodelist; 2842 while (nd) { 2843 if (!(nd->nd_flags & MD_MN_NODE_OWN)) 2844 break; 2845 nd = nd->nd_next; 2846 } 2847 /* 2848 * nd set to NULL means that we have no nodes in the set that 2849 * haven't joined. In this case we start the resync. 2850 */ 2851 if (nd == NULL) { 2852 (void) meta_mirror_resync_all(sp, 0, &xep); 2853 mdclrerror(&xep); 2854 } 2855 } 2856 2857 /* Update ABR state for all soft partitions */ 2858 (void) meta_sp_update_abr(sp, &xep); 2859 mdclrerror(&xep); 2860 2861 /* 2862 * call metaflushsetnames to reset local cache for master and 2863 * node information. 2864 */ 2865 metaflushsetname(sp); 2866 2867 /* release signals back to what they were on entry */ 2868 if (procsigs(FALSE, &oldsigs, &xep) < 0) 2869 mdclrerror(&xep); 2870 2871 /* 2872 * If no error and stale_set is set, then set ep back 2873 * to ep from snarf_set call and return -3. If another error 2874 * occurred and rval is not 0, then that error would have 2875 * caused the node to be withdrawn from the set and would 2876 * have set ep to that error information. 2877 */ 2878 if ((rval == 0) && (stale_set)) { 2879 (void) mdstealerror(ep, &ep_snarf); 2880 return (-3); 2881 } 2882 2883 return (rval); 2884 } 2885 2886 /* 2887 * Entry point to withdraw a node from MultiNode diskset. 2888 * 2889 * Validate host in diskset. 2890 * - Should be joined into diskset. 2891 * Assume valid configuration is stored in the set/drive/node records 2892 * in the local mddb since no node or drive can be added to the MNset 2893 * unless all drives and nodes are available. Reconfig steps will 2894 * resync all ALIVE nodes in case of panic in critical areas. 2895 * 2896 * Lock down the set. 2897 * Verify that drives exist in configuration. 2898 * Verify host is a member of this diskset. 2899 * Verify host is an owner of the diskset (host is joined to diskset). 
2900 * Only allow withdrawal of master node if master node is the only joined 2901 * in the diskset. 2902 * Halt the diskset on this node. 2903 * Reset Master on this node. 2904 * Updated node flags that this node with withdrawn. 2905 * Unlock the set. 2906 * 2907 * Return values: 2908 * 0 - Node successfully withdrew from set. 2909 * -1 - Withdrawal attempted but failed 2910 * - any failure from libmeta calls 2911 * - node not in the member list 2912 * -2 - Withdrawal not attempted since 2913 * - this set had no drives in set 2914 * - this node not joined to set 2915 * - set is not a multinode set 2916 */ 2917 extern int 2918 meta_set_withdraw( 2919 mdsetname_t *sp, 2920 md_error_t *ep 2921 ) 2922 { 2923 md_set_desc *sd; 2924 md_drive_desc *dd = 0; 2925 md_mnnode_desc *nd, my_nd; 2926 int rval = 0; 2927 md_setkey_t *cl_sk; 2928 md_error_t xep = mdnullerror; 2929 int set_halted = 0; 2930 int suspendall_flag = 0; 2931 int suspend1_flag = 0; 2932 bool_t stale_bool = FALSE; 2933 mddb_config_t c; 2934 int node_id_list[1]; 2935 sigset_t oldsigs; 2936 int send_reinit = 0; 2937 2938 if ((sd = metaget_setdesc(sp, ep)) == NULL) { 2939 return (-1); 2940 } 2941 2942 /* Must be a multinode diskset */ 2943 if (!MD_MNSET_DESC(sd)) { 2944 (void) mderror(ep, MDE_NOT_MN, sp->setname); 2945 return (-1); 2946 } 2947 2948 /* Make sure we are blocking all signals */ 2949 if (procsigs(TRUE, &oldsigs, &xep) < 0) 2950 mdclrerror(&xep); 2951 2952 /* 2953 * Lock the set on current set members. 2954 * For MN diskset lock_set and SUSPEND are used to protect against 2955 * other meta* commands running on the other nodes. 2956 */ 2957 nd = sd->sd_nodelist; 2958 while (nd) { 2959 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 2960 nd = nd->nd_next; 2961 continue; 2962 } 2963 if (clnt_lock_set(nd->nd_nodename, sp, ep)) { 2964 rval = -1; 2965 goto out; 2966 } 2967 nd = nd->nd_next; 2968 } 2969 /* 2970 * Lock out other meta* commands by suspending 2971 * class 1 messages across the diskset. 
2972 */ 2973 nd = sd->sd_nodelist; 2974 while (nd) { 2975 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 2976 nd = nd->nd_next; 2977 continue; 2978 } 2979 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND, 2980 sp, MD_MSG_CLASS1, MD_MSCF_NO_FLAGS, ep)) { 2981 rval = -1; 2982 goto out; 2983 } 2984 suspend1_flag = 1; 2985 nd = nd->nd_next; 2986 } 2987 2988 /* Get list of drives - needed in case of failure */ 2989 if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST), 2990 ep)) == NULL) { 2991 /* Error getting drives in list */ 2992 if (! mdisok(ep)) { 2993 rval = -1; 2994 goto out2; 2995 } 2996 /* no drives in list */ 2997 rval = -2; 2998 goto out2; 2999 } 3000 3001 /* 3002 * Verify that this host is a member (in the host list) of the set. 3003 */ 3004 nd = sd->sd_nodelist; 3005 while (nd) { 3006 if (strcmp(mynode(), nd->nd_nodename) == 0) { 3007 break; 3008 } 3009 nd = nd->nd_next; 3010 } 3011 if (!nd) { 3012 (void) mddserror(ep, MDE_DS_NODENOTINSET, sp->setno, 3013 sd->sd_mn_mynode->nd_nodename, NULL, 3014 sp->setname); 3015 rval = -1; 3016 goto out2; 3017 } 3018 3019 /* 3020 * Call metaget_setownership that calls each node in diskset and 3021 * marks in set descriptor if node is an owner of the set or not. 3022 * metaget_setownership checks to see if a node is an owner by 3023 * checking to see if that node's kernel has the mddb loaded. 3024 * If a node had panic'd during a reconfig or an 3025 * add/delete/join/withdraw operation, the other nodes' node 3026 * records may not reflect the current state of the diskset, 3027 * so calling metaget_setownership is the safest thing to do. 3028 */ 3029 if (metaget_setownership(sp, ep) == -1) { 3030 rval = -1; 3031 goto out2; 3032 } 3033 3034 /* 3035 * Verify that this node is joined 3036 * to diskset (i.e. is an owner of the diskset). 
3037 */ 3038 if (!(sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN)) { 3039 rval = -2; 3040 goto out2; 3041 } 3042 3043 /* 3044 * For a MN diskset, only withdraw master if it is 3045 * the only joined node. 3046 */ 3047 if (sd->sd_mn_master_nodeid == sd->sd_mn_mynode->nd_nodeid) { 3048 nd = sd->sd_nodelist; 3049 while (nd) { 3050 /* Skip my node since checking for other owners */ 3051 if (nd->nd_nodeid == sd->sd_mn_master_nodeid) { 3052 nd = nd->nd_next; 3053 continue; 3054 } 3055 /* If another owner node if found, error */ 3056 if (nd->nd_flags & MD_MN_NODE_OWN) { 3057 (void) mddserror(ep, MDE_DS_WITHDRAWMASTER, 3058 sp->setno, 3059 sd->sd_mn_mynode->nd_nodename, NULL, 3060 sp->setname); 3061 rval = -1; 3062 goto out2; 3063 } 3064 nd = nd->nd_next; 3065 } 3066 } 3067 3068 /* 3069 * Is current set STALE? 3070 */ 3071 (void) memset(&c, 0, sizeof (c)); 3072 c.c_id = 0; 3073 c.c_setno = sp->setno; 3074 if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) { 3075 (void) mdstealerror(ep, &c.c_mde); 3076 rval = -1; 3077 goto out; 3078 } 3079 if (c.c_flags & MDDB_C_STALE) { 3080 stale_bool = TRUE; 3081 } 3082 3083 /* 3084 * Notify rpc.mdcommd on all nodes of a nodelist change. 3085 * Start by suspending rpc.mdcommd (which drains it of all messages), 3086 * then change the nodelist followed by a reinit and resume. 3087 */ 3088 nd = sd->sd_nodelist; 3089 while (nd) { 3090 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 3091 nd = nd->nd_next; 3092 continue; 3093 } 3094 3095 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND, 3096 sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, ep)) { 3097 rval = -1; 3098 goto out; 3099 } 3100 suspendall_flag = 1; 3101 nd = nd->nd_next; 3102 } 3103 3104 /* 3105 * Withdraw the set - halt set. 3106 * This will fail if any I/O is occuring to any metadevice which 3107 * includes a resync to a mirror metadevice. 3108 */ 3109 set_halted = 1; 3110 if (halt_set(sp, ep)) { 3111 /* Was set actually halted? 
*/ 3112 if (own_set(sp, NULL, TRUE, ep) == MD_SETOWNER_YES) { 3113 set_halted = 0; 3114 } 3115 rval = -1; 3116 goto out; 3117 } 3118 3119 /* Change to nodelist so need to send reinit to rpc.mdcommd */ 3120 send_reinit = 1; 3121 3122 /* Reset master on withdrawn node */ 3123 if (clnt_mnsetmaster(sd->sd_mn_mynode->nd_nodename, sp, "", 3124 MD_MN_INVALID_NID, ep)) { 3125 rval = -1; 3126 goto out; 3127 } 3128 3129 /* Mark my node as withdrawn and send to other nodes */ 3130 nd = sd->sd_nodelist; 3131 my_nd = *(sd->sd_mn_mynode); /* structure copy */ 3132 my_nd.nd_next = NULL; 3133 while (nd) { 3134 if (!(nd->nd_flags & MD_MN_NODE_OWN)) { 3135 nd = nd->nd_next; 3136 continue; 3137 } 3138 if (clnt_upd_nr_flags(nd->nd_nodename, sp, &my_nd, 3139 MD_NR_WITHDRAW, NULL, ep)) { 3140 rval = -1; 3141 goto out; 3142 } 3143 nd = nd->nd_next; 3144 } 3145 3146 /* 3147 * If withdrawn node is a mirror owner, reset mirror owner 3148 * to NULL. If an error occurs, print a warning and continue. 3149 * Don't fail metaset because of mirror owner reset problem since 3150 * next node to grab mirror will resolve this issue. 3151 * Before next node grabs mirrors, metaset will show the withdrawn 3152 * node as owner which is why an attempt to reset the mirror owner 3153 * is made. 
3154 */ 3155 node_id_list[0] = sd->sd_mn_mynode->nd_nodeid; /* Setup my nodeid */ 3156 nd = sd->sd_nodelist; 3157 while (nd) { 3158 if (!(nd->nd_flags & MD_MN_NODE_OWN)) { 3159 nd = nd->nd_next; 3160 continue; 3161 } 3162 if (clnt_reset_mirror_owner(nd->nd_nodename, sp, 3163 1, &node_id_list[0], &xep) == 01) { 3164 mde_perror(&xep, dgettext(TEXT_DOMAIN, 3165 "Unable to reset mirror owner on node %s"), 3166 nd->nd_nodename); 3167 mdclrerror(&xep); 3168 } 3169 nd = nd->nd_next; 3170 } 3171 3172 out: 3173 if (rval == -1) { 3174 /* Rejoin node - Mark node as joined and send to other nodes */ 3175 nd = sd->sd_nodelist; 3176 my_nd = *(sd->sd_mn_mynode); /* structure copy */ 3177 my_nd.nd_next = NULL; 3178 while (nd) { 3179 if (!(nd->nd_flags & MD_MN_NODE_OWN)) { 3180 nd = nd->nd_next; 3181 continue; 3182 } 3183 if (clnt_upd_nr_flags(nd->nd_nodename, sp, &my_nd, 3184 MD_NR_JOIN, NULL, &xep)) { 3185 mdclrerror(&xep); 3186 } 3187 nd = nd->nd_next; 3188 } 3189 3190 /* Set master on withdrawn node */ 3191 if (clnt_mnsetmaster(sd->sd_mn_mynode->nd_nodename, sp, 3192 sd->sd_mn_master_nodenm, 3193 sd->sd_mn_master_nodeid, &xep)) { 3194 mdclrerror(&xep); 3195 } 3196 3197 /* Join set if halt_set had succeeded */ 3198 if (set_halted) { 3199 /* 3200 * Causes mddbs to be loaded into the kernel. 3201 * Set the force flag so that replica locations can be 3202 * loaded into the kernel even if a mediator node was 3203 * unavailable. This allows a node to join an MO 3204 * diskset when there are sufficient replicas available, 3205 * but a mediator node in unavailable. 3206 */ 3207 if (setup_db_bydd(sp, dd, TRUE, &xep) == -1) { 3208 mdclrerror(&xep); 3209 } 3210 /* If set previously stale - make it so at re-join */ 3211 if (snarf_set(sp, stale_bool, &xep) != 0) { 3212 mdclrerror(&xep); 3213 (void) halt_set(sp, &xep); 3214 mdclrerror(&xep); 3215 } 3216 } 3217 } 3218 3219 /* 3220 * Notify rpc.mdcommd on all nodes of a nodelist change. 
3221 * Send reinit command to mdcommd which forces it to get 3222 * fresh set description. 3223 */ 3224 if (send_reinit) { 3225 /* Send reinit */ 3226 nd = sd->sd_nodelist; 3227 while (nd) { 3228 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 3229 nd = nd->nd_next; 3230 continue; 3231 } 3232 3233 /* Class is ignored for REINIT */ 3234 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT, 3235 sp, NULL, MD_MSCF_NO_FLAGS, &xep)) { 3236 /* 3237 * We are here because we failed to resume 3238 * rpc.mdcommd. However we potentially have 3239 * an error from the previous call. 3240 * If the previous call did fail, we 3241 * capture that error and generate a perror 3242 * withthe string, "Unable to resume...". 3243 * Setting rval to -1 ensures that in the 3244 * next iteration of the loop, ep is not 3245 * clobbered. 3246 */ 3247 if (rval == 0) 3248 (void) mdstealerror(ep, &xep); 3249 else 3250 mdclrerror(&xep); 3251 rval = -1; 3252 mde_perror(ep, dgettext(TEXT_DOMAIN, 3253 "Unable to reinit rpc.mdcommd.")); 3254 } 3255 nd = nd->nd_next; 3256 } 3257 } 3258 3259 out2: 3260 /* 3261 * Unlock diskset by resuming messages across the diskset. 3262 * Just resume all classes so that resume is the same whether 3263 * just one class was locked or all classes were locked. 3264 */ 3265 if ((suspend1_flag) || (suspendall_flag)) { 3266 nd = sd->sd_nodelist; 3267 while (nd) { 3268 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 3269 nd = nd->nd_next; 3270 continue; 3271 } 3272 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME, 3273 sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) { 3274 /* 3275 * We are here because we failed to resume 3276 * rpc.mdcommd. However we potentially have 3277 * an error from the previous call 3278 * If the previous call did fail, we capture 3279 * that error and generate a perror with 3280 * the string, "Unable to resume...". 3281 * Setting rval to -1 ensures that in the 3282 * next iteration of the loop, ep is not 3283 * clobbered. 
3284 */ 3285 if (rval == 0) 3286 (void) mdstealerror(ep, &xep); 3287 else 3288 mdclrerror(&xep); 3289 rval = -1; 3290 mde_perror(ep, dgettext(TEXT_DOMAIN, 3291 "Unable to resume rpc.mdcommd.")); 3292 } 3293 nd = nd->nd_next; 3294 } 3295 meta_ping_mnset(sp->setno); 3296 } 3297 3298 /* 3299 * Unlock set. This flushes the caches on the servers. 3300 */ 3301 cl_sk = cl_get_setkey(sp->setno, sp->setname); 3302 nd = sd->sd_nodelist; 3303 while (nd) { 3304 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 3305 nd = nd->nd_next; 3306 continue; 3307 } 3308 if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) { 3309 if (rval == 0) 3310 (void) mdstealerror(ep, &xep); 3311 else 3312 mdclrerror(&xep); 3313 rval = -1; 3314 } 3315 nd = nd->nd_next; 3316 } 3317 3318 /* 3319 * call metaflushsetnames to reset local cache for master and 3320 * node information. 3321 */ 3322 metaflushsetname(sp); 3323 3324 /* release signals back to what they were on entry */ 3325 if (procsigs(FALSE, &oldsigs, &xep) < 0) 3326 mdclrerror(&xep); 3327 3328 return (rval); 3329 3330 } 3331 3332 /* 3333 * Update nodelist with cluster member information. 3334 * A node not in the member list will be marked 3335 * as not ALIVE and not OWN. 3336 * A node in the member list will be marked ALIVE, but 3337 * the OWN bit will not be changed. 3338 * 3339 * If mynode isn't in the membership list, fail causing 3340 * another reconfig cycle to be started since a non-member 3341 * node shouldn't be taking part in the reconfig cycle. 3342 * 3343 * Return values: 3344 * 0 - No problem. 3345 * 1 - Any failure including RPC failure to my node. 3346 */ 3347 int 3348 meta_reconfig_update_nodelist( 3349 mdsetname_t *sp, 3350 mndiskset_membershiplist_t *nl, 3351 md_set_desc *sd, 3352 md_error_t *ep 3353 ) 3354 { 3355 mndiskset_membershiplist_t *nl2; 3356 md_mnnode_desc *nd; 3357 md_error_t xep = mdnullerror; 3358 int rval = 0; 3359 3360 /* 3361 * Walk through nodelist, checking to see if each 3362 * node is in the member list. 
3363 * If node is not a member, reset ALIVE and OWN node flag. 3364 * If node is a member, set ALIVE. 3365 * If mynode's OWN flag gets reset, then halt the diskset on this node. 3366 */ 3367 nd = sd->sd_nodelist; 3368 while (nd) { 3369 nl2 = nl; 3370 while (nl2) { 3371 /* If node is in member list, set ALIVE */ 3372 if (nl2->msl_node_id == nd->nd_nodeid) { 3373 nd->nd_flags |= MD_MN_NODE_ALIVE; 3374 break; 3375 } else { 3376 nl2 = nl2->next; 3377 } 3378 /* node is not in member list, mark !ALIVE and !OWN */ 3379 if (nl2 == NULL) { 3380 /* If node is mynode, then halt set if needed */ 3381 if (strcmp(mynode(), nd->nd_nodename) == 0) { 3382 /* 3383 * This shouldn't happen, but just 3384 * in case... Any node not in the 3385 * membership list should be dead and 3386 * not running reconfig step1. 3387 */ 3388 if (nd->nd_flags & MD_MN_NODE_OWN) { 3389 if (halt_set(sp, &xep)) { 3390 mde_perror(&xep, ""); 3391 mdclrerror(&xep); 3392 } 3393 } 3394 /* 3395 * Return failure since this node 3396 * (mynode) is not in the membership 3397 * list, but process the rest of the 3398 * nodelist first so that rpc.metad 3399 * can be updated with the latest 3400 * membership information. 3401 */ 3402 (void) mddserror(ep, 3403 MDE_DS_NOTINMEMBERLIST, 3404 sp->setno, nd->nd_nodename, NULL, 3405 sp->setname); 3406 rval = 1; 3407 } 3408 nd->nd_flags &= ~MD_MN_NODE_ALIVE; 3409 nd->nd_flags &= ~MD_MN_NODE_OWN; 3410 } 3411 } 3412 nd = nd->nd_next; 3413 } 3414 3415 /* Send this information to rpc.metad */ 3416 if (clnt_upd_nr_flags(mynode(), sp, sd->sd_nodelist, 3417 MD_NR_SET, MNSET_IN_RECONFIG, &xep)) { 3418 /* Return failure if can't send node flags to rpc.metad */ 3419 if (rval == 0) { 3420 (void) mdstealerror(ep, &xep); 3421 rval = 1; 3422 } 3423 } 3424 return (rval); 3425 } 3426 3427 /* 3428 * Choose master determines the master for a diskset. 
3429 * Each node determines the master on its own and 3430 * adds this information to its local rpc.metad nodelist 3431 * and also sends it to the kernel. 3432 * 3433 * Nodelist in set descriptor (sd) is sorted in 3434 * monotonically increasing sequence of nodeid. 3435 * 3436 * Return values: 3437 * 0 - No problem. 3438 * 205 - There was an RPC problem to another node. 3439 * -1 - There was an error. This could be an RPC error to my node. 3440 * This is a catastrophic failure causing node to panic. 3441 */ 3442 int 3443 meta_reconfig_choose_master_for_set( 3444 mdsetname_t *sp, 3445 md_set_desc *sd, 3446 md_error_t *ep 3447 ) 3448 { 3449 int is_owner; 3450 md_mnset_record *mnsr = NULL; 3451 int lowest_alive_nodeid = 0; 3452 uint_t master_nodeid; 3453 md_mnnode_desc *nd, *nd2; 3454 md_mnnode_record *nr; 3455 md_drive_desc *dd; 3456 md_setkey_t *cl_sk; 3457 int rval = 0; 3458 md_error_t xep = mdnullerror; 3459 mddb_setflags_config_t sf; 3460 3461 /* 3462 * Is current node joined to diskset? 3463 * Don't trust flags, really check to see if mddb is snarfed. 3464 */ 3465 if (s_ownset(sp->setno, ep) == MD_SETOWNER_YES) { 3466 /* 3467 * If a node is joined to the diskset, this node checks 3468 * to see if the current master of the diskset is valid and 3469 * is still in the membership list (ALIVE) and is 3470 * still joined (OWN). Need to verify if master is 3471 * really joined - don't trust the flags. (Can trust 3472 * ALIVE since set during earlier part of reconfig cycle.) 3473 * If the current master is valid, still in the membership 3474 * list and joined, then master is not changed on this node. 3475 * Just return. 3476 * 3477 * Verify that nodeid is valid before accessing masternode. 
3478 */ 3479 if ((sd->sd_mn_master_nodeid != MD_MN_INVALID_NID) && 3480 (sd->sd_mn_masternode->nd_flags & MD_MN_NODE_ALIVE)) { 3481 if (clnt_ownset(sd->sd_mn_master_nodenm, sp, 3482 &is_owner, ep) == -1) { 3483 /* If RPC failure to another node return 205 */ 3484 if ((mdanyrpcerror(ep)) && 3485 (sd->sd_mn_mynode->nd_nodeid != 3486 sd->sd_mn_master_nodeid)) { 3487 return (205); 3488 } else { 3489 /* Any other failure */ 3490 return (-1); 3491 } 3492 } else { 3493 if (is_owner == TRUE) { 3494 3495 meta_mc_log(MC_LOG5, dgettext( 3496 TEXT_DOMAIN, "Set %s previous " 3497 "master chosen %s (%d): %s"), 3498 sp->setname, 3499 sd->sd_mn_master_nodenm, 3500 sd->sd_mn_master_nodeid, 3501 meta_print_hrtime(gethrtime() - 3502 start_time)); 3503 3504 /* Previous master is ok - done */ 3505 return (0); 3506 } 3507 } 3508 } 3509 3510 /* 3511 * If current master is no longer in the membership list or 3512 * is no longer joined, then this node uses the following 3513 * algorithm: 3514 * - node calls RPC routine clnt_ownset to get latest 3515 * information on which nodes are owners of diskset. 3516 * clnt_ownset checks on each node to see if its kernel 3517 * has that diskset snarfed. 3518 */ 3519 nd = sd->sd_nodelist; 3520 while (nd) { 3521 /* Don't consider node that isn't in member list */ 3522 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 3523 nd = nd->nd_next; 3524 continue; 3525 } 3526 3527 if (clnt_ownset(nd->nd_nodename, sp, 3528 &is_owner, ep) == -1) { 3529 /* If RPC failure to another node return 205 */ 3530 if ((mdanyrpcerror(ep)) && 3531 (sd->sd_mn_mynode->nd_nodeid != 3532 nd->nd_nodeid)) { 3533 return (205); 3534 } else { 3535 /* Any other failure */ 3536 return (-1); 3537 } 3538 } 3539 3540 /* 3541 * Set owner flag for each node based on whether 3542 * that node really has a diskset mddb snarfed in 3543 * or not. 
3544 */ 3545 if (is_owner == TRUE) 3546 nd->nd_flags |= MD_MN_NODE_OWN; 3547 else 3548 nd->nd_flags &= ~MD_MN_NODE_OWN; 3549 3550 nd = nd->nd_next; 3551 } 3552 3553 /* 3554 * - node walks through nodelist looking for nodes that are 3555 * owners of the diskset that are in the membership list. 3556 * - for each owner, node calls RPC routine clnt_getset to 3557 * see if that node has its node record set to OK. 3558 * - If so, master is chosen to be this owner node. 3559 */ 3560 nd = sd->sd_nodelist; 3561 while (nd) { 3562 /* Don't consider node that isn't in member list */ 3563 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 3564 nd = nd->nd_next; 3565 continue; 3566 } 3567 3568 /* Don't consider a node that isn't an owner */ 3569 if (!(nd->nd_flags & MD_MN_NODE_OWN)) { 3570 nd = nd->nd_next; 3571 continue; 3572 } 3573 3574 /* Does node has its own node record set to OK? */ 3575 if (clnt_mngetset(nd->nd_nodename, sp->setname, 3576 MD_SET_BAD, &mnsr, ep) == -1) { 3577 /* If RPC failure to another node return 205 */ 3578 if ((mdanyrpcerror(ep)) && 3579 (sd->sd_mn_mynode->nd_nodeid != 3580 nd->nd_nodeid)) { 3581 return (205); 3582 } else { 3583 /* Any other failure */ 3584 return (-1); 3585 } 3586 } 3587 nr = mnsr->sr_nodechain; 3588 while (nr) { 3589 if (nd->nd_nodeid == nr->nr_nodeid) { 3590 if (nr->nr_flags & MD_MN_NODE_OK) { 3591 /* Found a master */ 3592 free_sr( 3593 (md_set_record *)mnsr); 3594 goto found_master; 3595 } 3596 } 3597 nr = nr->nr_next; 3598 } 3599 free_sr((md_set_record *)mnsr); 3600 nd = nd->nd_next; 3601 } 3602 3603 /* 3604 * - If no owner node has its own node record on its own node 3605 * set to OK, then this node checks all of the non-owner 3606 * nodes that are in the membership list. 3607 * - for each non-owner, node calls RPC routine clnt_getset to 3608 * see if that node has its node record set to OK. 3609 * - If set doesn't exist, don't choose node for master. 3610 * - If so, master is chosen to be this non-owner node. 
3611 * 3612 */ 3613 nd = sd->sd_nodelist; 3614 while (nd) { 3615 /* Don't consider node that isn't in member list */ 3616 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 3617 nd = nd->nd_next; 3618 continue; 3619 } 3620 3621 /* Only checking non-owner nodes this time around */ 3622 if (nd->nd_flags & MD_MN_NODE_OWN) { 3623 nd = nd->nd_next; 3624 continue; 3625 } 3626 3627 /* Does node has its own node record set to OK? */ 3628 if (clnt_mngetset(nd->nd_nodename, sp->setname, 3629 MD_SET_BAD, &mnsr, ep) == -1) { 3630 /* 3631 * If set doesn't exist on non-owner node, 3632 * don't consider this node for master. 3633 */ 3634 if (mdiserror(ep, MDE_NO_SET)) { 3635 nd = nd->nd_next; 3636 continue; 3637 } else if ((mdanyrpcerror(ep)) && 3638 (sd->sd_mn_mynode->nd_nodeid != 3639 nd->nd_nodeid)) { 3640 /* RPC failure to another node */ 3641 return (205); 3642 } else { 3643 /* Any other failure */ 3644 return (-1); 3645 } 3646 } 3647 nr = mnsr->sr_nodechain; 3648 while (nr) { 3649 if (nd->nd_nodeid == nr->nr_nodeid) { 3650 if (nr->nr_flags & MD_MN_NODE_OK) { 3651 /* Found a master */ 3652 free_sr( 3653 (md_set_record *)mnsr); 3654 goto found_master; 3655 } 3656 } 3657 nr = nr->nr_next; 3658 } 3659 free_sr((md_set_record *)mnsr); 3660 nd = nd->nd_next; 3661 } 3662 3663 /* 3664 * - If no node can be found that has its own node record on 3665 * its node to be set to OK, then all alive nodes 3666 * were in the process of being added to or deleted 3667 * from set. Each alive node will remove all 3668 * information pertaining to this set from its node. 3669 * 3670 * If all nodes in set are ALIVE, then call sdssc end routines 3671 * since set was truly being initially created or destroyed. 3672 */ 3673 goto delete_set; 3674 } else { 3675 3676 /* 3677 * If node is not joined to diskset, then this 3678 * node uses the following algorithm: 3679 * - If unjoined node doesn't have a node record for itself, 3680 * just delete the diskset since diskset was in the 3681 * process of being created. 
3682 * - node needs to find master of diskset before 3683 * reconfig cycle, if a master existed. 3684 * - node calls RPC routine clnt_ownset to get latest 3685 * information on which nodes are owners of diskset. 3686 * clnt_ownset checks on each node to see if its 3687 * kernel has that diskset snarfed. 3688 */ 3689 3690 /* 3691 * Is my node in the set description? 3692 * If not, delete the set from this node. 3693 * sr2setdesc sets sd_mn_mynode pointer to the node 3694 * descriptor for this node if there was a node 3695 * record for this node. 3696 * 3697 */ 3698 if (sd->sd_mn_mynode == NULL) { 3699 goto delete_set; 3700 } 3701 3702 nd = sd->sd_nodelist; 3703 while (nd) { 3704 /* Don't consider node that isn't in member list */ 3705 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 3706 nd = nd->nd_next; 3707 continue; 3708 } 3709 3710 if (clnt_ownset(nd->nd_nodename, sp, 3711 &is_owner, ep) == -1) { 3712 /* If RPC failure to another node return 205 */ 3713 if ((mdanyrpcerror(ep)) && 3714 (sd->sd_mn_mynode->nd_nodeid != 3715 nd->nd_nodeid)) { 3716 return (205); 3717 } else { 3718 /* Any other failure */ 3719 return (-1); 3720 } 3721 } 3722 3723 /* 3724 * Set owner flag for each node based on whether 3725 * that node really has a diskset mddb snarfed in 3726 * or not. 3727 */ 3728 if (is_owner == TRUE) 3729 nd->nd_flags |= MD_MN_NODE_OWN; 3730 else 3731 nd->nd_flags &= ~MD_MN_NODE_OWN; 3732 3733 nd = nd->nd_next; 3734 } 3735 3736 /* 3737 * - node walks through nodelist looking for nodes that 3738 * are owners of the diskset that are in 3739 * the membership list. 3740 * - for each owner, node calls RPC routine clnt_getset to 3741 * see if that node has a master set and to get the 3742 * diskset description. 3743 * - If the owner node has a set description that doesn't 3744 * include the non-joined node in the nodelist, this node 3745 * removes its set description of that diskset 3746 * (i.e. removes the set from its local mddbs). 
This is 3747 * handling the case of when a node was removed from a 3748 * diskset while it was not in the cluster membership 3749 * list. 3750 * - If that node has a master set and the master is in the 3751 * membership list and is an owner, then either this was 3752 * the master from before the reconfig cycle or this 3753 * node has already chosen a new master - either way, 3754 * the master value is valid as long as it is in the 3755 * membership list and is an owner 3756 * - master is chosen to be owner node's master 3757 */ 3758 nd = sd->sd_nodelist; 3759 while (nd) { 3760 /* Don't consider node that isn't in member list */ 3761 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 3762 nd = nd->nd_next; 3763 continue; 3764 } 3765 3766 /* Don't consider a node that isn't an owner */ 3767 if (!(nd->nd_flags & MD_MN_NODE_OWN)) { 3768 nd = nd->nd_next; 3769 continue; 3770 } 3771 3772 /* Get owner node's set record */ 3773 if (clnt_mngetset(nd->nd_nodename, sp->setname, 3774 MD_SET_BAD, &mnsr, ep) == -1) { 3775 /* If RPC failure to another node return 205 */ 3776 if ((mdanyrpcerror(ep)) && 3777 (sd->sd_mn_mynode->nd_nodeid != 3778 nd->nd_nodeid)) { 3779 return (205); 3780 } else { 3781 /* Any other failure */ 3782 return (-1); 3783 } 3784 } 3785 3786 /* Is this node in the owner node's set record */ 3787 nr = mnsr->sr_nodechain; 3788 while (nr) { 3789 if (sd->sd_mn_mynode->nd_nodeid == 3790 nr->nr_nodeid) { 3791 break; 3792 } 3793 nr = nr->nr_next; 3794 } 3795 if (nr == NULL) { 3796 /* my node not found - delete set */ 3797 free_sr((md_set_record *)mnsr); 3798 goto delete_set; 3799 } 3800 3801 /* Is owner's node's master valid? 
*/ 3802 master_nodeid = mnsr->sr_master_nodeid; 3803 free_sr((md_set_record *)mnsr); 3804 if (master_nodeid == MD_MN_INVALID_NID) { 3805 nd = nd->nd_next; 3806 continue; 3807 } 3808 3809 nd2 = sd->sd_nodelist; 3810 while (nd2) { 3811 if ((nd2->nd_nodeid == master_nodeid) && 3812 (nd2->nd_flags & MD_MN_NODE_ALIVE) && 3813 (nd2->nd_flags & MD_MN_NODE_OWN)) { 3814 nd = nd2; 3815 goto found_master; 3816 } 3817 nd2 = nd2->nd_next; 3818 } 3819 nd = nd->nd_next; 3820 } 3821 3822 /* 3823 * - If no owner node has a valid master, then follow 3824 * algorithm of when a node is joined to the diskset. 3825 * - node walks through nodelist looking for nodes that are 3826 * owners of the diskset that are in the membership list. 3827 * - for each owner, node calls RPC routine clnt_getset to 3828 * see if that node has its node record set to OK. 3829 * - If so, master is chosen to be this owner node. 3830 */ 3831 nd = sd->sd_nodelist; 3832 while (nd) { 3833 /* Don't consider node that isn't in member list */ 3834 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 3835 nd = nd->nd_next; 3836 continue; 3837 } 3838 3839 /* Don't consider a node that isn't an owner */ 3840 if (!(nd->nd_flags & MD_MN_NODE_OWN)) { 3841 nd = nd->nd_next; 3842 continue; 3843 } 3844 3845 /* Does node has its own node record set to OK? 
*/ 3846 if (clnt_mngetset(nd->nd_nodename, sp->setname, 3847 MD_SET_BAD, &mnsr, ep) == -1) { 3848 /* If RPC failure to another node return 205 */ 3849 if ((mdanyrpcerror(ep)) && 3850 (sd->sd_mn_mynode->nd_nodeid != 3851 nd->nd_nodeid)) { 3852 return (205); 3853 } else { 3854 /* Any other failure */ 3855 return (-1); 3856 } 3857 } 3858 nr = mnsr->sr_nodechain; 3859 while (nr) { 3860 if (nd->nd_nodeid == nr->nr_nodeid) { 3861 if (nr->nr_flags & MD_MN_NODE_OK) { 3862 /* Found a master */ 3863 free_sr( 3864 (md_set_record *)mnsr); 3865 goto found_master; 3866 } 3867 } 3868 nr = nr->nr_next; 3869 } 3870 free_sr((md_set_record *)mnsr); 3871 nd = nd->nd_next; 3872 } 3873 3874 /* 3875 * - If no owner node has its own node record on its own node 3876 * set to OK, then this node checks all of the non-owner 3877 * nodes that are in the membership list. 3878 * - for each non-owner, node calls RPC routine clnt_getset to 3879 * see if that node has its node record set to OK. 3880 * - If set doesn't exist, don't choose node for master. 3881 * - If this node doesn't exist in the nodelist on any of the 3882 * non-owner nodes, this node removes its set description 3883 * of that diskset (i.e. removes the set from its local 3884 * mddbs). This is handling the case of when a node was 3885 * removed from a diskset while it was not in the 3886 * cluster membership list. 3887 * - If non-owner node has its node record set to OK and if 3888 * this node hasn't removed this diskset (step directly 3889 * before this one), then the master is chosen to be this 3890 * non-owner node. 
3891 */ 3892 nd = sd->sd_nodelist; 3893 while (nd) { 3894 /* Don't consider node that isn't in member list */ 3895 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 3896 nd->nd_flags |= MD_MN_NODE_DEL; 3897 nd = nd->nd_next; 3898 continue; 3899 } 3900 3901 /* Don't consider owner nodes since none are OK */ 3902 if (nd->nd_flags & MD_MN_NODE_OWN) { 3903 nd->nd_flags |= MD_MN_NODE_DEL; 3904 nd = nd->nd_next; 3905 continue; 3906 } 3907 3908 /* 3909 * Don't need to get nodelist from my node since 3910 * this is where sd_nodelist was obtained. 3911 */ 3912 if (sd->sd_mn_mynode->nd_nodeid == nd->nd_nodeid) { 3913 nd = nd->nd_next; 3914 continue; 3915 } 3916 3917 /* 3918 * If node has already been decided against for 3919 * master, then skip it. 3920 */ 3921 if (nd->nd_flags & MD_MN_NODE_DEL) { 3922 nd = nd->nd_next; 3923 continue; 3924 } 3925 3926 /* 3927 * Does node in my nodelist have its own node 3928 * record marked OK on its node? And does node 3929 * in my nodelist exist on all other nodes? 3930 * Don't want to choose a node for master unless 3931 * that node is marked OK on its own node and that 3932 * node exists on all other alive nodes. 3933 * 3934 * This is guarding against the case when several 3935 * nodes are down and one of the downed nodes is 3936 * deleted from the diskset. When the down nodes 3937 * are rebooted into the cluster, you don't want 3938 * any node to pick the deleted node as the master. 3939 */ 3940 if (clnt_mngetset(nd->nd_nodename, sp->setname, 3941 MD_SET_BAD, &mnsr, ep) == -1) { 3942 /* 3943 * If set doesn't exist on non-owner node, 3944 * don't consider this node for master. 3945 */ 3946 if (mdiserror(ep, MDE_NO_SET)) { 3947 nd->nd_flags |= MD_MN_NODE_DEL; 3948 nd = nd->nd_next; 3949 continue; 3950 } else if (mdanyrpcerror(ep)) { 3951 /* RPC failure to another node */ 3952 return (205); 3953 } else { 3954 /* Any other failure */ 3955 return (-1); 3956 } 3957 } 3958 /* 3959 * Is my node in the nodelist gotten from the other 3960 * node? 
If not, then remove the set from my node 3961 * since set was deleted from my node while my node 3962 * was out of the cluster. 3963 */ 3964 nr = mnsr->sr_nodechain; 3965 while (nr) { 3966 if (sd->sd_mn_mynode->nd_nodeid == 3967 nr->nr_nodeid) { 3968 break; 3969 } 3970 nr = nr->nr_next; 3971 } 3972 if (nr == NULL) { 3973 /* my node not found - delete set */ 3974 free_sr((md_set_record *)mnsr); 3975 goto delete_set; 3976 } 3977 3978 /* Is node being checked marked OK on its own node? */ 3979 nr = mnsr->sr_nodechain; 3980 while (nr) { 3981 if (nd->nd_nodeid == nr->nr_nodeid) { 3982 if (!(nr->nr_flags & MD_MN_NODE_OK)) { 3983 nd->nd_flags |= MD_MN_NODE_DEL; 3984 } 3985 break; 3986 } 3987 nr = nr->nr_next; 3988 } 3989 /* 3990 * If node being checked doesn't exist on its 3991 * own node - don't choose it as master. 3992 */ 3993 if (nr == NULL) { 3994 nd->nd_flags |= MD_MN_NODE_DEL; 3995 } 3996 3997 /* 3998 * Check every node in my node's nodelist against 3999 * the nodelist gotten from the other node. 4000 * If a node in my node's nodelist is not found in the 4001 * other node's nodelist, then set the DEL flag. 4002 */ 4003 nd2 = sd->sd_nodelist; 4004 while (nd2) { 4005 nr = mnsr->sr_nodechain; 4006 while (nr) { 4007 if (nd2->nd_nodeid == nr->nr_nodeid) { 4008 break; 4009 } 4010 nr = nr->nr_next; 4011 } 4012 /* nd2 not found in other node's nodelist */ 4013 if (nr == NULL) { 4014 nd2->nd_flags |= MD_MN_NODE_DEL; 4015 } 4016 nd2 = nd2->nd_next; 4017 } 4018 4019 free_sr((md_set_record *)mnsr); 4020 nd = nd->nd_next; 4021 } 4022 4023 /* 4024 * Rescan list look for node that has not been marked DEL. 4025 * First node found is the master. 
4026 */ 4027 nd = sd->sd_nodelist; 4028 while (nd) { 4029 if (!(nd->nd_flags & MD_MN_NODE_DEL)) { 4030 break; 4031 } 4032 nd = nd->nd_next; 4033 continue; 4034 } 4035 if (nd) { 4036 /* Found a master */ 4037 goto found_master; 4038 } 4039 4040 /* 4041 * - If no node can be found that has its own node record on 4042 * its node to be set to OK, then all alive nodes 4043 * were in the process of being added to or deleted 4044 * from set. Each alive node will remove all 4045 * information pertaining to this set from its node. 4046 * 4047 * If all nodes in set are ALIVE, then call sdssc end routines 4048 * since set was truly being initially created or destroyed. 4049 */ 4050 goto delete_set; 4051 } 4052 4053 found_master: 4054 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 4055 "Set %s master chosen %s (%d): %s"), 4056 sp->setname, nd->nd_nodename, nd->nd_nodeid, 4057 meta_print_hrtime(gethrtime() - start_time)); 4058 4059 if (clnt_lock_set(mynode(), sp, ep) == -1) { 4060 return (-1); 4061 } 4062 4063 cl_sk = cl_get_setkey(sp->setno, sp->setname); 4064 4065 if (clnt_mnsetmaster(mynode(), sp, 4066 nd->nd_nodename, nd->nd_nodeid, ep)) { 4067 rval = -1; 4068 } else if (sd->sd_mn_mynode->nd_nodeid == nd->nd_nodeid) { 4069 /* If this node is new master, set flag in this node's kernel */ 4070 (void) memset(&sf, 0, sizeof (sf)); 4071 sf.sf_setno = sp->setno; 4072 sf.sf_setflags = MD_SET_MN_NEWMAS_RC; 4073 /* Use magic to help protect ioctl against attack. */ 4074 sf.sf_magic = MDDB_SETFLAGS_MAGIC; 4075 sf.sf_flags = MDDB_NM_SET; 4076 4077 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 4078 "Setting new master flag for set %s: %s"), 4079 sp->setname, meta_print_hrtime(gethrtime() - start_time)); 4080 4081 /* 4082 * Fail reconfig cycle if ioctl fails since it is critical 4083 * to set new master flag. 
4084 */ 4085 if (metaioctl(MD_MN_SET_SETFLAGS, &sf, &sf.sf_mde, 4086 NULL) != NULL) { 4087 (void) mdstealerror(ep, &sf.sf_mde); 4088 rval = -1; 4089 } 4090 } 4091 4092 if (clnt_unlock_set(mynode(), cl_sk, &xep) == -1) { 4093 if (rval == 0) { 4094 (void) mdstealerror(ep, &xep); 4095 rval = -1; 4096 } 4097 } 4098 4099 cl_set_setkey(NULL); 4100 4101 metaflushsetname(sp); 4102 4103 return (rval); 4104 4105 delete_set: 4106 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 4107 "Master not chosen, deleting set %s: %s"), 4108 sp->setname, meta_print_hrtime(gethrtime() - start_time)); 4109 4110 /* 4111 * Remove all set information from this node: 4112 * - node records for this set 4113 * - drive records for this set 4114 * - set record for this set 4115 * (Only do this on this node since each node 4116 * will do it for its own local mddb.) 4117 * 4118 * If all nodes in set are ALIVE, then 4119 * the lowest numbered ALIVE nodeid in set 4120 * (irregardless of whether an owner node or not) will 4121 * call the DCS service to cleanup for create/delete of set. 4122 * sdssc_create_end(cleanup) if set was being created or 4123 * sdssc_delete_end(cleanup) if set was being deleted. 4124 * A node record with flag ADD denotes a set being 4125 * created. A node record with flag DEL denotes a 4126 * set being deleted. 4127 */ 4128 nd = sd->sd_nodelist; 4129 while (nd) { 4130 /* Found a node that isn't alive */ 4131 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) 4132 break; 4133 4134 /* Is my node the lowest numbered ALIVE node? */ 4135 if (nd->nd_nodeid < sd->sd_mn_mynode->nd_nodeid) { 4136 break; 4137 } 4138 nd = nd->nd_next; 4139 } 4140 if (nd == NULL) { 4141 /* All nodes ALIVE and this is the lowest nodeid */ 4142 lowest_alive_nodeid = 1; 4143 } 4144 4145 if (clnt_lock_set(mynode(), sp, ep) == -1) { 4146 return (-1); 4147 } 4148 4149 4150 /* 4151 * If this node had been joined, withdraw and reset master. 
4152 * 4153 * This could happen if a node was being added to or removed 4154 * from a diskset and the node doing the add/delete operation and 4155 * all other nodes in the diskset have left the cluster. 4156 */ 4157 if (sd->sd_mn_mynode) { 4158 nd = sd->sd_mn_mynode; 4159 if (nd->nd_flags & MD_MN_NODE_OWN) { 4160 if (clnt_withdrawset(mynode(), sp, ep)) { 4161 rval = -1; 4162 goto out; 4163 } 4164 if (clnt_mnsetmaster(mynode(), sp, "", 4165 MD_MN_INVALID_NID, ep)) { 4166 rval = -1; 4167 goto out; 4168 } 4169 } 4170 } 4171 4172 /* 4173 * Remove side records for this node (side) from local mddb 4174 * (clnt_deldrvs does this) if there are drives in the set. 4175 * 4176 * Don't need to mark this node as DEL since already marked as 4177 * ADD or DEL (or this node would have been chosen as master). 4178 * Don't need to mark other node records, drive records or 4179 * set records as DEL. If a panic occurs during clnt_delset, 4180 * these records will be deleted the next time this node 4181 * becomes a member and goes through the reconfig cycle. 4182 */ 4183 /* Get the drive descriptors for this set */ 4184 if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST), 4185 ep)) == NULL) { 4186 if (! mdisok(ep)) { 4187 /* 4188 * Ignore and clear out any failures from 4189 * metaget_drivedesc since a panic could have 4190 * occurred when a node was partially added to a set. 4191 */ 4192 mdclrerror(ep); 4193 } 4194 } else { 4195 if (clnt_deldrvs(mynode(), sp, dd, ep)) { 4196 rval = -1; 4197 goto out; 4198 } 4199 } 4200 4201 /* 4202 * Now, delete the set - this removes the node, drive 4203 * and set records from the local mddb. 4204 */ 4205 if (clnt_delset(mynode(), sp, ep)) { 4206 rval = -1; 4207 goto out; 4208 } 4209 4210 out: 4211 cl_sk = cl_get_setkey(sp->setno, sp->setname); 4212 4213 /* 4214 * Ignore errors from unlock of set since set is no longer 4215 * known (if clnt_delset worked). 
4216 */ 4217 if (clnt_unlock_set(mynode(), cl_sk, &xep) == -1) { 4218 mdclrerror(&xep); 4219 } 4220 4221 cl_set_setkey(NULL); 4222 4223 metaflushsetname(sp); 4224 4225 /* 4226 * If this node is the lowest numbered nodeid then 4227 * call sdssc_create/delete_end depending on whether 4228 * this node is marked as ADD or DEL in the node record. 4229 */ 4230 if (lowest_alive_nodeid) { 4231 if (nd->nd_flags & MD_MN_NODE_ADD) 4232 sdssc_create_end(sp->setname, SDSSC_CLEANUP); 4233 else if (nd->nd_flags & MD_MN_NODE_DEL) 4234 sdssc_delete_end(sp->setname, SDSSC_CLEANUP); 4235 } 4236 4237 /* Finished with this set -- return */ 4238 return (rval); 4239 } 4240 4241 /* 4242 * Reconfig step to choose a new master for all MN disksets. 4243 * Return values: 4244 * 0 - Everything is great. 4245 * 1 - This node failed to reconfig. 4246 * 205 - Cause another reconfig due to a nodelist problem 4247 * or RPC failure to another node 4248 */ 4249 int 4250 meta_reconfig_choose_master( 4251 long timeout, 4252 md_error_t *ep 4253 ) 4254 { 4255 set_t max_sets, setno; 4256 int nodecnt; 4257 mndiskset_membershiplist_t *nl; 4258 md_set_desc *sd; 4259 mdsetname_t *sp; 4260 int rval = 0; 4261 mddb_setflags_config_t sf; 4262 int start_node_delayed = 0; 4263 4264 if ((max_sets = get_max_sets(ep)) == 0) { 4265 mde_perror(ep, dgettext(TEXT_DOMAIN, 4266 "Unable to get number of sets")); 4267 return (1); 4268 } 4269 4270 /* 4271 * Get membershiplist from API routine. If there's 4272 * an error, return a 205 to cause another reconfig. 4273 */ 4274 if (meta_read_nodelist(&nodecnt, &nl, ep) == -1) { 4275 mde_perror(ep, ""); 4276 return (205); 4277 } 4278 4279 for (setno = 1; setno < max_sets; setno++) { 4280 if ((sp = metasetnosetname(setno, ep)) == NULL) { 4281 if (mdiserror(ep, MDE_NO_SET)) { 4282 /* No set for this setno - continue */ 4283 mdclrerror(ep); 4284 continue; 4285 } else { 4286 /* 4287 * If encountered an RPC error from my node, 4288 * then immediately fail. 
4289 */ 4290 if (mdanyrpcerror(ep)) { 4291 mde_perror(ep, ""); 4292 return (1); 4293 } 4294 /* Can't get set information */ 4295 mde_perror(ep, dgettext(TEXT_DOMAIN, 4296 "Unable to get information for " 4297 "set number %d"), setno); 4298 mdclrerror(ep); 4299 continue; 4300 } 4301 } 4302 4303 /* If setname is there, set desc should exist. */ 4304 if ((sd = metaget_setdesc(sp, ep)) == NULL) { 4305 /* 4306 * If encountered an RPC error from my node, 4307 * then immediately fail. 4308 */ 4309 if (mdanyrpcerror(ep)) { 4310 mde_perror(ep, ""); 4311 return (1); 4312 } 4313 mde_perror(ep, dgettext(TEXT_DOMAIN, 4314 "Unable to get set %s desc information"), 4315 sp->setname); 4316 mdclrerror(ep); 4317 continue; 4318 } 4319 4320 /* Only reconfig MN disksets */ 4321 if (!MD_MNSET_DESC(sd)) { 4322 continue; 4323 } 4324 4325 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 4326 "Begin choose master for set %s: %s"), 4327 sp->setname, meta_print_hrtime(gethrtime() - start_time)); 4328 4329 /* Update nodelist with member information. */ 4330 if (meta_reconfig_update_nodelist(sp, nl, sd, ep)) { 4331 /* 4332 * If encountered an RPC error from my node, 4333 * then immediately fail. 4334 */ 4335 if (mdanyrpcerror(ep)) { 4336 mde_perror(ep, ""); 4337 return (1); 4338 } 4339 mde_perror(ep, ""); 4340 mdclrerror(ep); 4341 continue; 4342 } 4343 4344 /* 4345 * If all nodes in a cluster are starting, then 4346 * all nodes will attempt to contact all other nodes 4347 * to determine a master node. This can lead to a 4348 * problem where node 1 is trying to contact the rpc.metad 4349 * node 2 and node 2 is trying to contact the rpc.metad 4350 * on node 1 -- and this causes the rpc call to fail 4351 * on both nodes and causes a new reconfig cycle. 4352 * 4353 * In order to break this problem, a newly starting node 4354 * will delay a small amount of time (nodeid mod 4 seconds) 4355 * and will then run the code to choose a master for the 4356 * first set. 
Delay will only be done once regardless of the 4357 * number of sets. 4358 */ 4359 if (start_node_delayed == 0) { 4360 (void) memset(&sf, 0, sizeof (sf)); 4361 sf.sf_setno = sp->setno; 4362 sf.sf_flags = MDDB_NM_GET; 4363 /* Use magic to help protect ioctl against attack. */ 4364 sf.sf_magic = MDDB_SETFLAGS_MAGIC; 4365 if ((metaioctl(MD_MN_GET_SETFLAGS, &sf, 4366 &sf.sf_mde, NULL) == 0) && 4367 ((sf.sf_setflags & MD_SET_MN_START_RC) == 4368 MD_SET_MN_START_RC)) { 4369 (void) sleep(sd->sd_mn_mynode->nd_nodeid % 4); 4370 } 4371 start_node_delayed = 1; 4372 } 4373 4374 /* Choose master for this set */ 4375 rval = meta_reconfig_choose_master_for_set(sp, sd, ep); 4376 if (rval == -1) { 4377 mde_perror(ep, ""); 4378 return (1); 4379 } else if (rval == 205) { 4380 mde_perror(ep, ""); 4381 return (205); 4382 } 4383 4384 /* reinit rpc.mdcommd with new nodelist */ 4385 if (mdmn_reinit_set(sp->setno, timeout)) { 4386 md_eprintf(dgettext(TEXT_DOMAIN, 4387 "Could not re-initialise rpc.mdcommd for " 4388 "set %s\n"), sp->setname); 4389 return (1); 4390 } 4391 4392 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 4393 "Choose master for set %s completed: %s"), 4394 sp->setname, meta_print_hrtime(gethrtime() - start_time)); 4395 } 4396 4397 /* 4398 * Each node turns on I/Os for all MN disksets. 4399 * This is to recover from the situation where the master died 4400 * during a previous reconfig cycle when I/Os were suspended 4401 * for a MN diskset. 4402 * If a failure occurs return a 1 which will force this node to 4403 * panic. Cannot leave node in the situation where I/Os are 4404 * not resumed. 4405 */ 4406 setno = 0; /* 0 means all MN sets */ 4407 if (metaioctl(MD_MN_RESUME_SET, &setno, ep, NULL)) { 4408 mde_perror(ep, ""); 4409 return (1); 4410 } 4411 4412 /* Free the nodelist */ 4413 if (nodecnt) 4414 meta_free_nodelist(nl); 4415 4416 return (0); 4417 } 4418 4419 /* 4420 * meta_mnsync_user_records will synchronize the diskset user records across 4421 * all nodes in the diskset. 
The diskset user records are stored in 4422 * each node's local set mddb. 4423 * 4424 * This needs to be done even if there is no master change during the 4425 * reconfig cycle since this routine should clean up any mess left by 4426 * the untimely termination of a metaset or metadb command (due to a 4427 * node panic or to user intervention). 4428 * 4429 * Caller is the Master node. 4430 * 4431 * Returns 0 - Success 4432 * 205 - Failure during RPC to another node 4433 * -1 - Any other failure and ep is filled in. 4434 */ 4435 int 4436 meta_mnsync_user_records( 4437 mdsetname_t *sp, 4438 md_error_t *ep 4439 ) 4440 { 4441 md_set_desc *sd; 4442 md_mnnode_desc *master_nodelist, *nd, *nd2, *ndtail; 4443 md_mnset_record *mnsr; 4444 md_mnsr_node_t *master_mnsr_node = NULL, *mnsr_node = NULL; 4445 md_mnnode_record *nr; 4446 md_drive_record *dr; 4447 int dr_cnt, dd_cnt; 4448 int found_my_nr; 4449 md_drive_desc *dd, *dd_prev, *master_dd, *other_dd; 4450 int all_drives_ok; 4451 int rval = 0; 4452 int max_genid = 0; 4453 int num_alive_nodes, num_alive_nodes_del = 0; 4454 int set_locked = 0; 4455 md_setkey_t *cl_sk; 4456 md_error_t xep = mdnullerror; 4457 char *anode[1]; 4458 mddb_setflags_config_t sf; 4459 4460 /* 4461 * Sync up node records first. 4462 * Construct a master nodelist using the nodelist from this 4463 * node's rpc.metad node records and then setting the state of each 4464 * node following these rules: 4465 * - If a node record is marked OK on its node, mark it OK 4466 * in the master nodelist (and later OK on all nodes) 4467 * If a node record is also marked OWN on its node, 4468 * mark it OWN in the master nodelist. 
4469 * - If a node record is not marked OK on its node, then mark 4470 * it as DEL in the master list (later deleting it) 4471 * - If node record doesn't exist on that node, then mark it DEL 4472 * (later deleting it) 4473 * - If set record doesn't exist on that node, mark node as DEL 4474 * - If a node record doesn't exist on all nodes, then mark it DEL 4475 * - If a node is not ALIVE, then 4476 * - If that node marked DEL on any node - mark it DEL 4477 * in master list but leave in nodelist 4478 * - If that node is marked as ADD on any node, mark it 4479 * ADD in the master list but leave in nodelist 4480 * - When that node returns to the living, the DEL 4481 * node record will be removed and the ADD node 4482 * record may be removed if marked ADD on that 4483 * node. 4484 * The key rule is to not remove a node from the nodelist until 4485 * that node record is removed from its own node. Do not want to 4486 * remove a node's record from all other nodes and then have 4487 * that node have its own record marked OK so that a node will pick 4488 * a different master than the other nodes. 4489 * 4490 * Next, 4491 * If node is ALIVE and node record is marked DEL in master nodelist, 4492 * remove node from set. 4493 * If node is ALIVE and node record is marked OK in master nodelist, 4494 * mark it OK on all other nodes. 4495 * If node is not ALIVE and node record is marked DEL in master 4496 * nodelist, mark it DEL on all other nodes. 4497 * If node is not ALIVE and node record is marked ADD in master, 4498 * nodelist, mark it ADD on all other nodes. 4499 */ 4500 if ((sd = metaget_setdesc(sp, ep)) == NULL) { 4501 return (-1); 4502 } 4503 master_nodelist = sd->sd_nodelist; 4504 4505 /* 4506 * Walk through nodelist creating a master nodelist. 
4507 */ 4508 num_alive_nodes = 0; 4509 nd = master_nodelist; 4510 while (nd) { 4511 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 4512 nd = nd->nd_next; 4513 continue; 4514 } 4515 num_alive_nodes++; 4516 if (clnt_mngetset(nd->nd_nodename, sp->setname, 4517 MD_SET_BAD, &mnsr, ep) == -1) { 4518 if (mdiserror(ep, MDE_NO_SET)) { 4519 /* set doesn't exist, mark node as DEL */ 4520 nd->nd_flags &= ~MD_MN_NODE_OK; 4521 nd->nd_flags &= ~MD_MN_NODE_ADD; 4522 nd->nd_flags |= MD_MN_NODE_DEL; 4523 nd->nd_flags |= MD_MN_NODE_NOSET; 4524 nd = nd->nd_next; 4525 continue; 4526 } else { 4527 /* If RPC failure to another node return 205 */ 4528 if ((mdanyrpcerror(ep)) && 4529 (sd->sd_mn_mynode->nd_nodeid != 4530 nd->nd_nodeid)) { 4531 rval = 205; 4532 } else { 4533 /* Any other failure */ 4534 rval = -1; 4535 } 4536 goto out; 4537 } 4538 } 4539 /* Find biggest genid in records for this diskset */ 4540 if (mnsr->sr_genid > max_genid) 4541 max_genid = mnsr->sr_genid; 4542 4543 dr = mnsr->sr_drivechain; 4544 while (dr) { 4545 /* Find biggest genid in records for this diskset */ 4546 if (dr->dr_genid > max_genid) { 4547 max_genid = dr->dr_genid; 4548 } 4549 dr = dr->dr_next; 4550 } 4551 4552 found_my_nr = 0; 4553 nr = mnsr->sr_nodechain; 4554 /* nr is the list of node recs from nd_nodename node */ 4555 while (nr) { 4556 /* Find biggest genid in records for this diskset */ 4557 if (nr->nr_genid > max_genid) 4558 max_genid = nr->nr_genid; 4559 nd2 = master_nodelist; 4560 ndtail = NULL; 4561 /* For each node record, is it in master list? */ 4562 while (nd2) { 4563 if (nd2->nd_nodeid == nr->nr_nodeid) 4564 break; 4565 if (nd2->nd_next == NULL) 4566 ndtail = nd2; 4567 nd2 = nd2->nd_next; 4568 } 4569 /* 4570 * Found node record not in master list -- add it 4571 * to list marking it as DEL since node record 4572 * should exist on all nodes unless a panic occurred 4573 * during addition or deletion of host to diskset. 
4574 */ 4575 if (nd2 == NULL) { 4576 nd2 = Zalloc(sizeof (*nd2)); 4577 (void) strcpy(nd2->nd_nodename, 4578 nr->nr_nodename); 4579 nd2->nd_flags = nr->nr_flags; 4580 nd2->nd_flags |= MD_MN_NODE_DEL; 4581 nd2->nd_nodeid = nr->nr_nodeid; 4582 nd2->nd_next = NULL; 4583 ndtail->nd_next = nd2; 4584 nd2 = NULL; 4585 nr = nr->nr_next; 4586 continue; 4587 } 4588 /* 4589 * Is this the node record for the node that 4590 * we requested the set desc from? 4591 * If so, check if node has its own node record 4592 * marked OK. If marked OK, check for the OWN bit. 4593 */ 4594 if (nr->nr_nodeid == nd->nd_nodeid) { 4595 found_my_nr = 1; 4596 if (nr->nr_flags & MD_MN_NODE_OK) { 4597 /* 4598 * If node record is marked OK 4599 * on its own node, then mark it OK 4600 * in the master list. Node record 4601 * would have to exist on all nodes 4602 * in the ADD state before it could 4603 * be put into the OK state. 4604 */ 4605 nd->nd_flags |= MD_MN_NODE_OK; 4606 nd->nd_flags &= 4607 ~(MD_MN_NODE_ADD | MD_MN_NODE_DEL); 4608 /* 4609 * Mark own in master list as marked 4610 * on own node. 4611 */ 4612 if (nr->nr_flags & MD_MN_NODE_OWN) 4613 nd->nd_flags |= MD_MN_NODE_OWN; 4614 else 4615 nd->nd_flags &= ~MD_MN_NODE_OWN; 4616 } else { 4617 /* Otherwise, mark node as DEL */ 4618 nd->nd_flags &= ~MD_MN_NODE_OK; 4619 nd->nd_flags &= ~MD_MN_NODE_ADD; 4620 nd->nd_flags |= MD_MN_NODE_DEL; 4621 } 4622 } 4623 /* 4624 * If node is not ALIVE and marked DEL 4625 * on any node, make it DEL in master list. 4626 * If node is not ALIVE and marked ADD 4627 * on any node, make it ADD in master list 4628 * unless node record has already been marked DEL. 
4629 */ 4630 if (!(nr->nr_flags & MD_MN_NODE_ALIVE)) { 4631 if (nr->nr_flags & MD_MN_NODE_ADD) { 4632 if (!(nd->nd_flags & MD_MN_NODE_DEL)) { 4633 /* If not DEL - mark it ADD */ 4634 nd->nd_flags |= MD_MN_NODE_ADD; 4635 nd->nd_flags &= ~MD_MN_NODE_OK; 4636 } 4637 } 4638 if (nr->nr_flags & MD_MN_NODE_DEL) { 4639 nd->nd_flags |= MD_MN_NODE_DEL; 4640 nd->nd_flags &= ~MD_MN_NODE_OK; 4641 /* Could already be ADD - make it DEL */ 4642 nd->nd_flags &= ~MD_MN_NODE_ADD; 4643 } 4644 } 4645 nr = nr->nr_next; 4646 } 4647 /* 4648 * If a node record doesn't exist on its own node, 4649 * then mark node as DEL. 4650 */ 4651 if (found_my_nr == 0) { 4652 nd->nd_flags &= ~MD_MN_NODE_OK; 4653 nd->nd_flags |= MD_MN_NODE_DEL; 4654 } 4655 4656 /* 4657 * If node is OK - put mnsr onto master_mnsr_node list for 4658 * later use when syncing up the drive records in the set. 4659 */ 4660 if (nd->nd_flags & MD_MN_NODE_OK) { 4661 mnsr_node = Zalloc(sizeof (*mnsr_node)); 4662 mnsr_node->mmn_mnsr = mnsr; 4663 (void) strncpy(mnsr_node->mmn_nodename, 4664 nd->nd_nodename, MD_MAX_MNNODENAME_PLUS_1); 4665 mnsr_node->mmn_next = master_mnsr_node; 4666 master_mnsr_node = mnsr_node; 4667 } else { 4668 free_sr((struct md_set_record *)mnsr); 4669 } 4670 4671 nd = nd->nd_next; 4672 } 4673 4674 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 4675 "Master nodelist created for set %s: %s"), 4676 sp->setname, meta_print_hrtime(gethrtime() - start_time)); 4677 4678 /* 4679 * Send master nodelist to the rpc.metad on all nodes (including 4680 * myself) and each node will update itself. This will set the 4681 * ADD and DEL flags on each node as setup in the master nodelist. 4682 * Don't send nodelist to node where set doesn't exist. 
4683 */ 4684 nd = master_nodelist; 4685 while (nd) { 4686 if (!(nd->nd_flags & MD_MN_NODE_ALIVE) || 4687 (nd->nd_flags & MD_MN_NODE_NOSET)) { 4688 nd = nd->nd_next; 4689 continue; 4690 } 4691 if (clnt_upd_nr_flags(nd->nd_nodename, sp, 4692 master_nodelist, MD_NR_SET, MNSET_IN_RECONFIG, ep)) { 4693 /* If RPC failure to another node return 205 */ 4694 if ((mdanyrpcerror(ep)) && 4695 (sd->sd_mn_mynode->nd_nodeid != 4696 nd->nd_nodeid)) { 4697 rval = 205; 4698 } else { 4699 /* Any other failure */ 4700 rval = -1; 4701 } 4702 goto out; 4703 } 4704 nd = nd->nd_next; 4705 } 4706 4707 /* 4708 * Now, delete nodes that need to be deleted. 4709 */ 4710 if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST), 4711 ep)) == NULL) { 4712 if (! mdisok(ep)) { 4713 rval = -1; 4714 goto out; 4715 } 4716 } 4717 4718 /* 4719 * May be doing lots of RPC commands to the nodes, so lock the 4720 * ALIVE members of the set since most of the rpc.metad routines 4721 * require this for security reasons. 4722 */ 4723 nd = master_nodelist; 4724 while (nd) { 4725 /* Skip non-alive nodes and node without set */ 4726 if (!(nd->nd_flags & MD_MN_NODE_ALIVE) || 4727 (nd->nd_flags & MD_MN_NODE_NOSET)) { 4728 nd = nd->nd_next; 4729 continue; 4730 } 4731 if (clnt_lock_set(nd->nd_nodename, sp, ep)) { 4732 /* If RPC failure to another node return 205 */ 4733 if ((mdanyrpcerror(ep)) && 4734 (sd->sd_mn_mynode->nd_nodeid != 4735 nd->nd_nodeid)) { 4736 rval = 205; 4737 } else { 4738 /* Any other failure */ 4739 rval = -1; 4740 } 4741 goto out; 4742 } 4743 set_locked = 1; 4744 nd = nd->nd_next; 4745 } 4746 4747 nd = master_nodelist; 4748 while (nd) { 4749 /* Skip non-alive nodes */ 4750 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 4751 nd = nd->nd_next; 4752 continue; 4753 } 4754 if (nd->nd_flags & MD_MN_NODE_DEL) { 4755 num_alive_nodes_del++; 4756 /* 4757 * Delete this node rec from all ALIVE nodes in diskset. 
4758 */ 4759 nd2 = master_nodelist; 4760 while (nd2) { 4761 /* Skip non-alive nodes and node without set */ 4762 if (!(nd2->nd_flags & MD_MN_NODE_ALIVE) || 4763 (nd2->nd_flags & MD_MN_NODE_NOSET)) { 4764 nd2 = nd2->nd_next; 4765 continue; 4766 } 4767 4768 /* This is a node being deleted from set */ 4769 if (nd2->nd_nodeid == nd->nd_nodeid) { 4770 /* Mark set record as DEL */ 4771 if (clnt_upd_sr_flags(nd->nd_nodename, 4772 sp, MD_SR_DEL, ep)) { 4773 /* RPC failure to !my node */ 4774 if ((mdanyrpcerror(ep)) && 4775 (sd->sd_mn_mynode-> 4776 nd_nodeid 4777 != nd->nd_nodeid)) { 4778 rval = 205; 4779 } else { 4780 /* Any other failure */ 4781 rval = -1; 4782 } 4783 goto out; 4784 } 4785 if (clnt_deldrvs(nd->nd_nodename, sp, 4786 dd, ep)) { 4787 /* RPC failure to !my node */ 4788 if ((mdanyrpcerror(ep)) && 4789 (sd->sd_mn_mynode-> 4790 nd_nodeid 4791 != nd->nd_nodeid)) { 4792 rval = 205; 4793 } else { 4794 /* Any other failure */ 4795 rval = -1; 4796 } 4797 goto out; 4798 } 4799 if (clnt_delset(nd->nd_nodename, sp, 4800 ep) == -1) { 4801 /* RPC failure to !my node */ 4802 if ((mdanyrpcerror(ep)) && 4803 (sd->sd_mn_mynode-> 4804 nd_nodeid 4805 != nd->nd_nodeid)) { 4806 rval = 205; 4807 } else { 4808 /* Any other failure */ 4809 rval = -1; 4810 } 4811 goto out; 4812 } 4813 } else { 4814 /* 4815 * Delete host from sets on hosts 4816 * not being deleted. 
4817 */ 4818 anode[0] = Strdup(nd->nd_nodename); 4819 if (clnt_delhosts(nd2->nd_nodename, sp, 4820 1, anode, ep) == -1) { 4821 Free(anode[0]); 4822 /* RPC failure to !my node */ 4823 if ((mdanyrpcerror(ep)) && 4824 (sd->sd_mn_mynode-> 4825 nd_nodeid 4826 != nd2->nd_nodeid)) { 4827 rval = 205; 4828 } else { 4829 /* Any other failure */ 4830 rval = -1; 4831 } 4832 goto out; 4833 } 4834 4835 meta_mc_log(MC_LOG5, 4836 dgettext(TEXT_DOMAIN, 4837 "Deleted node %s (%d) on node %s " 4838 "from set %s: %s"), 4839 nd->nd_nodename, nd->nd_nodeid, 4840 nd2->nd_nodename, 4841 sp->setname, 4842 meta_print_hrtime( 4843 gethrtime() - start_time)); 4844 4845 Free(anode[0]); 4846 } 4847 nd2 = nd2->nd_next; 4848 } 4849 } 4850 nd = nd->nd_next; 4851 } 4852 4853 nd = master_nodelist; 4854 cl_sk = cl_get_setkey(sp->setno, sp->setname); 4855 while (nd) { 4856 /* Skip non-alive nodes and node without set */ 4857 if (!(nd->nd_flags & MD_MN_NODE_ALIVE) || 4858 (nd->nd_flags & MD_MN_NODE_NOSET)) { 4859 nd = nd->nd_next; 4860 continue; 4861 } 4862 if (clnt_unlock_set(nd->nd_nodename, cl_sk, ep)) { 4863 /* If RPC failure to another node return 205 */ 4864 if ((mdanyrpcerror(ep)) && 4865 (sd->sd_mn_mynode->nd_nodeid != 4866 nd->nd_nodeid)) { 4867 rval = 205; 4868 } else { 4869 /* Any other failure */ 4870 rval = -1; 4871 } 4872 goto out; 4873 } 4874 nd = nd->nd_next; 4875 } 4876 cl_set_setkey(NULL); 4877 set_locked = 0; 4878 4879 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 4880 "Nodelist syncronization complete for set %s: %s"), 4881 sp->setname, meta_print_hrtime(gethrtime() - start_time)); 4882 4883 metaflushsetname(sp); 4884 4885 /* 4886 * If all alive nodes have been deleted from set, just 4887 * return since nothing else can be done until non-alive 4888 * nodes (if there are any) rejoin the cluster. 4889 */ 4890 if (num_alive_nodes == num_alive_nodes_del) { 4891 rval = 0; 4892 goto out; 4893 } 4894 4895 /* 4896 * Sync up drive records. 
4897 * 4898 * If a node panic'd (or metaset command was killed) during the 4899 * addition or deletion of a drive to the diskset, the nodes 4900 * may have a different view of the drive list. During cleanup 4901 * of the drive list during reconfig, a drive will be deleted 4902 * from the list if the master node sees that the drive has been 4903 * marked in the ADD state on any node or is marked in the DEL state 4904 * on all nodes. 4905 * This cleanup must occur even if all nodes in the cluster are 4906 * not part of the cluster so that all nodes have the same view 4907 * of the drivelist. 4908 * Then if the entire cluster goes down and comes back up, the 4909 * new master node could be a node that wasn't in the cluster when 4910 * the node was deleted. This could lead to a situation where the 4911 * master node thinks that a drive is OK, but this drive isn't 4912 * known to the other nodes. 4913 * This situation can also occur during the addition of a drive 4914 * where a node has the drive marked OK, but the node executing the 4915 * metaset command encountered a failure before marking that drive OK 4916 * on the rest of the nodes. If the node with the OK drive then 4917 * panics, then the rest of the nodes will remove that drive marked ADD 4918 * and when the node with the OK drive rejoins the cluster, it will 4919 * have a drive marked OK that is unknown by the other nodes. 4920 * 4921 * There are 2 situations to consider: 4922 * A) Master knows about a drive that other nodes don't know about. 4923 * B) At least one slave node knows about a drive that the master 4924 * node doesn't know about. 4925 * 4926 * To handle these situations the following steps are followed: 4927 * 1) Count number of drives known by this master node and the 4928 * other slave nodes. 4929 * If all nodes have the same number of drives and the master has 4930 * all drives marked OK, then skip to step 4.
4931 * 4932 * 2) If a node has fewer drives listed than the master, the master 4933 * must get the drive descriptor list from that node so that 4934 * master can determine which drive it needs to delete from that 4935 * node. Master must get the drive descriptor list since the 4936 * drive record list does not contain the name of the drive, but 4937 * only a key and the key can only be interpreted on that other 4938 * node. 4939 * 4940 * 3) The master will then create the master drive list by doing: 4941 * - Master starts with drive list known by master. 4942 * - Any drive marked ADD will be removed from the list. 4943 * - Any drive not known by another node (from step 2) will be 4944 * removed from the drive list. 4945 * - If a drive is marked DEL on the master, the master must 4946 * verify that the drive record is marked DEL on all nodes. 4947 * If any node has the drive record marked OK, mark it OK 4948 * on the master. (The reason why is described below). 4949 * 4950 * 4) The master sends out the master drive list and the slave 4951 * nodes will force their drive lists to match the master 4952 * drive list by deleting drives, if necessary and by changing 4953 * the drive record states from ADD->OK if master has drive 4954 * marked OK and slave has drive marked ADD. 4955 * 4956 * Interesting scenarios: 4957 * 4958 * 1) System has 4 nodes with node 1 as the master. Node 3 starts 4959 * to delete a drive record (drive record on node 1 is marked DEL), 4960 * but is stopped when node 3 panics. Node 1 also panics. 4961 * During reconfig cycle, node 2 is picked as master and the drive 4962 * record is left alone since all nodes in the cluster have it 4963 * marked OK. User now sees drive as part of diskset. 4964 * Now, entire cluster is rebooted and node 1 rejoins the cluster. 4965 * Node 1 is picked as the master and node 1 has drive record 4966 * marked DEL.
Node 1 contacts all other nodes in the cluster 4967 * and since at least one node has the drive record marked OK, 4968 * the master marks the drive record OK. 4969 * User continues to see the drive as part of the diskset. 4970 */ 4971 4972 /* Reget set descriptor since flushed above */ 4973 if ((sd = metaget_setdesc(sp, ep)) == NULL) { 4974 rval = -1; 4975 goto out; 4976 } 4977 4978 /* Has side effect of setting sd->sd_drvs to same as master_dd */ 4979 if ((master_dd = metaget_drivedesc_sideno(sp, 4980 sd->sd_mn_mynode->nd_nodeid, 4981 (MD_BASICNAME_OK | PRINT_FAST), ep)) == NULL) { 4982 /* No drives in list */ 4983 if (!mdisok(ep)) { 4984 /* 4985 * Can't get drive list for this node, so 4986 * return -1 causing this node to be removed 4987 * cluster config and fixed. 4988 */ 4989 rval = -1; 4990 goto out; 4991 } 4992 } 4993 4994 /* Count the number of drives for all nodes */ 4995 mnsr_node = master_mnsr_node; 4996 while (mnsr_node) { 4997 dr_cnt = 0; 4998 dr = mnsr_node->mmn_mnsr->sr_drivechain; 4999 while (dr) { 5000 dr_cnt++; 5001 dr = dr->dr_next; 5002 } 5003 mnsr_node->mmn_numdrives = dr_cnt; 5004 mnsr_node = mnsr_node->mmn_next; 5005 } 5006 5007 /* Count the number of drives for the master; also check flags */ 5008 all_drives_ok = 1; 5009 dd_cnt = 0; 5010 dd = master_dd; 5011 while (dd) { 5012 dd_cnt++; 5013 if (!(dd->dd_flags & MD_DR_OK)) 5014 all_drives_ok = 0; 5015 dd = dd->dd_next; 5016 } 5017 5018 /* If all drives are ok, do quick check against number of drives */ 5019 if (all_drives_ok) { 5020 /* If all nodes have same number of drives, almost done */ 5021 mnsr_node = master_mnsr_node; 5022 while (mnsr_node) { 5023 if (mnsr_node->mmn_numdrives != dd_cnt) 5024 break; 5025 mnsr_node = mnsr_node->mmn_next; 5026 } 5027 /* All nodes have same number of drives, just send flags */ 5028 if (mnsr_node == NULL) { 5029 goto send_drive_list; 5030 } 5031 } 5032 5033 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 5034 "Begin detailed drive synchronization for set %s: 
%s"), 5035 sp->setname, meta_print_hrtime(gethrtime() - start_time)); 5036 5037 /* Detailed check required */ 5038 mnsr_node = master_mnsr_node; 5039 while (mnsr_node) { 5040 /* Does slave node have less drives than master? */ 5041 if (mnsr_node->mmn_numdrives < dd_cnt) { 5042 /* Yes - must determine which drive is missing */ 5043 if (clnt_getdrivedesc(mnsr_node->mmn_nodename, sp, 5044 &other_dd, ep)) { 5045 /* RPC failure to !my node */ 5046 if ((mdanyrpcerror(ep)) && 5047 (strcmp(mynode(), mnsr_node->mmn_nodename) 5048 != 0)) { 5049 rval = 205; 5050 } else { 5051 /* Any other failure */ 5052 rval = -1; 5053 } 5054 mde_perror(ep, dgettext(TEXT_DOMAIN, 5055 "Master node %s unable to " 5056 "retrieve drive list from node %s"), 5057 mynode(), mnsr_node->mmn_nodename); 5058 goto out; 5059 } 5060 mnsr_node->mmn_dd = other_dd; 5061 dd = master_dd; 5062 while (dd) { 5063 if (!(dd->dd_flags & MD_DR_OK)) { 5064 dd = dd->dd_next; 5065 continue; 5066 } 5067 other_dd = mnsr_node->mmn_dd; 5068 while (other_dd) { 5069 /* Convert to devids, when available */ 5070 if (strcmp(other_dd->dd_dnp->cname, 5071 dd->dd_dnp->cname) == 0) { 5072 break; 5073 } 5074 other_dd = other_dd->dd_next; 5075 } 5076 /* 5077 * dd not found on slave so mark it 5078 * ADD for later deletion (drives in ADD 5079 * state are deleted later in this routine). 
5080 */ 5081 if (other_dd == NULL) { 5082 dd->dd_flags = MD_DR_ADD; 5083 } 5084 dd = dd->dd_next; 5085 } 5086 5087 } 5088 mnsr_node = mnsr_node->mmn_next; 5089 } 5090 5091 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 5092 "Drive check completed for set %s: %s"), 5093 sp->setname, meta_print_hrtime(gethrtime() - start_time)); 5094 5095 dd = master_dd; 5096 dd_prev = 0; 5097 while (dd) { 5098 /* Remove any ADD drives from list */ 5099 if (dd->dd_flags & MD_DR_ADD) { 5100 if (dd_prev) { 5101 dd_prev->dd_next = dd->dd_next; 5102 dd->dd_next = NULL; 5103 metafreedrivedesc(&dd); 5104 dd = dd_prev->dd_next; 5105 } else { 5106 /* 5107 * If removing drive descriptor from head 5108 * of linked list, also change sd->sd_drvs. 5109 */ 5110 master_dd = sd->sd_drvs = dd->dd_next; 5111 dd->dd_next = NULL; 5112 metafreedrivedesc(&dd); 5113 dd = master_dd; 5114 } 5115 /* dd setup in if/else above */ 5116 continue; 5117 } 5118 /* 5119 * If drive is marked DEL, check all other nodes. 5120 * If drive on another node is marked OK, mark drive OK 5121 * in master list. If drive is marked DEL or doesn't exist 5122 * on all nodes, remove drive from list. 
5123 */ 5124 if (dd->dd_flags & MD_DR_DEL) { 5125 mnsr_node = master_mnsr_node; 5126 while (mnsr_node) { 5127 if (mnsr_node->mmn_dd == NULL) { 5128 if (clnt_getdrivedesc( 5129 mnsr_node->mmn_nodename, sp, 5130 &other_dd, ep)) { 5131 /* RPC failure to !my node */ 5132 if ((mdanyrpcerror(ep)) && 5133 (strcmp(mynode(), 5134 mnsr_node->mmn_nodename) 5135 != 0)) { 5136 rval = 205; 5137 } else { 5138 /* Any other failure */ 5139 rval = -1; 5140 } 5141 mde_perror(ep, 5142 dgettext(TEXT_DOMAIN, 5143 "Master node %s unable " 5144 "to retrieve drive list " 5145 "from node %s"), mynode(), 5146 mnsr_node->mmn_nodename); 5147 goto out; 5148 } 5149 mnsr_node->mmn_dd = other_dd; 5150 } 5151 other_dd = mnsr_node->mmn_dd; 5152 while (other_dd) { 5153 /* Found drive (OK) from other node */ 5154 if (strcmp(dd->dd_dnp->cname, 5155 other_dd->dd_dnp->cname) 5156 == 0) { 5157 /* Drive marked OK */ 5158 if (other_dd->dd_flags & 5159 MD_DR_OK) { 5160 dd->dd_flags = MD_DR_OK; 5161 } 5162 break; 5163 } 5164 other_dd = other_dd->dd_next; 5165 } 5166 if (dd->dd_flags == MD_DR_OK) 5167 break; 5168 5169 mnsr_node = mnsr_node->mmn_next; 5170 } 5171 /* 5172 * If no node had this drive marked OK, delete it. 5173 */ 5174 if (dd->dd_flags & MD_DR_DEL) { 5175 if (dd_prev) { 5176 dd_prev->dd_next = dd->dd_next; 5177 dd->dd_next = NULL; 5178 metafreedrivedesc(&dd); 5179 dd = dd_prev->dd_next; 5180 } else { 5181 /* 5182 * If removing drive descriptor from 5183 * head of linked list, also change 5184 * sd->sd_drvs. 
5185 */ 5186 master_dd = sd->sd_drvs = dd->dd_next; 5187 dd->dd_next = NULL; 5188 metafreedrivedesc(&dd); 5189 dd = master_dd; 5190 } 5191 /* dd setup in if/else above */ 5192 continue; 5193 } 5194 } 5195 dd_prev = dd; 5196 dd = dd->dd_next; 5197 } 5198 5199 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 5200 "Setting drive states completed for set %s: %s"), 5201 sp->setname, meta_print_hrtime(gethrtime() - start_time)); 5202 5203 send_drive_list: 5204 /* 5205 * Set genid on all drives to be the highest value seen. 5206 */ 5207 dd = master_dd; 5208 while (dd) { 5209 dd->dd_genid = max_genid; 5210 dd = dd->dd_next; 5211 } 5212 /* 5213 * Send updated drive list to all alive nodes. 5214 * Will also set genid on set and node records to have same 5215 * as the drive records. 5216 */ 5217 nd = sd->sd_nodelist; 5218 while (nd) { 5219 /* Skip non-alive nodes */ 5220 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 5221 nd = nd->nd_next; 5222 continue; 5223 } 5224 if (clnt_upd_dr_reconfig(nd->nd_nodename, sp, master_dd, ep)) { 5225 /* RPC failure to another node */ 5226 if ((mdanyrpcerror(ep)) && 5227 (sd->sd_mn_mynode->nd_nodeid != nd->nd_nodeid)) { 5228 rval = 205; 5229 } else { 5230 /* Any other failure */ 5231 rval = -1; 5232 } 5233 goto out; 5234 } 5235 nd = nd->nd_next; 5236 } 5237 5238 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 5239 "Sent drive list to all nodes for set %s: %s"), 5240 sp->setname, meta_print_hrtime(gethrtime() - start_time)); 5241 5242 /* 5243 * If no drive records left in set and nodes had been joined, 5244 * withdraw the nodes. Always reset the master and mark 5245 * all nodes as withdrawn on all nodes. 5246 */ 5247 if (master_dd == NULL) { 5248 /* Reset new master flag since no longer master */ 5249 (void) memset(&sf, 0, sizeof (sf)); 5250 sf.sf_setno = sp->setno; 5251 sf.sf_setflags = MD_SET_MN_NEWMAS_RC; 5252 sf.sf_flags = MDDB_NM_RESET; 5253 /* Use magic to help protect ioctl against attack. 
*/ 5254 sf.sf_magic = MDDB_SETFLAGS_MAGIC; 5255 /* Ignore failure, failure to reset flag isn't catastrophic */ 5256 (void) metaioctl(MD_MN_SET_SETFLAGS, &sf, 5257 &sf.sf_mde, NULL); 5258 5259 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 5260 "Reset new master flag for " "set %s: %s"), 5261 sp->setname, meta_print_hrtime(gethrtime() - start_time)); 5262 5263 nd = sd->sd_nodelist; 5264 while (nd) { 5265 /* Skip non-alive nodes */ 5266 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 5267 nd = nd->nd_next; 5268 continue; 5269 } 5270 5271 if (clnt_lock_set(nd->nd_nodename, sp, ep)) { 5272 /* RPC failure to another node */ 5273 if ((mdanyrpcerror(ep)) && 5274 (sd->sd_mn_mynode->nd_nodeid != 5275 nd->nd_nodeid)) { 5276 rval = 205; 5277 } else { 5278 /* Any other failure */ 5279 rval = -1; 5280 } 5281 goto out; 5282 } 5283 set_locked = 1; 5284 5285 /* Withdraw node from set if owner */ 5286 if ((nd->nd_flags & MD_MN_NODE_OWN) && 5287 (clnt_withdrawset(nd->nd_nodename, sp, ep))) { 5288 /* RPC failure to another node */ 5289 if ((mdanyrpcerror(ep)) && 5290 (sd->sd_mn_mynode->nd_nodeid != 5291 nd->nd_nodeid)) { 5292 rval = 205; 5293 } else { 5294 /* Any other failure */ 5295 rval = -1; 5296 } 5297 goto out; 5298 } 5299 5300 /* Mark all nodes as withdrawn on this node */ 5301 if (clnt_upd_nr_flags(nd->nd_nodename, sp, 5302 sd->sd_nodelist, MD_NR_WITHDRAW, NULL, ep)) { 5303 /* RPC failure to another node */ 5304 if ((mdanyrpcerror(ep)) && 5305 (sd->sd_mn_mynode->nd_nodeid != 5306 nd->nd_nodeid)) { 5307 rval = 205; 5308 } else { 5309 /* Any other failure */ 5310 rval = -1; 5311 } 5312 goto out; 5313 } 5314 5315 /* Resets master to no-master on this node */ 5316 if (clnt_mnsetmaster(nd->nd_nodename, sp, 5317 "", MD_MN_INVALID_NID, ep)) { 5318 /* RPC failure to another node */ 5319 if ((mdanyrpcerror(ep)) && 5320 (sd->sd_mn_mynode->nd_nodeid != 5321 nd->nd_nodeid)) { 5322 rval = 205; 5323 } else { 5324 /* Any other failure */ 5325 rval = -1; 5326 } 5327 goto out; 5328 } 5329 5330 cl_sk = 
cl_get_setkey(sp->setno, sp->setname); 5331 if (clnt_unlock_set(nd->nd_nodename, cl_sk, ep)) { 5332 /* RPC failure to another node */ 5333 if ((mdanyrpcerror(ep)) && 5334 (sd->sd_mn_mynode->nd_nodeid != 5335 nd->nd_nodeid)) { 5336 rval = 205; 5337 } else { 5338 /* Any other failure */ 5339 rval = -1; 5340 } 5341 goto out; 5342 } 5343 set_locked = 0; 5344 nd = nd->nd_next; 5345 } 5346 } 5347 5348 out: 5349 /* 5350 * If got here and set is still locked, then an error has 5351 * occurred and master_nodelist is still valid. 5352 * If error is not an RPC error, then unlock. 5353 * If error is an RPC error, skip unlocks since this could cause 5354 * yet another RPC timeout if a node has failed. 5355 * Ignore failures in unlock since unlock is just trying to 5356 * clean things up. 5357 */ 5358 if ((set_locked) && !(mdanyrpcerror(ep))) { 5359 nd = master_nodelist; 5360 cl_sk = cl_get_setkey(sp->setno, sp->setname); 5361 while (nd) { 5362 /* Skip non-alive nodes */ 5363 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 5364 nd = nd->nd_next; 5365 continue; 5366 } 5367 /* 5368 * If clnt_unlock fails, just break out since next 5369 * reconfig cycle will reset the locks anyway. 5370 */ 5371 if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) { 5372 break; 5373 } 5374 nd = nd->nd_next; 5375 } 5376 cl_set_setkey(NULL); 5377 } 5378 /* Free master_mnsr and drive descs */ 5379 mnsr_node = master_mnsr_node; 5380 while (mnsr_node) { 5381 master_mnsr_node = mnsr_node->mmn_next; 5382 free_sr((md_set_record *)mnsr_node->mmn_mnsr); 5383 free_rem_dd(mnsr_node->mmn_dd); 5384 Free(mnsr_node); 5385 mnsr_node = master_mnsr_node; 5386 } 5387 5388 /* Frees sd->sd_drvs (which is also master_dd) */ 5389 metaflushsetname(sp); 5390 return (rval); 5391 } 5392 5393 /* 5394 * meta_mnsync_diskset_mddbs 5395 * Calling node is guaranteed to be an owner node. 5396 * Calling node is the master node. 5397 * 5398 * Master node verifies that ondisk mddb format matches its incore format. 
5399 * If no nodes are joined to set, remove the change log entries. 5400 * If a node is joined to set, play the change log. 5401 * 5402 * Returns 0 - Success 5403 * 1 - Master unable to join to set. 5404 * 205 - Failure during RPC to another node 5405 * -1 - Any other failure and ep is filled in. 5406 * -1 return will eventually cause node to panic 5407 * in a SunCluster environment. 5408 */ 5409 int 5410 meta_mnsync_diskset_mddbs( 5411 mdsetname_t *sp, 5412 md_error_t *ep 5413 ) 5414 { 5415 md_set_desc *sd; 5416 mddb_config_t c; 5417 md_mn_msgclass_t class; 5418 mddb_setflags_config_t sf; 5419 md_mnnode_desc *nd, *nd2; 5420 md_error_t xep = mdnullerror; 5421 int stale_set = 0; 5422 5423 /* If setname is there, set desc should exist. */ 5424 if ((sd = metaget_setdesc(sp, ep)) == NULL) { 5425 mde_perror(ep, dgettext(TEXT_DOMAIN, 5426 "Unable to get set %s desc information"), sp->setname); 5427 return (-1); 5428 } 5429 5430 /* Are there drives in the set? */ 5431 if (metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST), 5432 ep) == NULL) { 5433 if (! mdisok(ep)) { 5434 return (-1); 5435 } 5436 /* No drives in set -- nothing to sync up */ 5437 return (0); 5438 } 5439 5440 /* 5441 * Is master node (which is this node) joined to set? 5442 * If master node isn't joined (which means that no nodes 5443 * are joined to diskset), remove the change log entries 5444 * since no need to replay them - all nodes will have same 5445 * view of mddbs since all nodes are reading in the mddbs 5446 * from disk. 5447 * There is also no need to sync up the master and ondisk mddbs 5448 * since master has no incore knowledge. 5449 * Need to join master to set in order to flush the change 5450 * log entries. Don't need to block I/O during join of master 5451 * to set since no other nodes are joined to set and so no I/O 5452 * can be occurring. 
5453 */ 5454 if (!(sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN)) { 5455 /* Join master to set */ 5456 if (clnt_joinset(mynode(), sp, 5457 MNSET_IN_RECONFIG, ep)) { 5458 if (mdismddberror(ep, MDE_DB_STALE)) { 5459 /* 5460 * If STALE, print message and continue on. 5461 * Don't do any writes or reads to mddbs 5462 * so don't clear change log. 5463 */ 5464 mde_perror(ep, dgettext(TEXT_DOMAIN, 5465 "Join of master node to STALE set %s"), 5466 sp->setname); 5467 stale_set = 1; 5468 mdclrerror(ep); 5469 } else if (mdismddberror(ep, MDE_DB_ACCOK)) { 5470 /* ACCOK means mediator provided extra vote */ 5471 mdclrerror(ep); 5472 } else { 5473 /* 5474 * If master is unable to join set, print an 5475 * error message. Don't return failure or node 5476 * will panic during cluster reconfig cycle. 5477 * Also, withdraw node from set in order to 5478 * cleanup from failed join attempt. 5479 */ 5480 mde_perror(ep, dgettext(TEXT_DOMAIN, 5481 "Join of master node in set %s failed"), 5482 sp->setname); 5483 if (clnt_withdrawset(mynode(), sp, &xep)) 5484 mdclrerror(&xep); 5485 return (1); 5486 } 5487 } 5488 /* 5489 * Master node successfully joined. 5490 * Set local copy of flags to OWN and 5491 * send owner flag to rpc.metad. If not stale, 5492 * flush the change log. 
5493 */ 5494 sd->sd_mn_mynode->nd_flags |= MD_MN_NODE_OWN; 5495 if (clnt_upd_nr_flags(mynode(), sp, sd->sd_nodelist, MD_NR_SET, 5496 MNSET_IN_RECONFIG, ep)) { 5497 mde_perror(ep, dgettext(TEXT_DOMAIN, 5498 "Flag update of master node join in set %s failed"), 5499 sp->setname); 5500 return (-1); 5501 } 5502 5503 if (!stale_set) { 5504 if (mdmn_reset_changelog(sp, ep, 5505 MDMN_CLF_RESETLOG) != 0) { 5506 mde_perror(ep, dgettext(TEXT_DOMAIN, 5507 "Unable to reset changelog.")); 5508 return (-1); 5509 } 5510 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 5511 "Removed changelog entries for set %s: %s"), 5512 sp->setname, 5513 meta_print_hrtime(gethrtime() - start_time)); 5514 } 5515 /* Reset new master flag before return */ 5516 (void) memset(&sf, 0, sizeof (sf)); 5517 sf.sf_setno = sp->setno; 5518 sf.sf_setflags = MD_SET_MN_NEWMAS_RC; 5519 sf.sf_flags = MDDB_NM_RESET; 5520 /* Use magic to help protect ioctl against attack. */ 5521 sf.sf_magic = MDDB_SETFLAGS_MAGIC; 5522 /* Ignore failure, failure to reset flag isn't catastrophic */ 5523 (void) metaioctl(MD_MN_SET_SETFLAGS, &sf, 5524 &sf.sf_mde, NULL); 5525 5526 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 5527 "Reset new master flag for set %s: %s"), 5528 sp->setname, meta_print_hrtime(gethrtime() - start_time)); 5529 5530 return (0); 5531 } 5532 5533 /* 5534 * Is master already joined to STALE set (< 50% mddbs avail)? 5535 * If so, can make no config changes to mddbs so don't check or play 5536 * changelog and don't sync master node to ondisk mddbs. 5537 * To get out of the stale state all nodes must be withdrawn 5538 * from set. Then as nodes are re-joined, all nodes will 5539 * have same view of mddbs since all nodes are reading the 5540 * mddbs from disk. 
5541 */ 5542 (void) memset(&c, 0, sizeof (c)); 5543 c.c_id = 0; 5544 c.c_setno = sp->setno; 5545 if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) { 5546 (void) mdstealerror(ep, &c.c_mde); 5547 return (-1); 5548 } 5549 if (c.c_flags & MDDB_C_STALE) { 5550 return (0); 5551 } 5552 5553 /* 5554 * If this node is NOT a newly chosen master, then there's 5555 * nothing else to do since the change log should be empty and 5556 * the ondisk and incore mddbs are already consistent. 5557 * 5558 * A newly chosen master is a node that was not the master 5559 * at the beginning of the reconfig cycle. If a node is a new 5560 * master, then the new master state is reset after the ondisk 5561 * and incore mddbs are consistent and the change log has 5562 * been replayed. 5563 */ 5564 (void) memset(&sf, 0, sizeof (sf)); 5565 sf.sf_setno = sp->setno; 5566 sf.sf_flags = MDDB_NM_GET; 5567 /* Use magic to help protect ioctl against attack. */ 5568 sf.sf_magic = MDDB_SETFLAGS_MAGIC; 5569 if ((metaioctl(MD_MN_GET_SETFLAGS, &sf, &sf.sf_mde, NULL) == 0) && 5570 ((sf.sf_setflags & MD_SET_MN_NEWMAS_RC) == 0)) { 5571 return (0); 5572 } 5573 5574 /* 5575 * Now, sync up incore master view to ondisk mddbs. 5576 * This is needed in the case where a master node 5577 * had made a change to the mddb, but this change 5578 * may not have been relayed to the slaves yet. 5579 * So, the new master needs to verify that the ondisk 5580 * mddbs match what the new master has incore - 5581 * if different, new master rewrites all of the mddbs. 5582 * Then the new master will replay the changelog and the 5583 * new master will then execute what the old master had 5584 * done. 5585 * 5586 * Block all I/Os to disks in this diskset on all nodes in 5587 * the diskset. This will allow the rewriting of the mddbs 5588 * (if needed), to proceed in a timely manner. 5589 * 5590 * If block of I/Os fail, return a -1. 
5591 */ 5592 5593 nd = sd->sd_nodelist; 5594 while (nd) { 5595 /* Skip non-alive and non-owner nodes */ 5596 if ((!(nd->nd_flags & MD_MN_NODE_ALIVE)) || 5597 (!(nd->nd_flags & MD_MN_NODE_OWN))) { 5598 nd = nd->nd_next; 5599 continue; 5600 } 5601 if (clnt_mn_susp_res_io(nd->nd_nodename, sp->setno, 5602 MN_SUSP_IO, ep)) { 5603 mde_perror(ep, dgettext(TEXT_DOMAIN, 5604 "Unable to suspend I/O on node %s in set %s"), 5605 nd->nd_nodename, sp->setname); 5606 5607 /* 5608 * Resume all other nodes that had been suspended. 5609 * (Reconfig return step also resumes I/Os 5610 * for all sets.) 5611 */ 5612 nd2 = sd->sd_nodelist; 5613 while (nd2) { 5614 /* Stop when reaching failed node */ 5615 if (nd2->nd_nodeid == nd->nd_nodeid) 5616 break; 5617 /* Skip non-alive and non-owner nodes */ 5618 if ((!(nd2->nd_flags & MD_MN_NODE_ALIVE)) || 5619 (!(nd2->nd_flags & MD_MN_NODE_OWN))) { 5620 nd2 = nd2->nd_next; 5621 continue; 5622 } 5623 (void) (clnt_mn_susp_res_io(nd2->nd_nodename, 5624 sp->setno, MN_RES_IO, &xep)); 5625 nd2 = nd2->nd_next; 5626 } 5627 5628 /* 5629 * If an RPC failure on another node, return a 205. 5630 * Otherwise, exit with failure. 5631 */ 5632 if ((mdanyrpcerror(ep)) && 5633 (sd->sd_mn_mynode->nd_nodeid != 5634 nd->nd_nodeid)) { 5635 return (205); 5636 } else { 5637 return (-1); 5638 } 5639 5640 } 5641 nd = nd->nd_next; 5642 } 5643 5644 (void) memset(&c, 0, sizeof (c)); 5645 c.c_id = 0; 5646 c.c_setno = sp->setno; 5647 /* Master can't sync up to ondisk mddbs? Kick it out of cluster */ 5648 if (metaioctl(MD_MN_CHK_WRT_MDDB, &c, &c.c_mde, NULL) != 0) 5649 return (-1); 5650 5651 /* 5652 * Resume I/Os that were suspended above. 
5653 */ 5654 nd = sd->sd_nodelist; 5655 while (nd) { 5656 /* Skip non-alive and non-owner nodes */ 5657 if ((!(nd->nd_flags & MD_MN_NODE_ALIVE)) || 5658 (!(nd->nd_flags & MD_MN_NODE_OWN))) { 5659 nd = nd->nd_next; 5660 continue; 5661 } 5662 if (clnt_mn_susp_res_io(nd->nd_nodename, sp->setno, 5663 MN_RES_IO, ep)) { 5664 mde_perror(ep, dgettext(TEXT_DOMAIN, 5665 "Unable to resume I/O on node %s in set %s"), 5666 nd->nd_nodename, sp->setname); 5667 5668 /* 5669 * If an RPC failure then don't do any 5670 * more RPC calls, since one timeout is enough 5671 * to endure. If RPC failure to another node, return 5672 * 205. If RPC failure to my node, return -1. 5673 * If not an RPC failure, continue resuming the 5674 * rest of the nodes and then return -1. 5675 */ 5676 if (mdanyrpcerror(ep)) { 5677 if (sd->sd_mn_mynode->nd_nodeid == 5678 nd->nd_nodeid) { 5679 return (-1); 5680 } else { 5681 return (205); 5682 } 5683 } 5684 5685 /* 5686 * If not an RPC error, continue resuming rest of 5687 * nodes, ignoring any failures except for an 5688 * RPC failure which constitutes an immediate exit. 5689 * Start in middle of list with failing node. 5690 */ 5691 nd2 = nd->nd_next; 5692 while (nd2) { 5693 /* Skip non-alive and non-owner nodes */ 5694 if ((!(nd2->nd_flags & MD_MN_NODE_ALIVE)) || 5695 (!(nd2->nd_flags & MD_MN_NODE_OWN))) { 5696 nd2 = nd2->nd_next; 5697 continue; 5698 } 5699 (void) (clnt_mn_susp_res_io(nd2->nd_nodename, 5700 sp->setno, MN_RES_IO, &xep)); 5701 if (mdanyrpcerror(&xep)) { 5702 return (-1); 5703 } 5704 nd2 = nd2->nd_next; 5705 } 5706 } 5707 nd = nd->nd_next; 5708 } 5709 5710 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, "Master node has completed " 5711 "checking/writing the mddb for set %s: %s"), sp->setname, 5712 meta_print_hrtime(gethrtime() - start_time)); 5713 5714 /* 5715 * Send (aka replay) all messages we find in the changelog. 
5716 * Flag the messages with 5717 * MD_MSGF_REPLAY_MSG, so no new message ID is generated for them 5718 * MD_MSGF_OVERRIDE_SUSPEND so they can pass the suspended commd. 5719 */ 5720 for (class = MD_MN_NCLASSES - 1; class > 0; class--) { 5721 mdmn_changelog_record_t *lr; 5722 md_error_t xep = mdnullerror; 5723 md_mn_result_t *resultp = NULL; 5724 int ret; 5725 5726 lr = mdmn_get_changelogrec(sp->setno, class); 5727 if ((lr->lr_flags & MD_MN_LR_INUSE) == 0) { 5728 /* no entry for this class */ 5729 continue; 5730 } 5731 5732 meta_mc_log(MC_LOG1, dgettext(TEXT_DOMAIN, 5733 "replaying message ID=(%d, 0x%llx-%d)\n"), 5734 MSGID_ELEMS(lr->lr_msg.msg_msgid)); 5735 5736 ret = mdmn_send_message_with_msgid( 5737 lr->lr_msg.msg_setno, 5738 lr->lr_msg.msg_type, 5739 lr->lr_msg.msg_flags | MD_MSGF_REPLAY_MSG | 5740 MD_MSGF_OVERRIDE_SUSPEND, 5741 lr->lr_msg.msg_recipient, 5742 lr->lr_msg.msg_event_data, 5743 lr->lr_msg.msg_event_size, 5744 &resultp, 5745 &lr->lr_msg.msg_msgid, 5746 &xep); 5747 5748 meta_mc_log(MC_LOG1, dgettext(TEXT_DOMAIN, 5749 "mdmn_send_message returned %d\n"), ret); 5750 5751 if (resultp) 5752 free_result(resultp); 5753 } 5754 5755 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 5756 "Playing changelog completed for set %s: %s"), 5757 sp->setname, meta_print_hrtime(gethrtime() - start_time)); 5758 5759 /* 5760 * Now that new master has ondisk and incore mddbs in sync, reset 5761 * this node's new master kernel flag (for this set). If this node 5762 * re-enters another reconfig cycle before the completion of this 5763 * reconfig cycle, this master node won't need to check if the ondisk 5764 * and incore mddbs are in sync since this node won't be considered 5765 * a new master (since this flag is being reset here in the middle of 5766 * step2). This will save time during any subsequent reconfig 5767 * cycles as long as this node continues to be master. 
5768 */ 5769 (void) memset(&sf, 0, sizeof (sf)); 5770 sf.sf_setno = sp->setno; 5771 sf.sf_setflags = MD_SET_MN_NEWMAS_RC; 5772 sf.sf_flags = MDDB_NM_RESET; 5773 /* Use magic to help protect ioctl against attack. */ 5774 sf.sf_magic = MDDB_SETFLAGS_MAGIC; 5775 /* Ignore failure, since failure to reset flag isn't catastrophic */ 5776 (void) metaioctl(MD_MN_SET_SETFLAGS, &sf, &sf.sf_mde, NULL); 5777 5778 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 5779 "Reset new master flag for set %s: %s"), 5780 sp->setname, meta_print_hrtime(gethrtime() - start_time)); 5781 5782 return (0); 5783 } 5784 5785 /* 5786 * meta_mnjoin_all will join all starting nodes in the diskset. 5787 * A starting node is considered to be any node that is not 5788 * an owner of the set but is a member of the cluster. 5789 * Master node is already joined to set (done in meta_mnsync_diskset_mddbs). 5790 * 5791 * Caller is the Master node. 5792 * 5793 * Returns 0 - Success 5794 * 205 - Failure during RPC to another node 5795 * -1 - Any other failure and ep is filled in. 5796 */ 5797 int 5798 meta_mnjoin_all( 5799 mdsetname_t *sp, 5800 md_error_t *ep 5801 ) 5802 { 5803 md_set_desc *sd; 5804 md_mnnode_desc *nd, *nd2; 5805 int rval = 0; 5806 int stale_flag = 0; 5807 mddb_config_t c; 5808 int susp_res_flag = 0; 5809 md_error_t xep = mdnullerror; 5810 5811 /* If setname is there, set desc should exist. */ 5812 if ((sd = metaget_setdesc(sp, ep)) == NULL) { 5813 mde_perror(ep, dgettext(TEXT_DOMAIN, 5814 "Unable to get set %s desc information"), sp->setname); 5815 return (-1); 5816 } 5817 5818 /* Are there drives in the set? */ 5819 if (metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST), 5820 ep) == NULL) { 5821 if (! mdisok(ep)) { 5822 return (-1); 5823 } 5824 /* No drives in set -- nothing to join */ 5825 return (0); 5826 } 5827 5828 /* 5829 * Is set currently stale? 
5830 */ 5831 (void) memset(&c, 0, sizeof (c)); 5832 c.c_id = 0; 5833 c.c_setno = sp->setno; 5834 /* Ignore failure since master node may not be joined yet */ 5835 (void) metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL); 5836 if (c.c_flags & MDDB_C_STALE) { 5837 stale_flag = MNSET_IS_STALE; 5838 } 5839 5840 /* 5841 * If any nodes are going to be joined to diskset, then 5842 * suspend I/O to all disks in diskset so that nodes can join 5843 * (read in mddbs) in a reasonable amount of time even under 5844 * high I/O load. Don't need to do this if set is STALE since 5845 * no I/O can be occurring to a STALE set. 5846 */ 5847 if (stale_flag != MNSET_IS_STALE) { 5848 nd = sd->sd_nodelist; 5849 while (nd) { 5850 /* Found a node that will be joined to diskset */ 5851 if ((nd->nd_flags & MD_MN_NODE_ALIVE) && 5852 (!(nd->nd_flags & MD_MN_NODE_OWN))) { 5853 /* Set flag that diskset should be suspended */ 5854 susp_res_flag = 1; 5855 break; 5856 } 5857 nd = nd->nd_next; 5858 } 5859 } 5860 5861 if (susp_res_flag) { 5862 /* 5863 * Block all I/Os to disks in this diskset on all joined 5864 * nodes in the diskset. 5865 * If block of I/Os fails due to an RPC failure on another 5866 * node, return 205; otherwise, return -1. 5867 */ 5868 nd = sd->sd_nodelist; 5869 while (nd) { 5870 /* Skip non-alive and non-owner nodes */ 5871 if ((!(nd->nd_flags & MD_MN_NODE_ALIVE)) || 5872 (!(nd->nd_flags & MD_MN_NODE_OWN))) { 5873 nd = nd->nd_next; 5874 continue; 5875 } 5876 if (clnt_mn_susp_res_io(nd->nd_nodename, sp->setno, 5877 MN_SUSP_IO, ep)) { 5878 mde_perror(ep, dgettext(TEXT_DOMAIN, 5879 "Unable to suspend I/O on node %s" 5880 " in set %s"), nd->nd_nodename, 5881 sp->setname); 5882 /* 5883 * Resume other nodes that had been suspended. 5884 * (Reconfig return step also resumes I/Os 5885 * for all sets.) 
5886 */ 5887 nd2 = sd->sd_nodelist; 5888 while (nd2) { 5889 /* Stop when reaching failed node */ 5890 if (nd2->nd_nodeid == nd->nd_nodeid) 5891 break; 5892 /* Skip non-alive/non-owner nodes */ 5893 if ((!(nd2->nd_flags & 5894 MD_MN_NODE_ALIVE)) || 5895 (!(nd2->nd_flags & 5896 MD_MN_NODE_OWN))) { 5897 nd2 = nd2->nd_next; 5898 continue; 5899 } 5900 (void) (clnt_mn_susp_res_io( 5901 nd2->nd_nodename, sp->setno, 5902 MN_RES_IO, &xep)); 5903 nd2 = nd2->nd_next; 5904 } 5905 5906 /* 5907 * If the suspend failed due to an 5908 * RPC failure on another node, return 5909 * a 205. 5910 * Otherwise, exit with failure. 5911 * The return reconfig step will resume 5912 * I/Os for all disksets. 5913 */ 5914 if ((mdanyrpcerror(ep)) && 5915 (sd->sd_mn_mynode->nd_nodeid != 5916 nd->nd_nodeid)) { 5917 return (205); 5918 } else { 5919 return (-1); 5920 } 5921 } 5922 nd = nd->nd_next; 5923 } 5924 } 5925 5926 nd = sd->sd_nodelist; 5927 while (nd) { 5928 /* 5929 * If a node is in the membership list but isn't joined 5930 * to the set, try to join the node. 5931 */ 5932 if ((nd->nd_flags & MD_MN_NODE_ALIVE) && 5933 (!(nd->nd_flags & MD_MN_NODE_OWN))) { 5934 if (clnt_joinset(nd->nd_nodename, sp, 5935 (MNSET_IN_RECONFIG | stale_flag), ep)) { 5936 /* 5937 * If RPC failure to another node 5938 * then exit without attempting anything else. 5939 * (Reconfig return step will resume I/Os 5940 * for all sets.) 5941 */ 5942 if (mdanyrpcerror(ep)) { 5943 mde_perror(ep, ""); 5944 return (205); 5945 } 5946 /* 5947 * STALE and ACCOK failures aren't true 5948 * failures. STALE means that <50% mddbs 5949 * are available. ACCOK means that the 5950 * mediator provided the extra vote. 5951 * If a true failure, then print messasge 5952 * and withdraw node from set in order to 5953 * cleanup from failed join attempt. 
5954 */ 5955 if ((!mdismddberror(ep, MDE_DB_STALE)) && 5956 (!mdismddberror(ep, MDE_DB_ACCOK))) { 5957 mde_perror(ep, 5958 "WARNING: Unable to join node %s " 5959 "to set %s", nd->nd_nodename, 5960 sp->setname); 5961 mdclrerror(ep); 5962 if (clnt_withdrawset(nd->nd_nodename, 5963 sp, &xep)) 5964 mdclrerror(&xep); 5965 nd = nd->nd_next; 5966 continue; 5967 } 5968 } 5969 /* Set owner flag even if STALE or ACCOK */ 5970 nd->nd_flags |= MD_MN_NODE_OWN; 5971 } 5972 nd = nd->nd_next; 5973 } 5974 /* 5975 * Resume I/Os if suspended above. 5976 */ 5977 if (susp_res_flag) { 5978 nd = sd->sd_nodelist; 5979 while (nd) { 5980 /* 5981 * Skip non-alive and non-owner nodes 5982 * (this list doesn't include any of 5983 * the nodes that were joined). 5984 */ 5985 if ((!(nd->nd_flags & MD_MN_NODE_ALIVE)) || 5986 (!(nd->nd_flags & MD_MN_NODE_OWN))) { 5987 nd = nd->nd_next; 5988 continue; 5989 } 5990 if (clnt_mn_susp_res_io(nd->nd_nodename, sp->setno, 5991 MN_RES_IO, ep)) { 5992 mde_perror(ep, dgettext(TEXT_DOMAIN, 5993 "Unable to resume I/O on node %s" 5994 " in set %s"), nd->nd_nodename, 5995 sp->setname); 5996 5997 /* 5998 * If an RPC failure then don't do any 5999 * more RPC calls, since one timeout is enough 6000 * to endure. If RPC failure to another node, 6001 * return 205. If RPC failure to my node, 6002 * return -1. 6003 * (Reconfig return step will resume I/Os 6004 * for all sets.) 6005 * If not an RPC failure, continue resuming the 6006 * rest of the nodes and then return -1. 6007 */ 6008 if (mdanyrpcerror(ep)) { 6009 if (sd->sd_mn_mynode->nd_nodeid == 6010 nd->nd_nodeid) { 6011 return (-1); 6012 } else { 6013 return (205); 6014 } 6015 } 6016 6017 /* 6018 * If not an RPC error, continue resuming rest 6019 * of nodes, ignoring any failures except for 6020 * an RPC failure which constitutes an 6021 * immediate exit. 6022 * Start in middle of list with failing node. 
6023 */ 6024 nd2 = nd->nd_next; 6025 while (nd2) { 6026 /* Skip non-owner nodes */ 6027 if ((!(nd2->nd_flags & 6028 MD_MN_NODE_ALIVE)) || 6029 (!(nd2->nd_flags & 6030 MD_MN_NODE_OWN))) { 6031 nd2 = nd2->nd_next; 6032 continue; 6033 } 6034 (void) (clnt_mn_susp_res_io( 6035 nd2->nd_nodename, sp->setno, 6036 MN_RES_IO, &xep)); 6037 if (mdanyrpcerror(&xep)) { 6038 return (-1); 6039 } 6040 nd2 = nd2->nd_next; 6041 } 6042 } 6043 nd = nd->nd_next; 6044 } 6045 } 6046 6047 nd = sd->sd_nodelist; 6048 while (nd) { 6049 if (!(nd->nd_flags & MD_MN_NODE_OWN)) { 6050 nd = nd->nd_next; 6051 continue; 6052 } 6053 /* 6054 * If 1 node fails - go ahead and update the rest except 6055 * in the case of an RPC failure, fail immediately. 6056 */ 6057 if (clnt_upd_nr_flags(nd->nd_nodename, sp, 6058 sd->sd_nodelist, MD_NR_SET, MNSET_IN_RECONFIG, ep)) { 6059 /* RPC failure to another node */ 6060 if (mdanyrpcerror(ep)) { 6061 return (205); 6062 } 6063 nd = nd->nd_next; 6064 rval = -1; 6065 continue; 6066 } 6067 nd = nd->nd_next; 6068 } 6069 6070 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 6071 "Join of all nodes completed for set %s: %s"), 6072 sp->setname, meta_print_hrtime(gethrtime() - start_time)); 6073 6074 return (rval); 6075 } 6076