1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #pragma ident "%Z%%M% %I% %E% SMI" 27 28 /* 29 * Just in case we're not in a build environment, make sure that 30 * TEXT_DOMAIN gets set to something. 
31 */ 32 #if !defined(TEXT_DOMAIN) 33 #define TEXT_DOMAIN "SYS_TEST" 34 #endif 35 36 /* 37 * Metadevice diskset interfaces 38 */ 39 40 #include "meta_set_prv.h" 41 #include <meta.h> 42 #include <metad.h> 43 #include <mdmn_changelog.h> 44 #include <sys/lvm/md_crc.h> 45 #include <sys/utsname.h> 46 #include <sdssc.h> 47 48 #include <sys/sysevent/eventdefs.h> 49 #include <sys/sysevent/svm.h> 50 extern char *blkname(char *); 51 52 static md_drive_desc * 53 dr2drivedesc( 54 mdsetname_t *sp, 55 side_t sideno, 56 int flags, 57 md_error_t *ep 58 ) 59 { 60 md_set_record *sr; 61 md_drive_record *dr; 62 mddrivename_t *dnp; 63 md_drive_desc *dd_head = NULL; 64 md_set_desc *sd; 65 66 if (flags & MD_BYPASS_DAEMON) { 67 if ((sr = metad_getsetbynum(sp->setno, ep)) == NULL) 68 return (NULL); 69 sd = metaget_setdesc(sp, ep); 70 sideno = getnodeside(mynode(), sd); 71 sp = metafakesetname(sp->setno, sr->sr_setname); 72 } else { 73 if ((sr = getsetbyname(sp->setname, ep)) == NULL) 74 return (NULL); 75 } 76 77 assert(sideno != MD_SIDEWILD); 78 79 /* 80 * WARNING: 81 * The act of getting the dnp from the namespace means that we 82 * will get the devid of the disk as recorded in the namespace. 83 * This devid has the potential to be stale if the disk is being 84 * replaced via a rebind, this means that any code that relies 85 * on any of the dnp information should take the appropriate action 86 * to preserve that information. For example in the rebind code the 87 * devid of the new disk is saved off and then copied back in once 88 * the code that has called this function has completed. 
89 */ 90 for (dr = sr->sr_drivechain; dr != NULL; dr = dr->dr_next) { 91 if ((dnp = metadrivename_withdrkey(sp, sideno, dr->dr_key, 92 flags, ep)) == NULL) { 93 if (!(flags & MD_BYPASS_DAEMON)) 94 free_sr(sr); 95 metafreedrivedesc(&dd_head); 96 return (NULL); 97 } 98 99 (void) metadrivedesc_append(&dd_head, dnp, dr->dr_dbcnt, 100 dr->dr_dbsize, dr->dr_ctime, dr->dr_genid, dr->dr_flags); 101 } 102 103 if (!(flags & MD_BYPASS_DAEMON)) { 104 free_sr(sr); 105 } 106 return (dd_head); 107 } 108 109 static int 110 get_sidenmlist( 111 mdsetname_t *sp, 112 mddrivename_t *dnp, 113 md_error_t *ep 114 ) 115 { 116 md_set_desc *sd; 117 mdsidenames_t *sn, **sn_next; 118 int i; 119 120 if ((sd = metaget_setdesc(sp, ep)) == NULL) 121 return (-1); 122 123 metaflushsidenames(dnp); 124 sn_next = &dnp->side_names; 125 if (MD_MNSET_DESC(sd)) { 126 /* 127 * Only get sidenames for this node since 128 * that is the only side information stored in 129 * the local mddb for a multi-node diskset. 130 */ 131 if (sd->sd_mn_mynode) { 132 sn = Zalloc(sizeof (*sn)); 133 sn->sideno = sd->sd_mn_mynode->nd_nodeid; 134 if ((sn->cname = meta_getnmentbykey(MD_LOCAL_SET, 135 sn->sideno, dnp->side_names_key, &sn->dname, 136 &sn->mnum, NULL, ep)) == NULL) { 137 if (sn->dname != NULL) 138 Free(sn->dname); 139 Free(sn); 140 return (-1); 141 } 142 143 /* Add to the end of the linked list */ 144 assert(*sn_next == NULL); 145 *sn_next = sn; 146 sn_next = &sn->next; 147 } 148 } else { 149 for (i = 0; i < MD_MAXSIDES; i++) { 150 /* Skip empty slots */ 151 if (sd->sd_nodes[i][0] == '\0') 152 continue; 153 154 sn = Zalloc(sizeof (*sn)); 155 sn->sideno = i; 156 if ((sn->cname = meta_getnmentbykey(MD_LOCAL_SET, 157 i+SKEW, dnp->side_names_key, &sn->dname, 158 &sn->mnum, NULL, ep)) == NULL) { 159 /* 160 * It is possible that during the add of a 161 * host to have a 'missing' side as the side 162 * for this disk will be added later. So ignore 163 * the error. 
The 'missing' side will be added 164 * once the addhosts process has completed. 165 */ 166 if (mdissyserror(ep, ENOENT)) { 167 mdclrerror(ep); 168 Free(sn); 169 continue; 170 } 171 172 if (sn->dname != NULL) 173 Free(sn->dname); 174 Free(sn); 175 return (-1); 176 } 177 178 /* Add to the end of the linked list */ 179 assert(*sn_next == NULL); 180 *sn_next = sn; 181 sn_next = &sn->next; 182 } 183 } 184 185 return (0); 186 } 187 188 static md_drive_desc * 189 rl_to_dd( 190 mdsetname_t *sp, 191 md_replicalist_t *rlp, 192 md_error_t *ep 193 ) 194 { 195 md_replicalist_t *rl; 196 md_replica_t *r; 197 md_drive_desc *dd = NULL; 198 md_drive_desc *d; 199 int found; 200 md_set_desc *sd; 201 daddr_t nblks = 0; 202 203 if ((sd = metaget_setdesc(sp, ep)) == NULL) 204 return (NULL); 205 206 /* find the smallest existing replica */ 207 for (rl = rlp; rl != NULL; rl = rl->rl_next) { 208 r = rl->rl_repp; 209 nblks = ((nblks == 0) ? r->r_nblk : min(r->r_nblk, nblks)); 210 } 211 212 if (nblks <= 0) 213 nblks = (MD_MNSET_DESC(sd)) ? MD_MN_DBSIZE : MD_DBSIZE; 214 215 for (rl = rlp; rl != NULL; rl = rl->rl_next) { 216 r = rl->rl_repp; 217 218 found = 0; 219 for (d = dd; d != NULL; d = d->dd_next) { 220 if (strcmp(r->r_namep->drivenamep->cname, 221 d->dd_dnp->cname) == 0) { 222 found = 1; 223 dd->dd_dbcnt++; 224 break; 225 } 226 } 227 228 if (! 
found) 229 (void) metadrivedesc_append(&dd, r->r_namep->drivenamep, 230 1, nblks, sd->sd_ctime, sd->sd_genid, MD_DR_OK); 231 } 232 233 return (dd); 234 } 235 236 /* 237 * Exported Entry Points 238 */ 239 240 set_t 241 get_max_sets(md_error_t *ep) 242 { 243 244 static set_t max_sets = 0; 245 246 if (max_sets == 0) 247 if (metaioctl(MD_IOCGETNSET, &max_sets, ep, NULL) != 0) 248 return (0); 249 250 return (max_sets); 251 } 252 253 int 254 get_max_meds(md_error_t *ep) 255 { 256 static int max_meds = 0; 257 258 if (max_meds == 0) 259 if (metaioctl(MD_MED_GET_NMED, &max_meds, ep, NULL) != 0) 260 return (0); 261 262 return (max_meds); 263 } 264 265 side_t 266 getmyside(mdsetname_t *sp, md_error_t *ep) 267 { 268 md_set_desc *sd; 269 char *node = NULL; 270 side_t sideno; 271 272 if (sp->setno == 0) 273 return (0); 274 275 if ((sd = metaget_setdesc(sp, ep)) == NULL) 276 return (MD_SIDEWILD); 277 278 node = mynode(); 279 280 assert(node != NULL); 281 282 sideno = getnodeside(node, sd); 283 284 if (sideno != MD_SIDEWILD) 285 return (sideno); 286 287 return (mddserror(ep, MDE_DS_HOSTNOSIDE, sp->setno, node, NULL, node)); 288 } 289 290 /* 291 * get set info from name 292 */ 293 md_set_record * 294 getsetbyname(char *setname, md_error_t *ep) 295 { 296 md_set_record *sr = NULL; 297 md_mnset_record *mnsr = NULL; 298 char *p; 299 size_t len; 300 301 /* get set info from daemon */ 302 if (clnt_getset(mynode(), setname, MD_SET_BAD, &sr, ep) == -1) 303 return (NULL); 304 if (sr != NULL) { 305 /* 306 * Returned record could be for a multi-node set or a 307 * non-multi-node set. 308 */ 309 if (MD_MNSET_REC(sr)) { 310 /* 311 * Record is for a multi-node set. Reissue call 312 * to get mnset information. Need to free 313 * record as if a non-multi-node set record since 314 * that is what clnt_getset gave us. If in 315 * the daemon, don't free since this is a pointer 316 * into the setrecords array. 317 */ 318 if (! 
md_in_daemon) { 319 sr->sr_flags &= ~MD_SR_MN; 320 free_sr(sr); 321 } 322 if (clnt_mngetset(mynode(), setname, MD_SET_BAD, &mnsr, 323 ep) == -1) 324 return (NULL); 325 if (mnsr != NULL) 326 return ((struct md_set_record *)mnsr); 327 } else { 328 return (sr); 329 } 330 } 331 332 /* no such set */ 333 len = strlen(setname) + 30; 334 p = Malloc(len); 335 (void) snprintf(p, len, "setname \"%s\"", setname); 336 (void) mderror(ep, MDE_NO_SET, p); 337 Free(p); 338 return (NULL); 339 } 340 341 /* 342 * get set info from number 343 */ 344 md_set_record * 345 getsetbynum(set_t setno, md_error_t *ep) 346 { 347 md_set_record *sr; 348 md_mnset_record *mnsr = NULL; 349 char buf[100]; 350 351 if (clnt_getset(mynode(), NULL, setno, &sr, ep) == -1) 352 return (NULL); 353 354 if (sr != NULL) { 355 /* 356 * Record is for a multi-node set. Reissue call 357 * to get mnset information. Need to free 358 * record as if a non-multi-node set record since 359 * that is what clnt_getset gave us. If in 360 * the daemon, don't free since this is a pointer 361 * into the setrecords array. 362 */ 363 if (MD_MNSET_REC(sr)) { 364 /* 365 * Record is for a multi-node set. Reissue call 366 * to get mnset information. 367 */ 368 if (! 
md_in_daemon) { 369 sr->sr_flags &= ~MD_SR_MN; 370 free_sr(sr); 371 } 372 if (clnt_mngetset(mynode(), NULL, setno, &mnsr, 373 ep) == -1) 374 return (NULL); 375 if (mnsr != NULL) 376 return ((struct md_set_record *)mnsr); 377 } else { 378 return (sr); 379 } 380 } 381 382 (void) sprintf(buf, "setno %u", setno); 383 (void) mderror(ep, MDE_NO_SET, buf); 384 return (NULL); 385 } 386 387 int 388 meta_check_drive_inuse( 389 mdsetname_t *sp, 390 mddrivename_t *dnp, 391 int check_db, 392 md_error_t *ep 393 ) 394 { 395 mdnamelist_t *nlp = NULL; 396 mdnamelist_t *p; 397 int rval = 0; 398 399 /* get all underlying partitions */ 400 if (meta_getalldevs(sp, &nlp, check_db, ep) != 0) 401 return (-1); 402 403 /* search for drive */ 404 for (p = nlp; (p != NULL); p = p->next) { 405 mdname_t *np = p->namep; 406 407 if (strcmp(dnp->cname, np->drivenamep->cname) == 0) { 408 rval = (mddserror(ep, MDE_DS_DRIVEINUSE, sp->setno, 409 NULL, dnp->cname, sp->setname)); 410 break; 411 } 412 } 413 414 /* cleanup, return success */ 415 metafreenamelist(nlp); 416 return (rval); 417 } 418 419 /* 420 * simple check for ownership 421 */ 422 int 423 meta_check_ownership(mdsetname_t *sp, md_error_t *ep) 424 { 425 int ownset; 426 md_set_desc *sd; 427 md_drive_desc *dd; 428 md_replicalist_t *rlp = NULL; 429 md_error_t xep = mdnullerror; 430 431 if (metaislocalset(sp)) 432 return (0); 433 434 ownset = own_set(sp, NULL, TRUE, ep); 435 if (! mdisok(ep)) 436 return (-1); 437 438 if ((sd = metaget_setdesc(sp, ep)) == NULL) 439 return (-1); 440 441 dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST), ep); 442 if (! 
mdisok(ep)) 443 return (-1); 444 445 /* If we have no drive descriptors, check for no ownership */ 446 if (dd == NULL) { 447 if (ownset == MD_SETOWNER_NONE) 448 return (0); 449 450 /* If ownership somehow has come to exist, we must clean up */ 451 452 if (metareplicalist(sp, (MD_BASICNAME_OK | PRINT_FAST), &rlp, 453 &xep) < 0) 454 mdclrerror(&xep); 455 456 if ((dd = rl_to_dd(sp, rlp, &xep)) == NULL) 457 if (! mdisok(&xep)) 458 mdclrerror(&xep); 459 460 if (!(MD_MNSET_DESC(sd)) && !MD_ATSET_DESC(sd)) { 461 if (rel_own_bydd(sp, dd, TRUE, &xep)) 462 mdclrerror(&xep); 463 } 464 465 if (halt_set(sp, &xep)) 466 mdclrerror(&xep); 467 468 metafreereplicalist(rlp); 469 470 metafreedrivedesc(&dd); 471 472 return (0); 473 } 474 475 metafreedrivedesc(&sd->sd_drvs); 476 477 if (ownset == MD_SETOWNER_YES) 478 return (0); 479 480 return (mddserror(ep, MDE_DS_NOOWNER, sp->setno, NULL, NULL, 481 sp->setname)); 482 } 483 484 /* 485 * simple check for ownership 486 */ 487 int 488 meta_check_ownership_on_host(mdsetname_t *sp, char *hostname, md_error_t *ep) 489 { 490 md_set_desc *sd; 491 md_drive_desc *dd; 492 int bool; 493 494 if (metaislocalset(sp)) 495 return (0); 496 497 if ((sd = metaget_setdesc(sp, ep)) == NULL) 498 return (-1); 499 500 if (getnodeside(hostname, sd) == MD_SIDEWILD) 501 return (mddserror(ep, MDE_DS_NODENOTINSET, sp->setno, 502 hostname, NULL, sp->setname)); 503 504 dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST), ep); 505 if (! mdisok(ep)) 506 return (-1); 507 508 if (clnt_ownset(hostname, sp, &bool, ep) == -1) 509 return (-1); 510 511 if (dd == NULL) 512 return (0); 513 514 metafreedrivedesc(&sd->sd_drvs); 515 516 if (bool == TRUE) 517 return (0); 518 519 return (mddserror(ep, MDE_DS_NODEISNOTOWNER, sp->setno, hostname, NULL, 520 sp->setname)); 521 } 522 523 /* 524 * Function that determines if a node is in the multinode diskset 525 * membership list. Calling node passes in node to be checked and 526 * the nodelist as returned from meta_read_nodelist. 
This routine 527 * anticipates being called many times using the same diskset membership 528 * list which is why the alloc and free of the diskset membership list 529 * is left to the calling routine. 530 * Returns: 531 * 1 - if a member 532 * 0 - not a member 533 */ 534 int 535 meta_is_member( 536 char *node_name, 537 md_mn_nodeid_t node_id, 538 mndiskset_membershiplist_t *nl 539 ) 540 { 541 mndiskset_membershiplist_t *nl2; 542 int flag_check_name; 543 544 if (node_id != 0) 545 flag_check_name = 0; 546 else if (node_name != NULL) 547 flag_check_name = 1; 548 else 549 return (0); 550 551 nl2 = nl; 552 while (nl2) { 553 if (flag_check_name) { 554 /* Compare given name against name in member list */ 555 if (strcmp(nl2->msl_node_name, node_name) == 0) 556 break; 557 } else { 558 /* Compare given nodeid against nodeid in member list */ 559 if (nl2->msl_node_id == node_id) 560 break; 561 } 562 nl2 = nl2->next; 563 } 564 /* No match found in member list */ 565 if (nl2 == NULL) { 566 return (0); 567 } 568 /* Return 1 if node is in member list */ 569 return (1); 570 } 571 572 /* 573 * meta_getnext_devinfo should go to the host that 574 * has the device, to return the device name, driver name, minor num. 575 * We can take the big cheat for now, since it is a requirement 576 * that the device names and device numbers are the same, and 577 * just get the info locally. 578 * 579 * This routine is very similar to meta_getnextside_devinfo except 580 * that the specific side to be used is being passed in. 
581 * 582 * Exit status: 583 * 0 - No more side info to return 584 * 1 - More side info's to return 585 * -1 - An error has been detected 586 */ 587 /*ARGSUSED*/ 588 int 589 meta_getside_devinfo( 590 mdsetname_t *sp, /* for this set */ 591 char *bname, /* local block name (myside) */ 592 side_t sideno, /* sideno */ 593 char **ret_bname, /* block device name of returned side */ 594 char **ret_dname, /* driver name of returned side */ 595 minor_t *ret_mnum, /* minor number of returned side */ 596 md_error_t *ep 597 ) 598 { 599 mdname_t *np; 600 601 if (ret_bname != NULL) 602 *ret_bname = NULL; 603 if (ret_dname != NULL) 604 *ret_dname = NULL; 605 if (ret_mnum != NULL) 606 *ret_mnum = NODEV32; 607 608 609 if ((np = metaname(&sp, bname, LOGICAL_DEVICE, ep)) == NULL) 610 return (-1); 611 612 /* 613 * NOTE (future) - There will be more work here once devids are integrated 614 * into disksets. Then the side should be used to find the correct 615 * host and the b/d names should be gotten from that host. 616 */ 617 618 /* 619 * Return the side info. 620 */ 621 if (ret_bname != NULL) 622 *ret_bname = Strdup(np->bname); 623 624 if (ret_dname != NULL) { 625 mdcinfo_t *cinfo; 626 627 if ((cinfo = metagetcinfo(np, ep)) == NULL) 628 return (-1); 629 630 *ret_dname = Strdup(cinfo->dname); 631 } 632 633 if (ret_mnum != NULL) 634 *ret_mnum = meta_getminor(np->dev); 635 636 return (1); 637 } 638 639 /* 640 * Get the information on the device from the remote node using the devid 641 * of the disk. 
642 * 643 * Exit status: 644 * 0 - No more side info to return 645 * 1 - More side info's to return 646 * -1 - An error has been detected 647 */ 648 int 649 meta_getnextside_devinfo( 650 mdsetname_t *sp, /* for this set */ 651 char *bname, /* local block name (myside) */ 652 side_t *sideno, /* previous sideno & returned sideno */ 653 char **ret_bname, /* block device name of returned side */ 654 char **ret_dname, /* driver name of returned side */ 655 minor_t *ret_mnum, /* minor number of returned side */ 656 md_error_t *ep 657 ) 658 { 659 md_set_desc *sd; 660 int i; 661 mdname_t *np; 662 mddrivename_t *dnp; 663 char *devidstr = NULL; 664 int devidstrlen; 665 md_dev64_t retdev = NODEV64; 666 char *ret_devname = NULL; 667 char *ret_blkdevname = NULL; 668 char *ret_driver = NULL; 669 char *nodename; 670 int fd; 671 int ret = -1; 672 char *minor_name = NULL; 673 md_mnnode_desc *nd; 674 675 676 if (ret_bname != NULL) 677 *ret_bname = NULL; 678 if (ret_dname != NULL) 679 *ret_dname = NULL; 680 if (ret_mnum != NULL) 681 *ret_mnum = NODEV32; 682 683 if (metaislocalset(sp)) { 684 /* no more sides - we are done */ 685 if (*sideno != MD_SIDEWILD) 686 return (0); 687 688 /* First time through - set up return sideno */ 689 *sideno = 0; 690 } else { 691 692 /* 693 * Find the next sideno, starting after the one given. 694 */ 695 if ((sd = metaget_setdesc(sp, ep)) == NULL) 696 return (-1); 697 698 if (MD_MNSET_DESC(sd)) { 699 nd = sd->sd_nodelist; 700 if ((*sideno == MD_SIDEWILD) && 701 (nd != (struct md_mnnode_desc *)NULL)) { 702 *sideno = nd->nd_nodeid; 703 } else { 704 while (nd) { 705 /* 706 * Found given sideno, now find 707 * next sideno, if there is one. 
708 */ 709 if ((*sideno == nd->nd_nodeid) && 710 (nd->nd_next != 711 (struct md_mnnode_desc *)NULL)) { 712 *sideno = 713 nd->nd_next->nd_nodeid; 714 break; 715 } 716 nd = nd->nd_next; 717 } 718 if (nd == NULL) { 719 return (0); 720 } 721 } 722 if (*sideno == MD_SIDEWILD) 723 return (0); 724 } else { 725 for (i = (*sideno)+1; i < MD_MAXSIDES; i++) 726 /* Find next full slot */ 727 if (sd->sd_nodes[i][0] != '\0') 728 break; 729 730 /* No more sides - we are done */ 731 if (i == MD_MAXSIDES) 732 return (0); 733 734 /* Set up the return sideno */ 735 *sideno = i; 736 nodename = (char *)sd->sd_nodes[i]; 737 } 738 } 739 740 /* 741 * Need to pass the node the devid of the disk and get it to 742 * send back the details of the disk from that side. 743 */ 744 if ((np = metaname(&sp, bname, UNKNOWN, ep)) == NULL) 745 return (-1); 746 747 dnp = np->drivenamep; 748 749 /* 750 * By default, set up the parameters so that they are copied out. 751 */ 752 if (ret_bname != NULL) 753 *ret_bname = Strdup(np->bname); 754 755 if (ret_dname != NULL) { 756 mdcinfo_t *cinfo; 757 758 if ((cinfo = metagetcinfo(np, ep)) == NULL) 759 return (-1); 760 761 *ret_dname = Strdup(cinfo->dname); 762 } 763 764 if (ret_mnum != NULL) 765 *ret_mnum = meta_getminor(np->dev); 766 767 /* 768 * Try some optimization. If this is the local set or the device 769 * is a metadevice then just copy the information. If the device 770 * does not have a devid (due to not having a minor name) then 771 * fall back to the pre-devid behaviour of copying the information 772 * on the device: this is okay because the sanity checks before this 773 * call would have found any issues with the device. If it's a 774 * multi-node diskset also just return ie. copy. 775 */ 776 if (metaislocalset(sp) || metaismeta(np) || (dnp->devid == NULL) || 777 (MD_MNSET_DESC(sd))) 778 return (1); 779 780 if (np->minor_name == (char *)NULL) { 781 /* 782 * Have to get the minor name then. 
The slice should exist 783 * on the disk because it will have already been repartitioned 784 * up prior to getting to this point. 785 */ 786 if ((fd = open(np->bname, (O_RDONLY|O_NDELAY), 0)) < 0) { 787 (void) mdsyserror(ep, errno, np->bname); 788 return (-1); 789 } 790 (void) devid_get_minor_name(fd, &minor_name); 791 np->minor_name = Strdup(minor_name); 792 devid_str_free(minor_name); 793 (void) close(fd); 794 } 795 796 /* allocate extra space for "/" and NULL hence +2 */ 797 devidstrlen = strlen(dnp->devid) + strlen(np->minor_name) + 2; 798 devidstr = (char *)Malloc(devidstrlen); 799 800 /* 801 * As a minor name is supplied then the ret_devname will be 802 * appropriate to that minor_name and in this case it will be 803 * a block device ie /dev/dsk. 804 */ 805 (void) snprintf(devidstr, devidstrlen, 806 "%s/%s", dnp->devid, np->minor_name); 807 808 ret = clnt_devinfo_by_devid(nodename, sp, devidstr, &retdev, 809 np->bname, &ret_devname, &ret_driver, ep); 810 811 Free(devidstr); 812 813 /* 814 * If the other side is not running device id in disksets, 815 * 'ret' is set to ENOTSUP in which case we fallback to 816 * the existing behaviour 817 */ 818 if (ret == ENOTSUP) 819 return (1); 820 else if (ret == -1) 821 return (-1); 822 823 /* 824 * ret_devname comes from the rpc call and is a 825 * raw device name. We need to make this into a 826 * block device via blkname for further processing. 827 * Unfortunately, when our device id isn't found in 828 * the system, the rpc call will return a " " in 829 * ret_devname in which case we need to fill that in 830 * as ret_blkname because blkname of " " returns NULL. 
831 */ 832 if (ret_bname != NULL && ret_devname != NULL) { 833 ret_blkdevname = blkname(ret_devname); 834 if (ret_blkdevname == NULL) 835 *ret_bname = Strdup(ret_devname); 836 else 837 *ret_bname = Strdup(ret_blkdevname); 838 } 839 840 if (ret_dname != NULL && ret_driver != NULL) 841 *ret_dname = Strdup(ret_driver); 842 843 if (ret_mnum != NULL) 844 *ret_mnum = meta_getminor(retdev); 845 846 return (1); 847 } 848 849 int 850 meta_is_drive_in_anyset( 851 mddrivename_t *dnp, 852 mdsetname_t **spp, 853 int bypass_daemon, 854 md_error_t *ep 855 ) 856 { 857 set_t setno; 858 mdsetname_t *this_sp; 859 int is_it; 860 set_t max_sets; 861 862 if ((max_sets = get_max_sets(ep)) == 0) 863 return (-1); 864 865 assert(spp != NULL); 866 *spp = NULL; 867 868 for (setno = 1; setno < max_sets; setno++) { 869 if (!bypass_daemon) { 870 if ((this_sp = metasetnosetname(setno, ep)) == NULL) { 871 if (mdismddberror(ep, MDE_DB_NODB)) { 872 mdclrerror(ep); 873 return (0); 874 } 875 if (mdiserror(ep, MDE_NO_SET)) { 876 mdclrerror(ep); 877 continue; 878 } 879 return (-1); 880 } 881 } else 882 this_sp = metafakesetname(setno, NULL); 883 884 if ((is_it = meta_is_drive_in_thisset(this_sp, dnp, 885 bypass_daemon, ep)) == -1) { 886 if (mdiserror(ep, MDE_NO_SET)) { 887 mdclrerror(ep); 888 continue; 889 } 890 return (-1); 891 } 892 if (is_it) { 893 *spp = this_sp; 894 return (0); 895 } 896 } 897 return (0); 898 } 899 900 int 901 meta_is_drive_in_thisset( 902 mdsetname_t *sp, 903 mddrivename_t *dnp, 904 int bypass_daemon, 905 md_error_t *ep 906 ) 907 { 908 md_drive_desc *dd, *p; 909 910 if (bypass_daemon) 911 dd = dr2drivedesc(sp, MD_SIDEWILD, 912 (MD_BASICNAME_OK | MD_BYPASS_DAEMON), ep); 913 else 914 dd = metaget_drivedesc(sp, MD_BASICNAME_OK, ep); 915 916 if (dd == NULL) { 917 if (! 
mdisok(ep)) 918 return (-1); 919 return (0); 920 } 921 922 923 for (p = dd; p != NULL; p = p->dd_next) 924 if (strcmp(p->dd_dnp->cname, dnp->cname) == 0) 925 return (1); 926 return (0); 927 } 928 929 /* 930 * Check to see if devid is in use in any diskset. 931 * This is used in the case when a partial diskset is being imported 932 * to make sure that the unvailable drive isn't already in use in an 933 * already imported partial diskset. Can't check on the cname since the 934 * unavailable disk's cname is from the previous system and may collide 935 * with a cname on this system. 936 * Return values: 937 * 1: devid has been found in a diskset 938 * 0: devid not found in any diskset 939 */ 940 int 941 meta_is_devid_in_anyset( 942 void *devid, 943 mdsetname_t **spp, 944 md_error_t *ep 945 ) 946 { 947 set_t setno; 948 mdsetname_t *this_sp; 949 int is_it; 950 set_t max_sets; 951 952 if ((max_sets = get_max_sets(ep)) == 0) 953 return (-1); 954 955 assert(spp != NULL); 956 *spp = NULL; 957 958 for (setno = 1; setno < max_sets; setno++) { 959 if ((this_sp = metasetnosetname(setno, ep)) == NULL) { 960 if (mdismddberror(ep, MDE_DB_NODB)) { 961 mdclrerror(ep); 962 return (0); 963 } 964 if (mdiserror(ep, MDE_NO_SET)) { 965 mdclrerror(ep); 966 continue; 967 } 968 return (-1); 969 } 970 971 if ((is_it = meta_is_devid_in_thisset(this_sp, 972 devid, ep)) == -1) { 973 if (mdiserror(ep, MDE_NO_SET)) { 974 mdclrerror(ep); 975 continue; 976 } 977 return (-1); 978 } 979 if (is_it) { 980 *spp = this_sp; 981 return (0); 982 } 983 } 984 return (0); 985 } 986 987 int 988 meta_is_devid_in_thisset( 989 mdsetname_t *sp, 990 void *devid, 991 md_error_t *ep 992 ) 993 { 994 md_drive_desc *dd, *p; 995 ddi_devid_t dd_devid; 996 997 dd = metaget_drivedesc(sp, MD_BASICNAME_OK, ep); 998 if (dd == NULL) { 999 if (! 
mdisok(ep)) 1000 return (-1); 1001 return (0); 1002 } 1003 1004 for (p = dd; p != NULL; p = p->dd_next) { 1005 if (p->dd_dnp->devid == NULL) 1006 continue; 1007 (void) devid_str_decode(p->dd_dnp->devid, 1008 &dd_devid, NULL); 1009 if (dd_devid == NULL) 1010 continue; 1011 if (devid_compare(devid, dd_devid) == 0) { 1012 devid_free(dd_devid); 1013 return (1); 1014 } 1015 devid_free(dd_devid); 1016 } 1017 return (0); 1018 } 1019 1020 int 1021 meta_set_balance( 1022 mdsetname_t *sp, 1023 md_error_t *ep 1024 ) 1025 { 1026 md_set_desc *sd; 1027 md_drive_desc *dd, *curdd; 1028 daddr_t dbsize; 1029 daddr_t nblks; 1030 int i; 1031 int rval = 0; 1032 sigset_t oldsigs; 1033 md_setkey_t *cl_sk; 1034 md_error_t xep = mdnullerror; 1035 md_mnnode_desc *nd; 1036 int suspend1_flag = 0; 1037 1038 if ((sd = metaget_setdesc(sp, ep)) == NULL) 1039 return (-1); 1040 1041 dbsize = (MD_MNSET_DESC(sd)) ? MD_MN_DBSIZE : MD_DBSIZE; 1042 1043 /* Make sure we own the set */ 1044 if (meta_check_ownership(sp, ep) != 0) 1045 return (-1); 1046 1047 /* END CHECK CODE */ 1048 1049 /* 1050 * Get drive descriptors for the drives that are currently in the set. 1051 */ 1052 curdd = metaget_drivedesc(sp, MD_FULLNAME_ONLY, ep); 1053 1054 if (! mdisok(ep)) 1055 return (-1); 1056 1057 /* Find the minimum replica size in use is or use the default */ 1058 if ((nblks = meta_db_minreplica(sp, ep)) < 0) 1059 mdclrerror(ep); 1060 else 1061 dbsize = nblks; /* adjust replica size */ 1062 1063 /* Make sure we are blocking all signals */ 1064 if (procsigs(TRUE, &oldsigs, &xep) < 0) 1065 mdclrerror(&xep); 1066 1067 /* 1068 * Lock the set on current set members. 1069 * For MN diskset lock_set and SUSPEND are used to protect against 1070 * other meta* commands running on the other nodes. 
1071 */ 1072 if (MD_MNSET_DESC(sd)) { 1073 nd = sd->sd_nodelist; 1074 while (nd) { 1075 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 1076 nd = nd->nd_next; 1077 continue; 1078 } 1079 if (clnt_lock_set(nd->nd_nodename, sp, ep)) { 1080 rval = -1; 1081 goto out; 1082 } 1083 nd = nd->nd_next; 1084 } 1085 /* 1086 * Lock out other meta* commands by suspending 1087 * class 1 messages across the diskset. 1088 */ 1089 nd = sd->sd_nodelist; 1090 while (nd) { 1091 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 1092 nd = nd->nd_next; 1093 continue; 1094 } 1095 if (clnt_mdcommdctl(nd->nd_nodename, 1096 COMMDCTL_SUSPEND, sp, MD_MSG_CLASS1, 1097 MD_MSCF_NO_FLAGS, ep)) { 1098 rval = -1; 1099 goto out; 1100 } 1101 suspend1_flag = 1; 1102 nd = nd->nd_next; 1103 } 1104 } else { 1105 for (i = 0; i < MD_MAXSIDES; i++) { 1106 /* Skip empty slots */ 1107 if (sd->sd_nodes[i][0] == '\0') continue; 1108 1109 if (clnt_lock_set(sd->sd_nodes[i], sp, ep)) { 1110 rval = -1; 1111 goto out; 1112 } 1113 } 1114 } 1115 1116 /* We are not adding or deleting any drives, just balancing */ 1117 dd = NULL; 1118 1119 /* 1120 * Balance the DB's according to the list of existing drives and the 1121 * list of added drives. 1122 */ 1123 if ((rval = meta_db_balance(sp, dd, curdd, dbsize, ep)) == -1) 1124 goto out; 1125 1126 out: 1127 /* 1128 * Unlock diskset by resuming class 1 messages across the diskset. 1129 * Just resume all classes so that resume is the same whether 1130 * just one class was locked or all classes were locked. 1131 */ 1132 if (suspend1_flag) { 1133 nd = sd->sd_nodelist; 1134 while (nd) { 1135 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 1136 nd = nd->nd_next; 1137 continue; 1138 } 1139 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME, 1140 sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) { 1141 /* 1142 * We are here because we failed to resume 1143 * rpc.mdcommd. However we potentially have 1144 * an error from the previous call 1145 * (meta_db_balance). 
/*
 * meta_set_destroy
 *
 * Tear down a traditional (non-multinode) diskset from this host's point
 * of view: drop the drives from the local node, optionally detach the
 * replicas, halt the set, update the mediator hosts and finally delete
 * the set record locally.
 *
 *	sp		set to destroy
 *	lock_set	when TRUE the set is locked on this node before the
 *			teardown and unlocked again on the way out
 *	ep		error return
 *
 * Returns 0 on success and -1 on failure.  Many intermediate failures are
 * deliberately cleared (mdclrerror) so that as much as possible of the
 * set gets removed.
 *
 * NOTE(review): every "goto out" runs the unlock code when lock_set is
 * TRUE, including the paths taken before clnt_lock_set() was called or
 * when it failed - confirm that unlocking an unlocked set is harmless.
 */
int
meta_set_destroy(
	mdsetname_t	*sp,
	int		lock_set,
	md_error_t	*ep
)
{
	int		i;
	med_rec_t	medr;
	md_set_desc	*sd;
	md_drive_desc	*dd, *p, *p1;
	mddrivename_t	*dnp;
	mdname_t	*np;
	mdnamelist_t	*nlp = NULL;
	int		num_users = 0;
	int		has_set;
	side_t		mysideno;
	sigset_t	oldsigs;
	md_error_t	xep = mdnullerror;
	md_setkey_t	*cl_sk;
	int		rval = 0;
	int		delete_end = 1;

	/* Make sure we are blocking all signals */
	if (procsigs(TRUE, &oldsigs, ep) < 0)
		return (-1);

	if ((sd = metaget_setdesc(sp, ep)) == NULL) {
		if (! mdisok(ep))
			rval = -1;
		goto out;
	}

	/*
	 * meta_set_destroy should not be called for a MN diskset.
	 * This routine destroys a set without communicating this information
	 * to the other nodes which would lead to an inconsistency in
	 * the MN diskset.
	 */
	if (MD_MNSET_DESC(sd)) {
		rval = -1;
		goto out;
	}

	/* Continue if a traditional diskset */

	/*
	 * Check to see who has the set.  If we are not the last user of the
	 * set, we will not touch the replicas.
	 */
	for (i = 0; i < MD_MAXSIDES; i++) {
		/* Skip empty slots */
		if (sd->sd_nodes[i][0] == '\0')
			continue;

		has_set = nodehasset(sp, sd->sd_nodes[i], NHS_NST_EQ,
		    ep);

		/* A node that could not be queried is not counted as a user */
		if (has_set < 0) {
			mdclrerror(ep);
		} else
			num_users++;
	}

	/* A NULL drive list with a clean ep means the set has no drives */
	if ((dd = metaget_drivedesc(sp, MD_BASICNAME_OK, ep)) == NULL) {
		if (! mdisok(ep)) {
			rval = -1;
			goto out;
		}
	}

	if (setup_db_bydd(sp, dd, TRUE, ep) == -1) {
		rval = -1;
		goto out;
	}

	if (lock_set == TRUE) {
		/* Lock the set on our side */
		if (clnt_lock_set(mynode(), sp, ep)) {
			rval = -1;
			goto out;
		}
	}

	/*
	 * A traditional diskset has no diskset stale information to send
	 * since there can only be one owner node at a time.
	 */
	if (snarf_set(sp, FALSE, ep))
		mdclrerror(ep);

	if (dd != NULL) {
		/*
		 * Make sure that no drives are in use as parts of metadrives
		 * or hot spare pools, this is one of the few error conditions
		 * that will stop this routine, unless the environment has
		 * META_DESTROY_SET_OK set, in which case, the operation will
		 * proceed.
		 */
		if (getenv("META_DESTROY_SET_OK") == NULL) {
			for (p = dd; p != NULL; p = p->dd_next) {
				dnp = p->dd_dnp;

				i = meta_check_drive_inuse(sp, dnp, FALSE, ep);
				if (i == -1) {
					/* need xep - wire calls clear error */
					i = metaget_setownership(sp, &xep);
					if (i == -1) {
						rval = -1;
						goto out;
					}

					mysideno = getmyside(sp, &xep);

					if (mysideno == MD_SIDEWILD) {
						rval = -1;
						goto out;
					}

					/*
					 * Halt the set again if this node
					 * is not recorded as its owner.
					 */
					if (sd->sd_isown[mysideno] == FALSE)
						if (halt_set(sp, &xep)) {
							rval = -1;
							goto out;
						}

					/* ep still holds the in-use error */
					rval = -1;
					goto out;
				}
			}
		}

		for (i = 0; i < MD_MAXSIDES; i++) {
			/* Skip empty slots */
			if (sd->sd_nodes[i][0] == '\0')
				continue;

			/* Skip non local nodes */
			if (strcmp(mynode(), sd->sd_nodes[i]) != 0)
				continue;

			if (clnt_deldrvs(sd->sd_nodes[i], sp, dd, ep))
				mdclrerror(ep);
		}

		/*
		 * Go thru each drive and individually delete the replicas.
		 * This way we can ignore individual errors.
		 */
		for (p = dd; p != NULL; p = p->dd_next) {
			uint_t	rep_slice;

			dnp = p->dd_dnp;
			if ((meta_replicaslice(dnp, &rep_slice, ep) != 0) ||
			    (((np = metaslicename(dnp, rep_slice, ep))
			    == NULL) &&
			    ((np = metaslicename(dnp, MD_SLICE0, ep))
			    == NULL))) {
				rval = -1;
				goto out;
			}

			/*
			 * NOTE(review): this repeats the lookup made in the
			 * condition above; the only extra effect visible
			 * here is clearing ep when the fallback to
			 * MD_SLICE0 was needed.
			 */
			if ((np = metaslicename(dnp,
			    rep_slice, ep)) == NULL) {
				if ((np = metaslicename(dnp,
				    MD_SLICE0, ep)) == NULL) {
					rval = -1;
					goto out;
				}
				mdclrerror(ep);
			}

			/*
			 * Yes this is UGLY!!!
			 * Temporarily cut the list after p so that
			 * rel_own_bydd() only sees this one drive.
			 */
			p1 = p->dd_next;
			p->dd_next = NULL;
			if (rel_own_bydd(sp, p, FALSE, ep))
				mdclrerror(ep);
			p->dd_next = p1;

			/* No replicas on this drive, nothing to detach */
			if (p->dd_dbcnt == 0)
				continue;

			/*
			 * Skip the replica removal if we are not the last user
			 */
			if (num_users != 1)
				continue;

			nlp = NULL;
			(void) metanamelist_append(&nlp, np);
			if (meta_db_detach(sp, nlp,
			    (MDFORCE_DS | MDFORCE_SET_LOCKED), NULL, ep))
				mdclrerror(ep);
			metafreenamelist(nlp);
		}
	}

	if (halt_set(sp, ep)) {
		rval = -1;
		goto out;
	}

	/* Setup the mediator record */
	(void) memset(&medr, '\0', sizeof (med_rec_t));
	medr.med_rec_mag = MED_REC_MAGIC;
	medr.med_rec_rev = MED_REC_REV;
	medr.med_rec_fl = 0;
	medr.med_rec_sn = sp->setno;
	(void) strcpy(medr.med_rec_snm, sp->setname);
	medr.med_rec_meds = sd->sd_med;	/* structure assignment */
	(void) memset(&medr.med_rec_data, '\0', sizeof (med_data_t));
	medr.med_rec_foff = 0;

	/*
	 * If we are the last remaining user, then remove the mediator hosts
	 */
	if (num_users == 1) {
		for (i = 0; i < MED_MAX_HOSTS; i++) {
			if (medr.med_rec_meds.n_lst[i].a_cnt != 0)
				SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_REMOVE,
				    SVM_TAG_MEDIATOR, sp->setno, i);
			(void) memset(&medr.med_rec_meds.n_lst[i], '\0',
			    sizeof (md_h_t));
		}
		medr.med_rec_meds.n_cnt = 0;
	} else {	/* Remove this host from the mediator node list. */
		for (i = 0; i < MD_MAXSIDES; i++) {
			/* Skip empty slots */
			if (sd->sd_nodes[i][0] == '\0')
				continue;

			/* Copy non local node */
			if (strcmp(mynode(), sd->sd_nodes[i]) != 0) {
				(void) strcpy(medr.med_rec_nodes[i],
				    sd->sd_nodes[i]);
				continue;
			}

			/* Clear local node */
			(void) memset(&medr.med_rec_nodes[i], '\0',
			    sizeof (md_node_nm_t));
		}
	}

	/* Checksum the finished record */
	crcgen(&medr, &medr.med_rec_cks, sizeof (med_rec_t), NULL);

	/*
	 * If the client is part of a cluster put the DCS service
	 * into a deleting state.
	 *
	 * NOTE(review): on the non-autotake failure path ep is cleared and
	 * rval is left untouched, so the function reports success (unless
	 * the cleanup at "out" fails) although the set was not deleted.
	 * meta_set_purge() returns a distinct code for the same condition -
	 * confirm whether this is intended.
	 */
	if (sdssc_delete_begin(sp->setname) == SDSSC_ERROR) {
		if (metad_isautotakebyname(sp->setname)) {
			delete_end = 0;
		} else {
			mdclrerror(ep);
			goto out;
		}
	}

	/* Inform the mediator hosts of the new information */
	for (i = 0; i < MED_MAX_HOSTS; i++) {
		if (sd->sd_med.n_lst[i].a_cnt == 0)
			continue;

		if (clnt_med_upd_rec(&sd->sd_med.n_lst[i], sp, &medr, ep))
			mdclrerror(ep);
	}

	/* Delete the set locally */
	for (i = 0; i < MD_MAXSIDES; i++) {
		/* Skip empty slots */
		if (sd->sd_nodes[i][0] == '\0')
			continue;

		/* Skip non local nodes */
		if (strcmp(mynode(), sd->sd_nodes[i]) != 0)
			continue;

		if (clnt_delset(sd->sd_nodes[i], sp, ep) == -1)
			mdclrerror(ep);
	}
	/* delete_end is 0 when delete_begin failed on an autotake set */
	if (delete_end &&
	    sdssc_delete_end(sp->setname, SDSSC_COMMIT) == SDSSC_ERROR)
		rval = -1;

out:
	/*
	 * release signals back to what they were on entry; cleanup errors
	 * are only reported through ep when no earlier error is pending.
	 */
	if (procsigs(FALSE, &oldsigs, &xep) < 0) {
		if (rval == 0)
			(void) mdstealerror(ep, &xep);
		rval = -1;
	}

	if (lock_set == TRUE) {
		cl_sk = cl_get_setkey(sp->setno, sp->setname);
		if (clnt_unlock_set(mynode(), cl_sk, &xep)) {
			if (rval == 0)
				(void) mdstealerror(ep, &xep);
			rval = -1;
		}
		cl_set_setkey(NULL);
	}

	metaflushsetname(sp);
	return (rval);
}
/*
 * meta_set_purge
 *
 * Remove the set from this host, handling both multinode and traditional
 * disksets.  The set is locked on every host that knows about it, the
 * local set record is deleted, the other hosts are asked to drop this
 * host from their host lists, and the locks are released again.
 *
 *	sp		set to purge
 *	bypass_cluster	when non-zero the sdssc_delete_begin/end calls are
 *			skipped
 *	forceflg	when non-zero RPC errors while locking/unlocking the
 *			set on a host are ignored
 *	ep		error return
 *
 * Return values (as set by the code below):
 *	0 - no failure was recorded
 *	1 - set description could not be obtained
 *	2 - locking the set on a host failed
 *	3 - sdssc_delete_begin failed and the set is not an autotake set
 *	4 - sdssc_delete_end(SDSSC_COMMIT) failed
 *	5 - unlocking the set on a host failed
 *
 * NOTE(review): when clnt_delset() fails the code jumps to out1 without
 * changing rval and with ep cleared, so that failure is only reported on
 * stderr via md_perror() - confirm that this is intended.
 */
int
meta_set_purge(
	mdsetname_t	*sp,
	int		bypass_cluster,
	int		forceflg,
	md_error_t	*ep
)
{
	char		*thishost = mynode();
	md_set_desc	*sd;
	md_setkey_t	*cl_sk;
	md_error_t	xep = mdnullerror;
	int		rval = 0;
	int		i, num_hosts = 0;
	int		has_set = 0;
	int		max_node = 0;
	int		delete_end = 1;
	md_mnnode_desc	*nd;

	if ((sd = metaget_setdesc(sp, ep)) == NULL) {
		/* unable to find set description */
		rval = 1;
		return (rval);
	}

	if (MD_MNSET_DESC(sd)) {
		/*
		 * Get a count of the hosts in the set and also lock the set
		 * on those hosts that know about it.
		 */
		nd = sd->sd_nodelist;
		while (nd) {
			/*
			 * Only deal with those nodes that are members of
			 * the set (MD_MN_NODE_ALIVE) or the node on which
			 * the purge is being run. We must lock the set
			 * on the purging node because the delset call
			 * requires the lock to be set.
			 */
			if (!(nd->nd_flags & MD_MN_NODE_ALIVE) &&
			    nd->nd_nodeid != sd->sd_mn_mynode->nd_nodeid) {
				nd = nd->nd_next;
				continue;
			}
			has_set = nodehasset(sp, nd->nd_nodename,
			    NHS_NST_EQ, ep);

			/*
			 * The host is not aware of this set (has_set < 0) or
			 * the set does not match (has_set == 0). This check
			 * prevents the code getting confused by an apparent
			 * inconsistency in the set's state, this is in the
			 * purge code so something is broken in any case and
			 * this is just trying to fix the brokenness.
			 */
			if (has_set <= 0) {
				mdclrerror(ep);
				nd->nd_flags |= MD_MN_NODE_NOSET;
			} else {
				num_hosts++;
				if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
					/*
					 * If the force flag is set then
					 * ignore any RPC failures because we
					 * are only really interested with
					 * the set on local node.
					 */
					if (forceflg && mdanyrpcerror(ep)) {
						mdclrerror(ep);
					} else {
						/*
						 * set max_node so that in the
						 * unlock code nodes in the
						 * set that have not been
						 * locked are not unlocked.
						 */
						max_node = nd->nd_nodeid;
						rval = 2;
						goto out1;
					}
				}

			}
			nd = nd->nd_next;
		}
		/*
		 * No lock failure: 0 makes the unlock loop below visit the
		 * whole list (presumably no node uses id 0 - TODO confirm).
		 */
		max_node = 0;
	} else {
		/*
		 * Get a count of the hosts in the set and also lock the set
		 * on those hosts that know about it.
		 */
		for (i = 0; i < MD_MAXSIDES; i++) {
			/* Skip empty slots */
			if (sd->sd_nodes[i][0] == '\0')
				continue;

			has_set = nodehasset(sp, sd->sd_nodes[i],
			    NHS_NST_EQ, ep);

			/*
			 * The host is not aware of this set (has_set < 0) or
			 * the set does not match (has_set == 0). This check
			 * prevents the code getting confused by an apparent
			 * inconsistency in the set's state, this is in the
			 * purge code so something is broken in any case and
			 * this is just trying to fix the brokenness.
			 */
			if (has_set <= 0) {
				mdclrerror(ep);
				/*
				 * set the node to NULL to prevent further
				 * requests to this unresponsive node.
				 */
				sd->sd_nodes[i][0] = '\0';
			} else {
				num_hosts++;
				if (clnt_lock_set(sd->sd_nodes[i], sp, ep)) {
					/*
					 * If the force flag is set then
					 * ignore any RPC failures because we
					 * are only really interested with
					 * the set on local node.
					 */
					if (forceflg && mdanyrpcerror(ep)) {
						mdclrerror(ep);
					} else {
						rval = 2;
						/*
						 * set max_node so that in the
						 * unlock code nodes in the
						 * set that have not been
						 * locked are not unlocked.
						 */
						max_node = i;
						goto out1;
					}
				}
			}
		}
		max_node = i;	/* now MD_MAXSIDES */
	}
	if (!bypass_cluster) {
		/*
		 * If there is only one host associated with the
		 * set then remove the set from the cluster.
		 */
		if (num_hosts == 1) {
			if (sdssc_delete_begin(sp->setname) == SDSSC_ERROR) {
				if (metad_isautotakebyname(sp->setname)) {
					/* skip the matching COMMIT below */
					delete_end = 0;
				} else {
					mdclrerror(ep);
					rval = 3;
					goto out1;
				}
			}
		}
	}

	if (MD_MNSET_DESC(sd)) {
		nd = sd->sd_nodelist;
		while (nd) {
			if (nd->nd_nodeid == sd->sd_mn_mynode->nd_nodeid) {
				/*
				 * This is the node on which the purge is
				 * being run. We do not care if it is
				 * alive or not, just want to get rid of
				 * the set.
				 */
				if (clnt_delset(nd->nd_nodename, sp,
				    ep) == -1) {
					md_perror(dgettext(TEXT_DOMAIN,
					    "delset"));
					if (!bypass_cluster && num_hosts == 1)
						(void) sdssc_delete_end(
						    sp->setname, SDSSC_CLEANUP);
					mdclrerror(ep);
					goto out1;
				}
				nd = nd->nd_next;
				continue;
			}

			/*
			 * Only contact those nodes that are members of
			 * the set.
			 */
			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
				nd = nd->nd_next;
				continue;
			}

			/*
			 * Tell the remote node to remove this node
			 */
			if (clnt_delhosts(nd->nd_nodename, sp, 1, &thishost,
			    ep) == -1) {
				/*
				 * If we fail to delete ourselves
				 * from the remote host it does not
				 * really matter because the set is
				 * being "purged" from this node. The
				 * set can be purged from the other
				 * node at a later time.
				 */
				mdclrerror(ep);
			}
			nd = nd->nd_next;
		}
	} else {
		for (i = 0; i < MD_MAXSIDES; i++) {
			/* Skip empty slots */
			if (sd->sd_nodes[i][0] == '\0')
				continue;
			if (strcmp(thishost, sd->sd_nodes[i]) != 0) {
				/*
				 * Tell the remote node to remove this node
				 */
				if (clnt_delhosts(sd->sd_nodes[i], sp, 1,
				    &thishost, ep) == -1) {
					/*
					 * If we fail to delete ourselves
					 * from the remote host it does not
					 * really matter because the set is
					 * being "purged" from this node. The
					 * set can be purged from the other
					 * node at a later time.
					 */
					mdclrerror(ep);
				}
				continue;
			}

			/* remove the set from this host */
			if (clnt_delset(sd->sd_nodes[i], sp, ep) == -1) {
				md_perror(dgettext(TEXT_DOMAIN, "delset"));
				if (!bypass_cluster && num_hosts == 1)
					(void) sdssc_delete_end(sp->setname,
					    SDSSC_CLEANUP);
				mdclrerror(ep);
				goto out1;
			}
		}
	}

	if (!bypass_cluster && num_hosts == 1) {
		if (delete_end && sdssc_delete_end(sp->setname, SDSSC_COMMIT) ==
		    SDSSC_ERROR) {
			rval = 4;
		}
	}

out1:

	cl_sk = cl_get_setkey(sp->setno, sp->setname);

	/*
	 * Remove the set lock on those nodes that had the set locked
	 * max_node will either be MD_MAXSIDES or array index of the last
	 * node contacted (or rather failed to contact) for traditional
	 * diskset.  For a MN diskset, max_node is the node_id of the node
	 * that failed the lock.
	 *
	 * NOTE(review): the MN lock loop above also locks the purging node
	 * when it is not ALIVE and skips nodes without the set, while the
	 * unlock loop below filters on MD_MN_NODE_ALIVE only - confirm the
	 * two filters are meant to differ.
	 */
	if (MD_MNSET_DESC(sd)) {
		nd = sd->sd_nodelist;
		while (nd) {
			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
				nd = nd->nd_next;
				continue;
			}
			if (nd->nd_nodeid == max_node)
				break;
			if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) {
				if (forceflg && mdanyrpcerror(&xep)) {
					mdclrerror(&xep);
					nd = nd->nd_next;
					continue;
				}
				/* keep the first error, remember the failure */
				if (rval == 0)
					(void) mdstealerror(ep, &xep);
				rval = 5;
			}
			nd = nd->nd_next;
		}
	} else {
		for (i = 0; i < max_node; i++) {
			/* Skip empty slots */
			if (sd->sd_nodes[i][0] == '\0')
				continue;

			if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, &xep)) {
				if (forceflg && mdanyrpcerror(&xep)) {
					mdclrerror(&xep);
					continue;
				}
				/* keep the first error, remember the failure */
				if (rval == 0)
					(void) mdstealerror(ep, &xep);
				rval = 5;
			}
		}
	}

	cl_set_setkey(NULL);

	return (rval);
}
1776 */ 1777 if (MD_MNSET_DESC(sd)) { 1778 nd = sd->sd_nodelist; 1779 while (nd) { 1780 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 1781 nd = nd->nd_next; 1782 continue; 1783 } 1784 if (nd->nd_nodeid == max_node) 1785 break; 1786 if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) { 1787 if (forceflg && mdanyrpcerror(&xep)) { 1788 mdclrerror(&xep); 1789 nd = nd->nd_next; 1790 continue; 1791 } 1792 if (rval == 0) 1793 (void) mdstealerror(ep, &xep); 1794 rval = 5; 1795 } 1796 nd = nd->nd_next; 1797 } 1798 } else { 1799 for (i = 0; i < max_node; i++) { 1800 /* Skip empty slots */ 1801 if (sd->sd_nodes[i][0] == '\0') 1802 continue; 1803 1804 if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, &xep)) { 1805 if (forceflg && mdanyrpcerror(&xep)) { 1806 mdclrerror(&xep); 1807 continue; 1808 } 1809 if (rval == 0) 1810 (void) mdstealerror(ep, &xep); 1811 rval = 5; 1812 } 1813 } 1814 } 1815 1816 cl_set_setkey(NULL); 1817 1818 return (rval); 1819 } 1820 1821 int 1822 meta_set_query( 1823 mdsetname_t *sp, 1824 mddb_dtag_lst_t **dtlpp, 1825 md_error_t *ep 1826 ) 1827 { 1828 mddb_dtag_get_parm_t dtgp; 1829 1830 (void) memset(&dtgp, '\0', sizeof (mddb_dtag_get_parm_t)); 1831 dtgp.dtgp_setno = sp->setno; 1832 1833 /*CONSTCOND*/ 1834 while (1) { 1835 if (metaioctl(MD_MED_GET_TAG, &dtgp, &dtgp.dtgp_mde, NULL) != 0) 1836 if (! 
mdismddberror(&dtgp.dtgp_mde, MDE_DB_NOTAG) || 1837 *dtlpp == NULL) 1838 return (mdstealerror(ep, &dtgp.dtgp_mde)); 1839 else 1840 break; 1841 1842 /* 1843 * Run to the end of the list 1844 */ 1845 for (/* void */; (*dtlpp != NULL); dtlpp = &(*dtlpp)->dtl_nx) 1846 /* void */; 1847 1848 *dtlpp = Zalloc(sizeof (mddb_dtag_lst_t)); 1849 1850 (void) memmove(&(*dtlpp)->dtl_dt, &dtgp.dtgp_dt, 1851 sizeof (mddb_dtag_t)); 1852 1853 dtgp.dtgp_dt.dt_id++; 1854 } 1855 return (0); 1856 } 1857 1858 /* 1859 * return drivename get by key 1860 */ 1861 mddrivename_t * 1862 metadrivename_withdrkey( 1863 mdsetname_t *sp, 1864 side_t sideno, 1865 mdkey_t key, 1866 int flags, 1867 md_error_t *ep 1868 ) 1869 { 1870 char *nm; 1871 mdname_t *np; 1872 mddrivename_t *dnp; 1873 ddi_devid_t devidp; 1874 md_set_desc *sd; 1875 1876 if ((sd = metaget_setdesc(sp, ep)) == NULL) { 1877 return (NULL); 1878 } 1879 1880 1881 /* 1882 * Get the devid associated with the key. 1883 * 1884 * If a devid was returned, it MUST be valid even in 1885 * the case where a device id has been "updated". The 1886 * "update" of the device id may have occured due to 1887 * a firmware upgrade. 1888 */ 1889 if ((devidp = meta_getdidbykey(MD_LOCAL_SET, sideno+SKEW, key, ep)) 1890 != NULL) { 1891 /* 1892 * Look for the correct dnp using the devid for comparison. 1893 */ 1894 dnp = meta_getdnp_bydevid(sp, sideno, devidp, key, ep); 1895 free(devidp); 1896 dnp->side_names_key = key; 1897 } else { 1898 /* 1899 * We didn't get a devid. We'll try for a dnp using the 1900 * name. If we have a MN diskset or if the dnp is a did 1901 * device, we're done because then we don't have devids. 1902 * Otherwise we'll try to set the devid 1903 * and get the dnp via devid again. 1904 * We also need to clear the ep structure. When the 1905 * above call to meta_getdidbykey returned a null, it 1906 * also put an error code into ep. In this case, the null 1907 * return is actually OK and any errors can be ignored. 
The 1908 * reason it is OK is because this could be a MN set or 1909 * we could be running without devids (ex cluster). 1910 */ 1911 mdclrerror(ep); 1912 1913 if ((nm = meta_getnmbykey(MD_LOCAL_SET, sideno, key, 1914 ep)) == NULL) 1915 return (NULL); 1916 /* get device name */ 1917 if (flags & PRINT_FAST) { 1918 if ((np = metaname_fast(&sp, nm, 1919 LOGICAL_DEVICE, ep)) == NULL) { 1920 Free(nm); 1921 return (NULL); 1922 } 1923 } else { 1924 if ((np = metaname(&sp, nm, LOGICAL_DEVICE, 1925 ep)) == NULL) { 1926 Free(nm); 1927 return (NULL); 1928 } 1929 } 1930 Free(nm); 1931 /* make sure it's OK */ 1932 if ((! (flags & MD_BASICNAME_OK)) && (metachkcomp(np, 1933 ep) != 0)) 1934 return (NULL); 1935 1936 /* get drivename */ 1937 dnp = np->drivenamep; 1938 dnp->side_names_key = key; 1939 /* 1940 * Skip the devid set/check for the following cases: 1941 * 1) If MN diskset, there are no devid's 1942 * 2) if dnp is did device 1943 * The device id is disabled for did device due to the 1944 * lack of minor name support in the did driver. The following 1945 * devid code path can set and propagate the error and 1946 * eventually prevent did disks from being added to the 1947 * diskset under SunCluster systems 1948 * 1949 * Note that this code can be called through rpc.mdcommd. 1950 * sdssc_version cannot be used because the library won't 1951 * be bound. 1952 */ 1953 if ((strncmp(dnp->rname, "/dev/did/", strlen("/dev/did/")) 1954 == 0) || (MD_MNSET_DESC(sd))) 1955 goto out; 1956 1957 /* 1958 * It is okay if replica is not in devid mode 1959 */ 1960 if (mdissyserror(ep, MDDB_F_NODEVID)) { 1961 mdclrerror(ep); 1962 goto out; 1963 } 1964 1965 /* 1966 * We're not MN or did devices but 1967 * devid is missing so this means that we have 1968 * just upgraded from a configuration where 1969 * devid's were not used so try to add in 1970 * the devid and requery. If the devid still isn't there, 1971 * that's OK. dnp->devid will be null as it is in any 1972 * configuration with no devids. 
1973 */ 1974 if (meta_setdid(MD_LOCAL_SET, sideno + SKEW, key, ep) < 0) 1975 return (NULL); 1976 if ((devidp = (ddi_devid_t)meta_getdidbykey(MD_LOCAL_SET, 1977 sideno+SKEW, key, ep)) != NULL) { 1978 /* 1979 * Found a devid so look for the dnp using the 1980 * devid as the search mechanism. 1981 */ 1982 dnp = meta_getdnp_bydevid(sp, sideno, devidp, key, ep); 1983 free(devidp); 1984 dnp->side_names_key = key; 1985 } 1986 } 1987 1988 1989 1990 out: 1991 if (flags & MD_BYPASS_DAEMON) 1992 return (dnp); 1993 1994 if (get_sidenmlist(sp, dnp, ep)) 1995 return (NULL); 1996 1997 /* return success */ 1998 return (dnp); 1999 } 2000 2001 void 2002 metafreedrivedesc(md_drive_desc **dd) 2003 { 2004 md_drive_desc *p, *next = NULL; 2005 2006 for (p = *dd; p != NULL; p = next) { 2007 next = p->dd_next; 2008 Free(p); 2009 } 2010 *dd = NULL; 2011 } 2012 2013 md_drive_desc * 2014 metaget_drivedesc( 2015 mdsetname_t *sp, 2016 int flags, 2017 md_error_t *ep 2018 ) 2019 { 2020 side_t sideno = MD_SIDEWILD; 2021 2022 assert(! (flags & MD_BYPASS_DAEMON)); 2023 2024 if ((sideno = getmyside(sp, ep)) == MD_SIDEWILD) 2025 return (NULL); 2026 2027 return (metaget_drivedesc_sideno(sp, sideno, flags, ep)); 2028 } 2029 2030 md_drive_desc * 2031 metaget_drivedesc_fromnamelist( 2032 mdsetname_t *sp, 2033 mdnamelist_t *nlp, 2034 md_error_t *ep 2035 ) 2036 { 2037 md_set_desc *sd; 2038 mdnamelist_t *p; 2039 md_drive_desc *dd = NULL; 2040 2041 if ((sd = metaget_setdesc(sp, ep)) == NULL) 2042 return (NULL); 2043 2044 for (p = nlp; p != NULL; p = p->next) 2045 (void) metadrivedesc_append(&dd, p->namep->drivenamep, 0, 0, 2046 sd->sd_ctime, sd->sd_genid, MD_DR_ADD); 2047 2048 return (dd); 2049 } 2050 2051 md_drive_desc * 2052 metaget_drivedesc_sideno( 2053 mdsetname_t *sp, 2054 side_t sideno, 2055 int flags, 2056 md_error_t *ep 2057 ) 2058 { 2059 md_set_desc *sd = NULL; 2060 2061 assert(! 
(flags & MD_BYPASS_DAEMON)); 2062 2063 if ((sd = metaget_setdesc(sp, ep)) == NULL) 2064 return (NULL); 2065 2066 if (sd->sd_drvs) 2067 return (sd->sd_drvs); 2068 2069 if ((sd->sd_drvs = dr2drivedesc(sp, sideno, flags, ep)) == NULL) 2070 return (NULL); 2071 2072 return (sd->sd_drvs); 2073 } 2074 2075 int 2076 metaget_setownership( 2077 mdsetname_t *sp, 2078 md_error_t *ep 2079 ) 2080 { 2081 md_set_desc *sd; 2082 int bool; 2083 int i; 2084 md_mnnode_desc *nd; 2085 2086 if ((sd = metaget_setdesc(sp, ep)) == NULL) 2087 return (-1); 2088 2089 if (MD_MNSET_DESC(sd)) { 2090 nd = sd->sd_nodelist; 2091 while (nd) { 2092 /* If node isn't alive, can't own diskset */ 2093 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 2094 nd->nd_flags &= ~MD_MN_NODE_OWN; 2095 nd = nd->nd_next; 2096 continue; 2097 } 2098 /* 2099 * If can't communicate with rpc.metad, then mark 2100 * this node as not an owner. That node may 2101 * in fact, be an owner, but without rpc.metad running 2102 * that node can't do much. 2103 */ 2104 if (clnt_ownset(nd->nd_nodename, sp, &bool, ep) == -1) { 2105 nd->nd_flags &= ~MD_MN_NODE_OWN; 2106 } else if (bool == TRUE) { 2107 nd->nd_flags |= MD_MN_NODE_OWN; 2108 } else { 2109 nd->nd_flags &= ~MD_MN_NODE_OWN; 2110 } 2111 nd = nd->nd_next; 2112 } 2113 return (0); 2114 } 2115 2116 /* Rest of code handles traditional disksets */ 2117 2118 for (i = 0; i < MD_MAXSIDES; i++) 2119 sd->sd_isown[i] = 0; 2120 2121 if (clnt_ownset(mynode(), sp, &bool, ep) == -1) 2122 return (-1); 2123 2124 if (bool == TRUE) 2125 sd->sd_isown[getmyside(sp, ep)] = 1; 2126 2127 return (0); 2128 } 2129 2130 char * 2131 mynode(void) 2132 { 2133 static struct utsname myuname; 2134 static int done = 0; 2135 2136 if (! 
done) { 2137 if (uname(&myuname) == -1) { 2138 md_perror(dgettext(TEXT_DOMAIN, "uname")); 2139 assert(0); 2140 } 2141 done = 1; 2142 } 2143 return (myuname.nodename); 2144 } 2145 2146 int 2147 strinlst(char *str, int cnt, char **lst) 2148 { 2149 int i; 2150 2151 for (i = 0; i < cnt; i++) 2152 if (strcmp(lst[i], str) == 0) 2153 return (TRUE); 2154 2155 return (FALSE); 2156 } 2157 2158 /* 2159 * meta_get_reserved_names 2160 * returns an mdnamelist_t of reserved slices 2161 * reserved slices are those that are used but don't necessarily 2162 * show up as metadevices (ex. reserved slice for db in sets, logs) 2163 */ 2164 2165 /*ARGSUSED*/ 2166 int 2167 meta_get_reserved_names( 2168 mdsetname_t *sp, 2169 mdnamelist_t **nlpp, 2170 int options, 2171 md_error_t *ep) 2172 { 2173 int count = 0; 2174 mdname_t *np = NULL; 2175 mdnamelist_t *transnlp = NULL; 2176 mdnamelist_t **tailpp = nlpp; 2177 mdnamelist_t *nlp; 2178 md_drive_desc *dd, *di; 2179 2180 if (metaislocalset(sp)) 2181 goto out; 2182 2183 if (!(dd = metaget_drivedesc(sp, MD_BASICNAME_OK, ep)) && !mdisok(ep)) { 2184 count = -1; 2185 goto out; 2186 } 2187 2188 /* db in for sets on reserved slice */ 2189 for (di = dd; di && count >= 0; di = di->dd_next) { 2190 uint_t rep_slice; 2191 2192 /* 2193 * Add the name struct to the end of the 2194 * namelist but keep a pointer to the last 2195 * element so that we don't incur the overhead 2196 * of traversing the list each time 2197 */ 2198 if (di->dd_dnp && 2199 (meta_replicaslice(di->dd_dnp, &rep_slice, ep) == 0) && 2200 (np = metaslicename(di->dd_dnp, rep_slice, ep)) && 2201 (tailpp = meta_namelist_append_wrapper(tailpp, np))) 2202 count++; 2203 else 2204 count = -1; 2205 } 2206 2207 /* now find logs */ 2208 if (meta_get_trans_names(sp, &transnlp, 0, ep) < 0) { 2209 count = -1; 2210 goto out; 2211 } 2212 2213 for (nlp = transnlp; (nlp != NULL); nlp = nlp->next) { 2214 mdname_t *transnp = nlp->namep; 2215 md_trans_t *transp; 2216 2217 if ((transp = meta_get_trans(sp, 
transnp, ep)) == NULL) { 2218 count = -1; 2219 goto out; 2220 } 2221 if (transp->lognamep) { 2222 /* 2223 * Add the name struct to the end of the 2224 * namelist but keep a pointer to the last 2225 * element so that we don't incur the overhead 2226 * of traversing the list each time 2227 */ 2228 tailpp = meta_namelist_append_wrapper( 2229 tailpp, transp->lognamep); 2230 } 2231 } 2232 out: 2233 metafreenamelist(transnlp); 2234 return (count); 2235 } 2236 2237 /* 2238 * Entry point to join a node to MultiNode diskset. 2239 * 2240 * Validate host in diskset. 2241 * - Should be in membership list from API 2242 * - Should not already be joined into diskset. 2243 * - Set must have drives 2244 * Assume valid configuration is stored in the set/drive/node records 2245 * in the local mddb since no node or drive can be added to the MNset 2246 * unless all drives and nodes are available. Reconfig steps will 2247 * resync all ALIVE nodes in case of panic in critical areas. 2248 * 2249 * Lock down the set. 2250 * Verify host is a member of this diskset. 2251 * If drives exist in the configuration, load the mddbs. 2252 * Set this node to active by notifying master if one exists. 2253 * If this is the first node active in the diskset, this node 2254 * becomes the master. 2255 * Unlock the set. 2256 * 2257 * Mirror Resync: 2258 * If this node is the last node to join the set and clustering 2259 * isn't running, then start the 'metasync -r' type resync 2260 * on all mirrors in this diskset. 2261 * If clustering is running, this resync operation will 2262 * be handled by the reconfig steps and should NOT 2263 * be handled during a join operation. 2264 * 2265 * There are multiple return values in order to assist 2266 * the join operation of all sets in the metaset command. 2267 * 2268 * Return values: 2269 * 0 - Node successfully joined to set. 
2270 * -1 - Join attempted but failed 2271 * - any failure from libmeta calls 2272 * - node not in the member list 2273 * -2 - Join not attempted since 2274 * - this set had no drives in set 2275 * - this node already joined to set 2276 * - set is not a multinode set 2277 * -3 - Node joined to STALE set. 2278 */ 2279 extern int 2280 meta_set_join( 2281 mdsetname_t *sp, 2282 md_error_t *ep 2283 ) 2284 { 2285 md_set_desc *sd; 2286 md_drive_desc *dd; 2287 md_mnnode_desc *nd, *nd2, my_nd; 2288 int rval = 0; 2289 md_setkey_t *cl_sk; 2290 md_error_t xep = mdnullerror; 2291 md_error_t ep_snarf = mdnullerror; 2292 int master_flag = 0; 2293 md_mnset_record *mas_mnsr = NULL; 2294 int clear_nr_flags = 0; 2295 md_mnnode_record *nr; 2296 int stale_set = 0; 2297 int rb_flags = 0; 2298 int stale_bool = FALSE; 2299 int suspendall_flag = 0; 2300 int suspend1_flag = 0; 2301 sigset_t oldsigs; 2302 int send_reinit = 0; 2303 2304 if ((sd = metaget_setdesc(sp, ep)) == NULL) { 2305 return (-1); 2306 } 2307 2308 /* Must be a multinode diskset */ 2309 if (!MD_MNSET_DESC(sd)) { 2310 (void) mderror(ep, MDE_NOT_MN, sp->setname); 2311 return (-2); 2312 } 2313 2314 /* Verify that the node is ALIVE (i.e. is in the API membership list) */ 2315 if (!(sd->sd_mn_mynode->nd_flags & MD_MN_NODE_ALIVE)) { 2316 (void) mddserror(ep, MDE_DS_NOTINMEMBERLIST, sp->setno, 2317 sd->sd_mn_mynode->nd_nodename, NULL, sp->setname); 2318 return (-1); 2319 } 2320 2321 /* Make sure we are blocking all signals */ 2322 if (procsigs(TRUE, &oldsigs, &xep) < 0) 2323 mdclrerror(&xep); 2324 2325 /* 2326 * Lock the set on current set members. 2327 * For MN diskset lock_set and SUSPEND are used to protect against 2328 * other meta* commands running on the other nodes. 
2329 */ 2330 nd = sd->sd_nodelist; 2331 while (nd) { 2332 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 2333 nd = nd->nd_next; 2334 continue; 2335 } 2336 if (clnt_lock_set(nd->nd_nodename, sp, ep)) { 2337 rval = -1; 2338 goto out; 2339 } 2340 nd = nd->nd_next; 2341 } 2342 2343 /* 2344 * Lock out other meta* commands by suspending 2345 * class 1 messages across the diskset. 2346 */ 2347 nd = sd->sd_nodelist; 2348 while (nd) { 2349 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 2350 nd = nd->nd_next; 2351 continue; 2352 } 2353 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND, 2354 sp, MD_MSG_CLASS1, MD_MSCF_NO_FLAGS, ep)) { 2355 rval = -1; 2356 goto out; 2357 } 2358 suspend1_flag = 1; 2359 nd = nd->nd_next; 2360 } 2361 2362 /* 2363 * Verify that this host is a member (in the host list) of the set. 2364 */ 2365 nd = sd->sd_nodelist; 2366 while (nd) { 2367 if (strcmp(mynode(), nd->nd_nodename) == 0) { 2368 break; 2369 } 2370 nd = nd->nd_next; 2371 } 2372 if (!nd) { 2373 (void) mddserror(ep, MDE_DS_NODENOTINSET, sp->setno, 2374 sd->sd_mn_mynode->nd_nodename, NULL, 2375 sp->setname); 2376 rval = -1; 2377 goto out; 2378 } 2379 2380 /* 2381 * Need to return failure if host is already 'joined' 2382 * into the set. This is done so that if later the user 2383 * issues a command to join all sets and a failure is 2384 * encountered - that the resulting cleanup effort 2385 * (withdrawing from all sets that were joined 2386 * during that command) won't withdraw from this set. 2387 */ 2388 if (nd->nd_flags & MD_MN_NODE_OWN) { 2389 rval = -2; 2390 goto out2; 2391 } 2392 2393 /* 2394 * Call metaget_setownership that calls each node in diskset and 2395 * marks in set descriptor if node is an owner of the set or not. 2396 * metaget_setownership checks to see if a node is an owner by 2397 * checking to see if that node's kernel has the mddb loaded. 
2398 * If a node had panic'd during a reconfig or an 2399 * add/delete/join/withdraw operation, the other nodes' node 2400 * records may not reflect the current state of the diskset, 2401 * so calling metaget_setownership is the safest thing to do. 2402 */ 2403 if (metaget_setownership(sp, ep) == -1) { 2404 rval = -1; 2405 goto out; 2406 } 2407 2408 /* If first active member of diskset, become the master. */ 2409 nd = sd->sd_nodelist; 2410 while (nd) { 2411 if (nd->nd_flags & MD_MN_NODE_OWN) 2412 break; 2413 nd = nd->nd_next; 2414 } 2415 if (nd == NULL) 2416 master_flag = 1; 2417 2418 /* 2419 * If not first active member of diskset, then get the 2420 * master information from a node that is already joined 2421 * and set the master information for this node. Be sure 2422 * that this node (the already joined node) has its own 2423 * join flag set. If not, then this diskset isn't currently 2424 * consistent and shouldn't allow a node to join. This diskset 2425 * inconsistency should only occur when a node has panic'd in 2426 * the set while doing a metaset operation and the sysadmin is 2427 * attempting to join a node into the set. This inconsistency 2428 * will be fixed during a reconfig cycle which should be occurring 2429 * soon since a node panic'd. 2430 * 2431 * If unable to get this information from an owning node, then 2432 * this diskset isn't currently consistent and shouldn't 2433 * allow a node to join. 
2434 */ 2435 if (!master_flag) { 2436 /* get master information from an owner (joined) node */ 2437 if (clnt_mngetset(nd->nd_nodename, sp->setname, 2438 sp->setno, &mas_mnsr, ep) == -1) { 2439 rval = -1; 2440 goto out; 2441 } 2442 2443 /* Verify that owner (joined) node has its own JOIN flag set */ 2444 nr = mas_mnsr->sr_nodechain; 2445 while (nr) { 2446 if ((nd->nd_nodeid == nr->nr_nodeid) && 2447 ((nr->nr_flags & MD_MN_NODE_OWN) == NULL)) { 2448 (void) mddserror(ep, MDE_DS_NODENOSET, 2449 sp->setno, nd->nd_nodename, NULL, 2450 nd->nd_nodename); 2451 free_sr((md_set_record *)mas_mnsr); 2452 rval = -1; 2453 goto out; 2454 } 2455 nr = nr->nr_next; 2456 } 2457 2458 /* 2459 * Does master have set marked as STALE? 2460 * If so, need to pass this down to kernel when 2461 * this node snarfs the set. 2462 */ 2463 if (clnt_mn_is_stale(nd->nd_nodename, sp, 2464 &stale_bool, ep) == -1) { 2465 rval = -1; 2466 goto out; 2467 } 2468 2469 /* set master information in my rpc.metad's set record */ 2470 if (clnt_mnsetmaster(mynode(), sp, mas_mnsr->sr_master_nodenm, 2471 mas_mnsr->sr_master_nodeid, ep)) { 2472 free_sr((md_set_record *)mas_mnsr); 2473 rval = -1; 2474 goto out; 2475 } 2476 2477 /* set master information in my cached set desc */ 2478 (void) strcpy(sd->sd_mn_master_nodenm, 2479 mas_mnsr->sr_master_nodenm); 2480 sd->sd_mn_master_nodeid = mas_mnsr->sr_master_nodeid; 2481 nd2 = sd->sd_nodelist; 2482 while (nd2) { 2483 if (nd2->nd_nodeid == mas_mnsr->sr_master_nodeid) { 2484 sd->sd_mn_masternode = nd2; 2485 break; 2486 } 2487 nd2 = nd2->nd_next; 2488 } 2489 free_sr((md_set_record *)mas_mnsr); 2490 2491 /* 2492 * Set the node flags in mynode's rpc.metad node records for 2493 * the nodes that are in the diskset. Can use my sd 2494 * since earlier call to metaget_setownership set the 2495 * owner flags based on whether that node had snarfed 2496 * the MN diskset mddb. 
Reconfig steps guarantee that 2497 * return of metaget_setownership will match the owning 2498 * node's owner list except in the case where a node 2499 * has just panic'd and in this case, a reconfig will 2500 * be starting immediately and the owner lists will 2501 * be sync'd up by the reconfig. 2502 * 2503 * Flag of SET means to take no action except to 2504 * set the node flags as given in the nodelist linked list. 2505 */ 2506 if (clnt_upd_nr_flags(mynode(), sp, sd->sd_nodelist, 2507 MD_NR_SET, NULL, ep)) { 2508 rval = -1; 2509 goto out; 2510 } 2511 } 2512 2513 /* 2514 * Read in the mddb if there are drives in the set. 2515 */ 2516 if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST), 2517 ep)) == NULL) { 2518 /* No drives in list */ 2519 if (! mdisok(ep)) { 2520 rval = -1; 2521 goto out; 2522 } 2523 rval = -2; 2524 goto out; 2525 } 2526 2527 /* 2528 * Notify rpc.mdcommd on all nodes of a nodelist change. 2529 * Start by suspending rpc.mdcommd (which drains it of all messages), 2530 * then change the nodelist followed by a reinit and resume. 2531 */ 2532 nd = sd->sd_nodelist; 2533 while (nd) { 2534 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 2535 nd = nd->nd_next; 2536 continue; 2537 } 2538 2539 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND, sp, 2540 MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, ep)) { 2541 rval = -1; 2542 goto out; 2543 } 2544 suspendall_flag = 1; 2545 nd = nd->nd_next; 2546 } 2547 2548 /* Set master in my set record in rpc.metad */ 2549 if (master_flag) { 2550 if (clnt_mnsetmaster(mynode(), sp, 2551 sd->sd_mn_mynode->nd_nodename, 2552 sd->sd_mn_mynode->nd_nodeid, ep)) { 2553 rval = -1; 2554 goto out; 2555 } 2556 } 2557 /* 2558 * Causes mddbs to be loaded into the kernel. 2559 * Set the force flag so that replica locations can be 2560 * loaded into the kernel even if a mediator node was 2561 * unavailable. This allows a node to join an MO 2562 * diskset when there are sufficient replicas available, 2563 * but a mediator node in unavailable. 
2564 */ 2565 if (setup_db_bydd(sp, dd, TRUE, ep) == -1) { 2566 mde_perror(ep, dgettext(TEXT_DOMAIN, 2567 "Host not able to start diskset.")); 2568 rval = -1; 2569 goto out; 2570 } 2571 2572 if (! mdisok(ep)) { 2573 rval = -1; 2574 goto out; 2575 } 2576 2577 /* 2578 * Set rollback flags to 1 so that halt_set is called if a failure 2579 * is seen after this point. If snarf_set fails, still need to 2580 * call halt_set to cleanup the diskset. 2581 */ 2582 rb_flags = 1; 2583 2584 /* Starts the set */ 2585 if (snarf_set(sp, stale_bool, ep) != 0) { 2586 if (mdismddberror(ep, MDE_DB_STALE)) { 2587 /* 2588 * Don't fail join, STALE means that set has 2589 * < 50% mddbs. 2590 */ 2591 (void) mdstealerror(&ep_snarf, ep); 2592 stale_set = 1; 2593 } else if (mdisok(ep)) { 2594 /* If snarf failed, but no error was set - set it */ 2595 (void) mdmddberror(ep, MDE_DB_NOTNOW, (minor_t)NODEV64, 2596 sp->setno, 0, NULL); 2597 rval = -1; 2598 goto out; 2599 } else if (!(mdismddberror(ep, MDE_DB_ACCOK))) { 2600 /* 2601 * Don't fail join if ACCOK; ACCOK means that mediator 2602 * provided extra vote. 2603 */ 2604 rval = -1; 2605 goto out; 2606 } 2607 } 2608 2609 /* Did set really get snarfed? 
*/ 2610 if (own_set(sp, NULL, TRUE, ep) == MD_SETOWNER_NO) { 2611 if (mdisok(ep)) { 2612 /* If snarf failed, but no error was set - set it */ 2613 (void) mdmddberror(ep, MDE_DB_NOTNOW, (minor_t)NODEV64, 2614 sp->setno, 0, NULL); 2615 } 2616 mde_perror(ep, dgettext(TEXT_DOMAIN, 2617 "Host not able to start diskset.")); 2618 rval = -1; 2619 goto out; 2620 } 2621 2622 /* Change to nodelist so need to send reinit to rpc.mdcommd */ 2623 send_reinit = 1; 2624 2625 /* If first node to enter set, setup master and clear change log */ 2626 if (master_flag) { 2627 /* Set master in my locally cached set descriptor */ 2628 (void) strcpy(sd->sd_mn_master_nodenm, 2629 sd->sd_mn_mynode->nd_nodename); 2630 sd->sd_mn_master_nodeid = sd->sd_mn_mynode->nd_nodeid; 2631 sd->sd_mn_am_i_master = 1; 2632 2633 /* 2634 * If first node to join set, then clear out change log 2635 * entries. Change log entries are only needed when a 2636 * change of master is occurring in a diskset that has 2637 * multiple owners. Since this node is the first owner 2638 * of the diskset, clear the entries. 2639 * 2640 * Only do this if we are in a single node non-SC3.x 2641 * situation. 2642 */ 2643 if (meta_mn_singlenode() && 2644 mdmn_reset_changelog(sp, ep, MDMN_CLF_RESETLOG) != 0) { 2645 mde_perror(ep, dgettext(TEXT_DOMAIN, 2646 "Unable to reset changelog.")); 2647 rval = -1; 2648 goto out; 2649 } 2650 } 2651 2652 /* Set my locally cached flag */ 2653 sd->sd_mn_mynode->nd_flags |= MD_MN_NODE_OWN; 2654 2655 /* 2656 * Set this node's own flag on all joined nodes in the set 2657 * (including my node). 
2658 */ 2659 clear_nr_flags = 1; 2660 2661 my_nd = *(sd->sd_mn_mynode); 2662 my_nd.nd_next = NULL; 2663 nd = sd->sd_nodelist; 2664 while (nd) { 2665 if (!(nd->nd_flags & MD_MN_NODE_OWN)) { 2666 nd = nd->nd_next; 2667 continue; 2668 } 2669 if (clnt_upd_nr_flags(nd->nd_nodename, sp, &my_nd, 2670 MD_NR_JOIN, NULL, ep)) { 2671 rval = -1; 2672 goto out; 2673 } 2674 nd = nd->nd_next; 2675 } 2676 2677 out: 2678 if (rval != NULL) { 2679 /* 2680 * If rollback flag is 1, then node was joined to set. 2681 * Since an error occurred, withdraw node from set in 2682 * order to rollback to before command was run. 2683 * Need to preserve ep so that calling function can 2684 * get error information. 2685 */ 2686 if (rb_flags == 1) { 2687 if (halt_set(sp, &xep)) { 2688 mdclrerror(&xep); 2689 } 2690 } 2691 2692 /* 2693 * If error, reset master to INVALID. 2694 * Ignore error since (next) first node to successfully join 2695 * will set master on all nodes. 2696 */ 2697 (void) clnt_mnsetmaster(mynode(), sp, "", 2698 MD_MN_INVALID_NID, &xep); 2699 mdclrerror(&xep); 2700 /* Reset master in my locally cached set descriptor */ 2701 sd->sd_mn_master_nodeid = MD_MN_INVALID_NID; 2702 sd->sd_mn_am_i_master = 0; 2703 2704 /* 2705 * If nr flags set on other nodes, reset them. 2706 */ 2707 if (clear_nr_flags) { 2708 nd = sd->sd_nodelist; 2709 while (nd) { 2710 if (!(nd->nd_flags & MD_MN_NODE_OWN)) { 2711 nd = nd->nd_next; 2712 continue; 2713 } 2714 (void) clnt_upd_nr_flags(nd->nd_nodename, sp, 2715 &my_nd, MD_NR_WITHDRAW, NULL, &xep); 2716 mdclrerror(&xep); 2717 nd = nd->nd_next; 2718 } 2719 /* Reset my locally cached flag */ 2720 sd->sd_mn_mynode->nd_flags &= ~MD_MN_NODE_OWN; 2721 } 2722 } 2723 2724 /* 2725 * Notify rpc.mdcommd on all nodes of a nodelist change. 2726 * Send reinit command to mdcommd which forces it to get 2727 * fresh set description. 
2728 */ 2729 if (send_reinit) { 2730 /* Send reinit */ 2731 nd = sd->sd_nodelist; 2732 while (nd) { 2733 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 2734 nd = nd->nd_next; 2735 continue; 2736 } 2737 2738 /* Class is ignored for REINIT */ 2739 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT, 2740 sp, NULL, MD_MSCF_NO_FLAGS, &xep)) { 2741 /* 2742 * We are here because we failed to resume 2743 * rpc.mdcommd. However we potentially have 2744 * an error from the previous call 2745 * If the previous call did fail, we capture 2746 * that error and generate a perror with 2747 * the string, "Unable to resume...". 2748 * Setting rval to -1 ensures that in the 2749 * next iteration of the loop, ep is not 2750 * clobbered. 2751 */ 2752 if (rval == 0) 2753 (void) mdstealerror(ep, &xep); 2754 else 2755 mdclrerror(&xep); 2756 rval = -1; 2757 mde_perror(ep, dgettext(TEXT_DOMAIN, 2758 "Unable to reinit rpc.mdcommd.")); 2759 } 2760 nd = nd->nd_next; 2761 } 2762 2763 } 2764 2765 out2: 2766 /* 2767 * Unlock diskset by resuming messages across the diskset. 2768 * Just resume all classes so that resume is the same whether 2769 * just one class was locked or all classes were locked. 2770 */ 2771 if ((suspend1_flag) || (suspendall_flag)) { 2772 nd = sd->sd_nodelist; 2773 while (nd) { 2774 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 2775 nd = nd->nd_next; 2776 continue; 2777 } 2778 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME, 2779 sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) { 2780 /* 2781 * We are here because we failed to resume 2782 * rpc.mdcommd. However we potentially have 2783 * an error from the previous call 2784 * If the previous call did fail, we capture 2785 * that error and generate a perror with 2786 * the string, "Unable to resume...". 2787 * Setting rval to -1 ensures that in the 2788 * next iteration of the loop, ep is not 2789 * clobbered. 
2790 */ 2791 if (rval == 0) 2792 (void) mdstealerror(ep, &xep); 2793 else 2794 mdclrerror(&xep); 2795 rval = -1; 2796 mde_perror(ep, dgettext(TEXT_DOMAIN, 2797 "Unable to resume rpc.mdcommd.")); 2798 } 2799 nd = nd->nd_next; 2800 } 2801 meta_ping_mnset(sp->setno); 2802 } 2803 2804 /* 2805 * Unlock set. This flushes the caches on the servers. 2806 */ 2807 cl_sk = cl_get_setkey(sp->setno, sp->setname); 2808 nd = sd->sd_nodelist; 2809 while (nd) { 2810 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 2811 nd = nd->nd_next; 2812 continue; 2813 } 2814 if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) { 2815 if (rval == 0) 2816 (void) mdstealerror(ep, &xep); 2817 else 2818 mdclrerror(&xep); 2819 rval = -1; 2820 } 2821 nd = nd->nd_next; 2822 } 2823 2824 /* 2825 * If this node is the last to join the diskset and clustering isn't 2826 * running, then resync the mirrors in the diskset. We have to wait 2827 * until all nodes are joined so that the status gets propagated to 2828 * all of the members of the set. 2829 * Ignore any error from the resync as the join function shouldn't fail 2830 * because the mirror resync had a problem. 2831 * 2832 * Don't start resync if set is stale. 2833 */ 2834 if ((rval == 0) && (sdssc_bind_library() != SDSSC_OKAY) && 2835 (stale_set != 1)) { 2836 nd = sd->sd_nodelist; 2837 while (nd) { 2838 if (!(nd->nd_flags & MD_MN_NODE_OWN)) 2839 break; 2840 nd = nd->nd_next; 2841 } 2842 /* 2843 * nd set to NULL means that we have no nodes in the set that 2844 * haven't joined. In this case we start the resync. 2845 */ 2846 if (nd == NULL) { 2847 (void) meta_mirror_resync_all(sp, 0, &xep); 2848 mdclrerror(&xep); 2849 } 2850 } 2851 2852 /* Update ABR state for all soft partitions */ 2853 (void) meta_sp_update_abr(sp, &xep); 2854 mdclrerror(&xep); 2855 2856 /* 2857 * call metaflushsetnames to reset local cache for master and 2858 * node information. 
2859 */ 2860 metaflushsetname(sp); 2861 2862 /* release signals back to what they were on entry */ 2863 if (procsigs(FALSE, &oldsigs, &xep) < 0) 2864 mdclrerror(&xep); 2865 2866 /* 2867 * If no error and stale_set is set, then set ep back 2868 * to ep from snarf_set call and return -3. If another error 2869 * occurred and rval is not 0, then that error would have 2870 * caused the node to be withdrawn from the set and would 2871 * have set ep to that error information. 2872 */ 2873 if ((rval == 0) && (stale_set)) { 2874 (void) mdstealerror(ep, &ep_snarf); 2875 return (-3); 2876 } 2877 2878 return (rval); 2879 } 2880 2881 /* 2882 * Entry point to withdraw a node from MultiNode diskset. 2883 * 2884 * Validate host in diskset. 2885 * - Should be joined into diskset. 2886 * Assume valid configuration is stored in the set/drive/node records 2887 * in the local mddb since no node or drive can be added to the MNset 2888 * unless all drives and nodes are available. Reconfig steps will 2889 * resync all ALIVE nodes in case of panic in critical areas. 2890 * 2891 * Lock down the set. 2892 * Verify that drives exist in configuration. 2893 * Verify host is a member of this diskset. 2894 * Verify host is an owner of the diskset (host is joined to diskset). 2895 * Only allow withdrawal of master node if master node is the only joined 2896 * in the diskset. 2897 * Halt the diskset on this node. 2898 * Reset Master on this node. 2899 * Updated node flags that this node with withdrawn. 2900 * Unlock the set. 2901 * 2902 * Return values: 2903 * 0 - Node successfully withdrew from set. 
2904 * -1 - Withdrawal attempted but failed 2905 * - any failure from libmeta calls 2906 * - node not in the member list 2907 * -2 - Withdrawal not attempted since 2908 * - this set had no drives in set 2909 * - this node not joined to set 2910 * - set is not a multinode set 2911 */ 2912 extern int 2913 meta_set_withdraw( 2914 mdsetname_t *sp, 2915 md_error_t *ep 2916 ) 2917 { 2918 md_set_desc *sd; 2919 md_drive_desc *dd = 0; 2920 md_mnnode_desc *nd, my_nd; 2921 int rval = 0; 2922 md_setkey_t *cl_sk; 2923 md_error_t xep = mdnullerror; 2924 int set_halted = 0; 2925 int suspendall_flag = 0; 2926 int suspend1_flag = 0; 2927 bool_t stale_bool = FALSE; 2928 mddb_config_t c; 2929 int node_id_list[1]; 2930 sigset_t oldsigs; 2931 int send_reinit = 0; 2932 2933 if ((sd = metaget_setdesc(sp, ep)) == NULL) { 2934 return (-1); 2935 } 2936 2937 /* Must be a multinode diskset */ 2938 if (!MD_MNSET_DESC(sd)) { 2939 (void) mderror(ep, MDE_NOT_MN, sp->setname); 2940 return (-1); 2941 } 2942 2943 /* Make sure we are blocking all signals */ 2944 if (procsigs(TRUE, &oldsigs, &xep) < 0) 2945 mdclrerror(&xep); 2946 2947 /* 2948 * Lock the set on current set members. 2949 * For MN diskset lock_set and SUSPEND are used to protect against 2950 * other meta* commands running on the other nodes. 2951 */ 2952 nd = sd->sd_nodelist; 2953 while (nd) { 2954 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 2955 nd = nd->nd_next; 2956 continue; 2957 } 2958 if (clnt_lock_set(nd->nd_nodename, sp, ep)) { 2959 rval = -1; 2960 goto out; 2961 } 2962 nd = nd->nd_next; 2963 } 2964 /* 2965 * Lock out other meta* commands by suspending 2966 * class 1 messages across the diskset. 
2967 */ 2968 nd = sd->sd_nodelist; 2969 while (nd) { 2970 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 2971 nd = nd->nd_next; 2972 continue; 2973 } 2974 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND, 2975 sp, MD_MSG_CLASS1, MD_MSCF_NO_FLAGS, ep)) { 2976 rval = -1; 2977 goto out; 2978 } 2979 suspend1_flag = 1; 2980 nd = nd->nd_next; 2981 } 2982 2983 /* Get list of drives - needed in case of failure */ 2984 if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST), 2985 ep)) == NULL) { 2986 /* Error getting drives in list */ 2987 if (! mdisok(ep)) { 2988 rval = -1; 2989 goto out2; 2990 } 2991 /* no drives in list */ 2992 rval = -2; 2993 goto out2; 2994 } 2995 2996 /* 2997 * Verify that this host is a member (in the host list) of the set. 2998 */ 2999 nd = sd->sd_nodelist; 3000 while (nd) { 3001 if (strcmp(mynode(), nd->nd_nodename) == 0) { 3002 break; 3003 } 3004 nd = nd->nd_next; 3005 } 3006 if (!nd) { 3007 (void) mddserror(ep, MDE_DS_NODENOTINSET, sp->setno, 3008 sd->sd_mn_mynode->nd_nodename, NULL, 3009 sp->setname); 3010 rval = -1; 3011 goto out2; 3012 } 3013 3014 /* 3015 * Call metaget_setownership that calls each node in diskset and 3016 * marks in set descriptor if node is an owner of the set or not. 3017 * metaget_setownership checks to see if a node is an owner by 3018 * checking to see if that node's kernel has the mddb loaded. 3019 * If a node had panic'd during a reconfig or an 3020 * add/delete/join/withdraw operation, the other nodes' node 3021 * records may not reflect the current state of the diskset, 3022 * so calling metaget_setownership is the safest thing to do. 3023 */ 3024 if (metaget_setownership(sp, ep) == -1) { 3025 rval = -1; 3026 goto out2; 3027 } 3028 3029 /* 3030 * Verify that this node is joined 3031 * to diskset (i.e. is an owner of the diskset). 
3032 */ 3033 if (!(sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN)) { 3034 rval = -2; 3035 goto out2; 3036 } 3037 3038 /* 3039 * For a MN diskset, only withdraw master if it is 3040 * the only joined node. 3041 */ 3042 if (sd->sd_mn_master_nodeid == sd->sd_mn_mynode->nd_nodeid) { 3043 nd = sd->sd_nodelist; 3044 while (nd) { 3045 /* Skip my node since checking for other owners */ 3046 if (nd->nd_nodeid == sd->sd_mn_master_nodeid) { 3047 nd = nd->nd_next; 3048 continue; 3049 } 3050 /* If another owner node if found, error */ 3051 if (nd->nd_flags & MD_MN_NODE_OWN) { 3052 (void) mddserror(ep, MDE_DS_WITHDRAWMASTER, 3053 sp->setno, 3054 sd->sd_mn_mynode->nd_nodename, NULL, 3055 sp->setname); 3056 rval = -1; 3057 goto out2; 3058 } 3059 nd = nd->nd_next; 3060 } 3061 } 3062 3063 /* 3064 * Is current set STALE? 3065 */ 3066 (void) memset(&c, 0, sizeof (c)); 3067 c.c_id = 0; 3068 c.c_setno = sp->setno; 3069 if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) { 3070 (void) mdstealerror(ep, &c.c_mde); 3071 rval = -1; 3072 goto out; 3073 } 3074 if (c.c_flags & MDDB_C_STALE) { 3075 stale_bool = TRUE; 3076 } 3077 3078 /* 3079 * Notify rpc.mdcommd on all nodes of a nodelist change. 3080 * Start by suspending rpc.mdcommd (which drains it of all messages), 3081 * then change the nodelist followed by a reinit and resume. 3082 */ 3083 nd = sd->sd_nodelist; 3084 while (nd) { 3085 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 3086 nd = nd->nd_next; 3087 continue; 3088 } 3089 3090 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND, 3091 sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, ep)) { 3092 rval = -1; 3093 goto out; 3094 } 3095 suspendall_flag = 1; 3096 nd = nd->nd_next; 3097 } 3098 3099 /* 3100 * Withdraw the set - halt set. 3101 * This will fail if any I/O is occuring to any metadevice which 3102 * includes a resync to a mirror metadevice. 3103 */ 3104 set_halted = 1; 3105 if (halt_set(sp, ep)) { 3106 /* Was set actually halted? 
*/ 3107 if (own_set(sp, NULL, TRUE, ep) == MD_SETOWNER_YES) { 3108 set_halted = 0; 3109 } 3110 rval = -1; 3111 goto out; 3112 } 3113 3114 /* Change to nodelist so need to send reinit to rpc.mdcommd */ 3115 send_reinit = 1; 3116 3117 /* Reset master on withdrawn node */ 3118 if (clnt_mnsetmaster(sd->sd_mn_mynode->nd_nodename, sp, "", 3119 MD_MN_INVALID_NID, ep)) { 3120 rval = -1; 3121 goto out; 3122 } 3123 3124 /* Mark my node as withdrawn and send to other nodes */ 3125 nd = sd->sd_nodelist; 3126 my_nd = *(sd->sd_mn_mynode); /* structure copy */ 3127 my_nd.nd_next = NULL; 3128 while (nd) { 3129 if (!(nd->nd_flags & MD_MN_NODE_OWN)) { 3130 nd = nd->nd_next; 3131 continue; 3132 } 3133 if (clnt_upd_nr_flags(nd->nd_nodename, sp, &my_nd, 3134 MD_NR_WITHDRAW, NULL, ep)) { 3135 rval = -1; 3136 goto out; 3137 } 3138 nd = nd->nd_next; 3139 } 3140 3141 /* 3142 * If withdrawn node is a mirror owner, reset mirror owner 3143 * to NULL. If an error occurs, print a warning and continue. 3144 * Don't fail metaset because of mirror owner reset problem since 3145 * next node to grab mirror will resolve this issue. 3146 * Before next node grabs mirrors, metaset will show the withdrawn 3147 * node as owner which is why an attempt to reset the mirror owner 3148 * is made. 
3149 */ 3150 node_id_list[0] = sd->sd_mn_mynode->nd_nodeid; /* Setup my nodeid */ 3151 nd = sd->sd_nodelist; 3152 while (nd) { 3153 if (!(nd->nd_flags & MD_MN_NODE_OWN)) { 3154 nd = nd->nd_next; 3155 continue; 3156 } 3157 if (clnt_reset_mirror_owner(nd->nd_nodename, sp, 3158 1, &node_id_list[0], &xep) == 01) { 3159 mde_perror(&xep, dgettext(TEXT_DOMAIN, 3160 "Unable to reset mirror owner on node %s"), 3161 nd->nd_nodename); 3162 mdclrerror(&xep); 3163 } 3164 nd = nd->nd_next; 3165 } 3166 3167 out: 3168 if (rval == -1) { 3169 /* Rejoin node - Mark node as joined and send to other nodes */ 3170 nd = sd->sd_nodelist; 3171 my_nd = *(sd->sd_mn_mynode); /* structure copy */ 3172 my_nd.nd_next = NULL; 3173 while (nd) { 3174 if (!(nd->nd_flags & MD_MN_NODE_OWN)) { 3175 nd = nd->nd_next; 3176 continue; 3177 } 3178 if (clnt_upd_nr_flags(nd->nd_nodename, sp, &my_nd, 3179 MD_NR_JOIN, NULL, &xep)) { 3180 mdclrerror(&xep); 3181 } 3182 nd = nd->nd_next; 3183 } 3184 3185 /* Set master on withdrawn node */ 3186 if (clnt_mnsetmaster(sd->sd_mn_mynode->nd_nodename, sp, 3187 sd->sd_mn_master_nodenm, 3188 sd->sd_mn_master_nodeid, &xep)) { 3189 mdclrerror(&xep); 3190 } 3191 3192 /* Join set if halt_set had succeeded */ 3193 if (set_halted) { 3194 /* 3195 * Causes mddbs to be loaded into the kernel. 3196 * Set the force flag so that replica locations can be 3197 * loaded into the kernel even if a mediator node was 3198 * unavailable. This allows a node to join an MO 3199 * diskset when there are sufficient replicas available, 3200 * but a mediator node in unavailable. 3201 */ 3202 if (setup_db_bydd(sp, dd, TRUE, &xep) == -1) { 3203 mdclrerror(&xep); 3204 } 3205 /* If set previously stale - make it so at re-join */ 3206 if (snarf_set(sp, stale_bool, &xep) != 0) { 3207 mdclrerror(&xep); 3208 (void) halt_set(sp, &xep); 3209 mdclrerror(&xep); 3210 } 3211 } 3212 } 3213 3214 /* 3215 * Notify rpc.mdcommd on all nodes of a nodelist change. 
3216 * Send reinit command to mdcommd which forces it to get 3217 * fresh set description. 3218 */ 3219 if (send_reinit) { 3220 /* Send reinit */ 3221 nd = sd->sd_nodelist; 3222 while (nd) { 3223 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 3224 nd = nd->nd_next; 3225 continue; 3226 } 3227 3228 /* Class is ignored for REINIT */ 3229 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT, 3230 sp, NULL, MD_MSCF_NO_FLAGS, &xep)) { 3231 /* 3232 * We are here because we failed to resume 3233 * rpc.mdcommd. However we potentially have 3234 * an error from the previous call. 3235 * If the previous call did fail, we 3236 * capture that error and generate a perror 3237 * withthe string, "Unable to resume...". 3238 * Setting rval to -1 ensures that in the 3239 * next iteration of the loop, ep is not 3240 * clobbered. 3241 */ 3242 if (rval == 0) 3243 (void) mdstealerror(ep, &xep); 3244 else 3245 mdclrerror(&xep); 3246 rval = -1; 3247 mde_perror(ep, dgettext(TEXT_DOMAIN, 3248 "Unable to reinit rpc.mdcommd.")); 3249 } 3250 nd = nd->nd_next; 3251 } 3252 } 3253 3254 out2: 3255 /* 3256 * Unlock diskset by resuming messages across the diskset. 3257 * Just resume all classes so that resume is the same whether 3258 * just one class was locked or all classes were locked. 3259 */ 3260 if ((suspend1_flag) || (suspendall_flag)) { 3261 nd = sd->sd_nodelist; 3262 while (nd) { 3263 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 3264 nd = nd->nd_next; 3265 continue; 3266 } 3267 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME, 3268 sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) { 3269 /* 3270 * We are here because we failed to resume 3271 * rpc.mdcommd. However we potentially have 3272 * an error from the previous call 3273 * If the previous call did fail, we capture 3274 * that error and generate a perror with 3275 * the string, "Unable to resume...". 3276 * Setting rval to -1 ensures that in the 3277 * next iteration of the loop, ep is not 3278 * clobbered. 
3279 */ 3280 if (rval == 0) 3281 (void) mdstealerror(ep, &xep); 3282 else 3283 mdclrerror(&xep); 3284 rval = -1; 3285 mde_perror(ep, dgettext(TEXT_DOMAIN, 3286 "Unable to resume rpc.mdcommd.")); 3287 } 3288 nd = nd->nd_next; 3289 } 3290 meta_ping_mnset(sp->setno); 3291 } 3292 3293 /* 3294 * Unlock set. This flushes the caches on the servers. 3295 */ 3296 cl_sk = cl_get_setkey(sp->setno, sp->setname); 3297 nd = sd->sd_nodelist; 3298 while (nd) { 3299 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 3300 nd = nd->nd_next; 3301 continue; 3302 } 3303 if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) { 3304 if (rval == 0) 3305 (void) mdstealerror(ep, &xep); 3306 else 3307 mdclrerror(&xep); 3308 rval = -1; 3309 } 3310 nd = nd->nd_next; 3311 } 3312 3313 /* 3314 * call metaflushsetnames to reset local cache for master and 3315 * node information. 3316 */ 3317 metaflushsetname(sp); 3318 3319 /* release signals back to what they were on entry */ 3320 if (procsigs(FALSE, &oldsigs, &xep) < 0) 3321 mdclrerror(&xep); 3322 3323 return (rval); 3324 3325 } 3326 3327 /* 3328 * Update nodelist with cluster member information. 3329 * A node not in the member list will be marked 3330 * as not ALIVE and not OWN. 3331 * A node in the member list will be marked ALIVE, but 3332 * the OWN bit will not be changed. 3333 * 3334 * If mynode isn't in the membership list, fail causing 3335 * another reconfig cycle to be started since a non-member 3336 * node shouldn't be taking part in the reconfig cycle. 3337 * 3338 * Return values: 3339 * 0 - No problem. 3340 * 1 - Any failure including RPC failure to my node. 3341 */ 3342 int 3343 meta_reconfig_update_nodelist( 3344 mdsetname_t *sp, 3345 mndiskset_membershiplist_t *nl, 3346 md_set_desc *sd, 3347 md_error_t *ep 3348 ) 3349 { 3350 mndiskset_membershiplist_t *nl2; 3351 md_mnnode_desc *nd; 3352 md_error_t xep = mdnullerror; 3353 int rval = 0; 3354 3355 /* 3356 * Walk through nodelist, checking to see if each 3357 * node is in the member list. 
3358 * If node is not a member, reset ALIVE and OWN node flag. 3359 * If node is a member, set ALIVE. 3360 * If mynode's OWN flag gets reset, then halt the diskset on this node. 3361 */ 3362 nd = sd->sd_nodelist; 3363 while (nd) { 3364 nl2 = nl; 3365 while (nl2) { 3366 /* If node is in member list, set ALIVE */ 3367 if (nl2->msl_node_id == nd->nd_nodeid) { 3368 nd->nd_flags |= MD_MN_NODE_ALIVE; 3369 break; 3370 } else { 3371 nl2 = nl2->next; 3372 } 3373 /* node is not in member list, mark !ALIVE and !OWN */ 3374 if (nl2 == NULL) { 3375 /* If node is mynode, then halt set if needed */ 3376 if (strcmp(mynode(), nd->nd_nodename) == 0) { 3377 /* 3378 * This shouldn't happen, but just 3379 * in case... Any node not in the 3380 * membership list should be dead and 3381 * not running reconfig step1. 3382 */ 3383 if (nd->nd_flags & MD_MN_NODE_OWN) { 3384 if (halt_set(sp, &xep)) { 3385 mde_perror(&xep, ""); 3386 mdclrerror(&xep); 3387 } 3388 } 3389 /* 3390 * Return failure since this node 3391 * (mynode) is not in the membership 3392 * list, but process the rest of the 3393 * nodelist first so that rpc.metad 3394 * can be updated with the latest 3395 * membership information. 3396 */ 3397 (void) mddserror(ep, 3398 MDE_DS_NOTINMEMBERLIST, 3399 sp->setno, nd->nd_nodename, NULL, 3400 sp->setname); 3401 rval = 1; 3402 } 3403 nd->nd_flags &= ~MD_MN_NODE_ALIVE; 3404 nd->nd_flags &= ~MD_MN_NODE_OWN; 3405 } 3406 } 3407 nd = nd->nd_next; 3408 } 3409 3410 /* Send this information to rpc.metad */ 3411 if (clnt_upd_nr_flags(mynode(), sp, sd->sd_nodelist, 3412 MD_NR_SET, MNSET_IN_RECONFIG, &xep)) { 3413 /* Return failure if can't send node flags to rpc.metad */ 3414 if (rval == 0) { 3415 (void) mdstealerror(ep, &xep); 3416 rval = 1; 3417 } 3418 } 3419 return (rval); 3420 } 3421 3422 /* 3423 * Choose master determines the master for a diskset. 
3424 * Each node determines the master on its own and 3425 * adds this information to its local rpc.metad nodelist 3426 * and also sends it to the kernel. 3427 * 3428 * Nodelist in set descriptor (sd) is sorted in 3429 * monotonically increasing sequence of nodeid. 3430 * 3431 * Return values: 3432 * 0 - No problem. 3433 * 205 - There was an RPC problem to another node. 3434 * -1 - There was an error. This could be an RPC error to my node. 3435 * This is a catastrophic failure causing node to panic. 3436 */ 3437 int 3438 meta_reconfig_choose_master_for_set( 3439 mdsetname_t *sp, 3440 md_set_desc *sd, 3441 md_error_t *ep 3442 ) 3443 { 3444 int is_owner; 3445 md_mnset_record *mnsr = NULL; 3446 int lowest_alive_nodeid = 0; 3447 uint_t master_nodeid; 3448 md_mnnode_desc *nd, *nd2; 3449 md_mnnode_record *nr; 3450 md_drive_desc *dd; 3451 md_setkey_t *cl_sk; 3452 int rval = 0; 3453 md_error_t xep = mdnullerror; 3454 mddb_setflags_config_t sf; 3455 3456 /* 3457 * Is current node joined to diskset? 3458 * Don't trust flags, really check to see if mddb is snarfed. 3459 */ 3460 if (s_ownset(sp->setno, ep) == MD_SETOWNER_YES) { 3461 /* 3462 * If a node is joined to the diskset, this node checks 3463 * to see if the current master of the diskset is valid and 3464 * is still in the membership list (ALIVE) and is 3465 * still joined (OWN). Need to verify if master is 3466 * really joined - don't trust the flags. (Can trust 3467 * ALIVE since set during earlier part of reconfig cycle.) 3468 * If the current master is valid, still in the membership 3469 * list and joined, then master is not changed on this node. 3470 * Just return. 3471 * 3472 * Verify that nodeid is valid before accessing masternode. 
3473 */ 3474 if ((sd->sd_mn_master_nodeid != MD_MN_INVALID_NID) && 3475 (sd->sd_mn_masternode->nd_flags & MD_MN_NODE_ALIVE)) { 3476 if (clnt_ownset(sd->sd_mn_master_nodenm, sp, 3477 &is_owner, ep) == -1) { 3478 /* If RPC failure to another node return 205 */ 3479 if ((mdanyrpcerror(ep)) && 3480 (sd->sd_mn_mynode->nd_nodeid != 3481 sd->sd_mn_master_nodeid)) { 3482 return (205); 3483 } else { 3484 /* Any other failure */ 3485 return (-1); 3486 } 3487 } else { 3488 if (is_owner == TRUE) { 3489 3490 meta_mc_log(MC_LOG5, dgettext( 3491 TEXT_DOMAIN, "Set %s previous " 3492 "master chosen %s (%d): %s"), 3493 sp->setname, 3494 sd->sd_mn_master_nodenm, 3495 sd->sd_mn_master_nodeid, 3496 meta_print_hrtime(gethrtime() - 3497 start_time)); 3498 3499 /* Previous master is ok - done */ 3500 return (0); 3501 } 3502 } 3503 } 3504 3505 /* 3506 * If current master is no longer in the membership list or 3507 * is no longer joined, then this node uses the following 3508 * algorithm: 3509 * - node calls RPC routine clnt_ownset to get latest 3510 * information on which nodes are owners of diskset. 3511 * clnt_ownset checks on each node to see if its kernel 3512 * has that diskset snarfed. 3513 */ 3514 nd = sd->sd_nodelist; 3515 while (nd) { 3516 /* Don't consider node that isn't in member list */ 3517 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 3518 nd = nd->nd_next; 3519 continue; 3520 } 3521 3522 if (clnt_ownset(nd->nd_nodename, sp, 3523 &is_owner, ep) == -1) { 3524 /* If RPC failure to another node return 205 */ 3525 if ((mdanyrpcerror(ep)) && 3526 (sd->sd_mn_mynode->nd_nodeid != 3527 nd->nd_nodeid)) { 3528 return (205); 3529 } else { 3530 /* Any other failure */ 3531 return (-1); 3532 } 3533 } 3534 3535 /* 3536 * Set owner flag for each node based on whether 3537 * that node really has a diskset mddb snarfed in 3538 * or not. 
3539 */ 3540 if (is_owner == TRUE) 3541 nd->nd_flags |= MD_MN_NODE_OWN; 3542 else 3543 nd->nd_flags &= ~MD_MN_NODE_OWN; 3544 3545 nd = nd->nd_next; 3546 } 3547 3548 /* 3549 * - node walks through nodelist looking for nodes that are 3550 * owners of the diskset that are in the membership list. 3551 * - for each owner, node calls RPC routine clnt_getset to 3552 * see if that node has its node record set to OK. 3553 * - If so, master is chosen to be this owner node. 3554 */ 3555 nd = sd->sd_nodelist; 3556 while (nd) { 3557 /* Don't consider node that isn't in member list */ 3558 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 3559 nd = nd->nd_next; 3560 continue; 3561 } 3562 3563 /* Don't consider a node that isn't an owner */ 3564 if (!(nd->nd_flags & MD_MN_NODE_OWN)) { 3565 nd = nd->nd_next; 3566 continue; 3567 } 3568 3569 /* Does node has its own node record set to OK? */ 3570 if (clnt_mngetset(nd->nd_nodename, sp->setname, 3571 MD_SET_BAD, &mnsr, ep) == -1) { 3572 /* If RPC failure to another node return 205 */ 3573 if ((mdanyrpcerror(ep)) && 3574 (sd->sd_mn_mynode->nd_nodeid != 3575 nd->nd_nodeid)) { 3576 return (205); 3577 } else { 3578 /* Any other failure */ 3579 return (-1); 3580 } 3581 } 3582 nr = mnsr->sr_nodechain; 3583 while (nr) { 3584 if (nd->nd_nodeid == nr->nr_nodeid) { 3585 if (nr->nr_flags & MD_MN_NODE_OK) { 3586 /* Found a master */ 3587 free_sr( 3588 (md_set_record *)mnsr); 3589 goto found_master; 3590 } 3591 } 3592 nr = nr->nr_next; 3593 } 3594 free_sr((md_set_record *)mnsr); 3595 nd = nd->nd_next; 3596 } 3597 3598 /* 3599 * - If no owner node has its own node record on its own node 3600 * set to OK, then this node checks all of the non-owner 3601 * nodes that are in the membership list. 3602 * - for each non-owner, node calls RPC routine clnt_getset to 3603 * see if that node has its node record set to OK. 3604 * - If set doesn't exist, don't choose node for master. 3605 * - If so, master is chosen to be this non-owner node. 
3606 * 3607 */ 3608 nd = sd->sd_nodelist; 3609 while (nd) { 3610 /* Don't consider node that isn't in member list */ 3611 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 3612 nd = nd->nd_next; 3613 continue; 3614 } 3615 3616 /* Only checking non-owner nodes this time around */ 3617 if (nd->nd_flags & MD_MN_NODE_OWN) { 3618 nd = nd->nd_next; 3619 continue; 3620 } 3621 3622 /* Does node has its own node record set to OK? */ 3623 if (clnt_mngetset(nd->nd_nodename, sp->setname, 3624 MD_SET_BAD, &mnsr, ep) == -1) { 3625 /* 3626 * If set doesn't exist on non-owner node, 3627 * don't consider this node for master. 3628 */ 3629 if (mdiserror(ep, MDE_NO_SET)) { 3630 nd = nd->nd_next; 3631 continue; 3632 } else if ((mdanyrpcerror(ep)) && 3633 (sd->sd_mn_mynode->nd_nodeid != 3634 nd->nd_nodeid)) { 3635 /* RPC failure to another node */ 3636 return (205); 3637 } else { 3638 /* Any other failure */ 3639 return (-1); 3640 } 3641 } 3642 nr = mnsr->sr_nodechain; 3643 while (nr) { 3644 if (nd->nd_nodeid == nr->nr_nodeid) { 3645 if (nr->nr_flags & MD_MN_NODE_OK) { 3646 /* Found a master */ 3647 free_sr( 3648 (md_set_record *)mnsr); 3649 goto found_master; 3650 } 3651 } 3652 nr = nr->nr_next; 3653 } 3654 free_sr((md_set_record *)mnsr); 3655 nd = nd->nd_next; 3656 } 3657 3658 /* 3659 * - If no node can be found that has its own node record on 3660 * its node to be set to OK, then all alive nodes 3661 * were in the process of being added to or deleted 3662 * from set. Each alive node will remove all 3663 * information pertaining to this set from its node. 3664 * 3665 * If all nodes in set are ALIVE, then call sdssc end routines 3666 * since set was truly being initially created or destroyed. 3667 */ 3668 goto delete_set; 3669 } else { 3670 3671 /* 3672 * If node is not joined to diskset, then this 3673 * node uses the following algorithm: 3674 * - If unjoined node doesn't have a node record for itself, 3675 * just delete the diskset since diskset was in the 3676 * process of being created. 
3677 * - node needs to find master of diskset before 3678 * reconfig cycle, if a master existed. 3679 * - node calls RPC routine clnt_ownset to get latest 3680 * information on which nodes are owners of diskset. 3681 * clnt_ownset checks on each node to see if its 3682 * kernel has that diskset snarfed. 3683 */ 3684 3685 /* 3686 * Is my node in the set description? 3687 * If not, delete the set from this node. 3688 * sr2setdesc sets sd_mn_mynode pointer to the node 3689 * descriptor for this node if there was a node 3690 * record for this node. 3691 * 3692 */ 3693 if (sd->sd_mn_mynode == NULL) { 3694 goto delete_set; 3695 } 3696 3697 nd = sd->sd_nodelist; 3698 while (nd) { 3699 /* Don't consider node that isn't in member list */ 3700 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 3701 nd = nd->nd_next; 3702 continue; 3703 } 3704 3705 if (clnt_ownset(nd->nd_nodename, sp, 3706 &is_owner, ep) == -1) { 3707 /* If RPC failure to another node return 205 */ 3708 if ((mdanyrpcerror(ep)) && 3709 (sd->sd_mn_mynode->nd_nodeid != 3710 nd->nd_nodeid)) { 3711 return (205); 3712 } else { 3713 /* Any other failure */ 3714 return (-1); 3715 } 3716 } 3717 3718 /* 3719 * Set owner flag for each node based on whether 3720 * that node really has a diskset mddb snarfed in 3721 * or not. 3722 */ 3723 if (is_owner == TRUE) 3724 nd->nd_flags |= MD_MN_NODE_OWN; 3725 else 3726 nd->nd_flags &= ~MD_MN_NODE_OWN; 3727 3728 nd = nd->nd_next; 3729 } 3730 3731 /* 3732 * - node walks through nodelist looking for nodes that 3733 * are owners of the diskset that are in 3734 * the membership list. 3735 * - for each owner, node calls RPC routine clnt_getset to 3736 * see if that node has a master set and to get the 3737 * diskset description. 3738 * - If the owner node has a set description that doesn't 3739 * include the non-joined node in the nodelist, this node 3740 * removes its set description of that diskset 3741 * (i.e. removes the set from its local mddbs). 
This is 3742 * handling the case of when a node was removed from a 3743 * diskset while it was not in the cluster membership 3744 * list. 3745 * - If that node has a master set and the master is in the 3746 * membership list and is an owner, then either this was 3747 * the master from before the reconfig cycle or this 3748 * node has already chosen a new master - either way, 3749 * the master value is valid as long as it is in the 3750 * membership list and is an owner 3751 * - master is chosen to be owner node's master 3752 */ 3753 nd = sd->sd_nodelist; 3754 while (nd) { 3755 /* Don't consider node that isn't in member list */ 3756 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 3757 nd = nd->nd_next; 3758 continue; 3759 } 3760 3761 /* Don't consider a node that isn't an owner */ 3762 if (!(nd->nd_flags & MD_MN_NODE_OWN)) { 3763 nd = nd->nd_next; 3764 continue; 3765 } 3766 3767 /* Get owner node's set record */ 3768 if (clnt_mngetset(nd->nd_nodename, sp->setname, 3769 MD_SET_BAD, &mnsr, ep) == -1) { 3770 /* If RPC failure to another node return 205 */ 3771 if ((mdanyrpcerror(ep)) && 3772 (sd->sd_mn_mynode->nd_nodeid != 3773 nd->nd_nodeid)) { 3774 return (205); 3775 } else { 3776 /* Any other failure */ 3777 return (-1); 3778 } 3779 } 3780 3781 /* Is this node in the owner node's set record */ 3782 nr = mnsr->sr_nodechain; 3783 while (nr) { 3784 if (sd->sd_mn_mynode->nd_nodeid == 3785 nr->nr_nodeid) { 3786 break; 3787 } 3788 nr = nr->nr_next; 3789 } 3790 if (nr == NULL) { 3791 /* my node not found - delete set */ 3792 free_sr((md_set_record *)mnsr); 3793 goto delete_set; 3794 } 3795 3796 /* Is owner's node's master valid? 
*/ 3797 master_nodeid = mnsr->sr_master_nodeid; 3798 free_sr((md_set_record *)mnsr); 3799 if (master_nodeid == MD_MN_INVALID_NID) { 3800 nd = nd->nd_next; 3801 continue; 3802 } 3803 3804 nd2 = sd->sd_nodelist; 3805 while (nd2) { 3806 if ((nd2->nd_nodeid == master_nodeid) && 3807 (nd2->nd_flags & MD_MN_NODE_ALIVE) && 3808 (nd2->nd_flags & MD_MN_NODE_OWN)) { 3809 nd = nd2; 3810 goto found_master; 3811 } 3812 nd2 = nd2->nd_next; 3813 } 3814 nd = nd->nd_next; 3815 } 3816 3817 /* 3818 * - If no owner node has a valid master, then follow 3819 * algorithm of when a node is joined to the diskset. 3820 * - node walks through nodelist looking for nodes that are 3821 * owners of the diskset that are in the membership list. 3822 * - for each owner, node calls RPC routine clnt_getset to 3823 * see if that node has its node record set to OK. 3824 * - If so, master is chosen to be this owner node. 3825 */ 3826 nd = sd->sd_nodelist; 3827 while (nd) { 3828 /* Don't consider node that isn't in member list */ 3829 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 3830 nd = nd->nd_next; 3831 continue; 3832 } 3833 3834 /* Don't consider a node that isn't an owner */ 3835 if (!(nd->nd_flags & MD_MN_NODE_OWN)) { 3836 nd = nd->nd_next; 3837 continue; 3838 } 3839 3840 /* Does node has its own node record set to OK? 
*/ 3841 if (clnt_mngetset(nd->nd_nodename, sp->setname, 3842 MD_SET_BAD, &mnsr, ep) == -1) { 3843 /* If RPC failure to another node return 205 */ 3844 if ((mdanyrpcerror(ep)) && 3845 (sd->sd_mn_mynode->nd_nodeid != 3846 nd->nd_nodeid)) { 3847 return (205); 3848 } else { 3849 /* Any other failure */ 3850 return (-1); 3851 } 3852 } 3853 nr = mnsr->sr_nodechain; 3854 while (nr) { 3855 if (nd->nd_nodeid == nr->nr_nodeid) { 3856 if (nr->nr_flags & MD_MN_NODE_OK) { 3857 /* Found a master */ 3858 free_sr( 3859 (md_set_record *)mnsr); 3860 goto found_master; 3861 } 3862 } 3863 nr = nr->nr_next; 3864 } 3865 free_sr((md_set_record *)mnsr); 3866 nd = nd->nd_next; 3867 } 3868 3869 /* 3870 * - If no owner node has its own node record on its own node 3871 * set to OK, then this node checks all of the non-owner 3872 * nodes that are in the membership list. 3873 * - for each non-owner, node calls RPC routine clnt_getset to 3874 * see if that node has its node record set to OK. 3875 * - If set doesn't exist, don't choose node for master. 3876 * - If this node doesn't exist in the nodelist on any of the 3877 * non-owner nodes, this node removes its set description 3878 * of that diskset (i.e. removes the set from its local 3879 * mddbs). This is handling the case of when a node was 3880 * removed from a diskset while it was not in the 3881 * cluster membership list. 3882 * - If non-owner node has its node record set to OK and if 3883 * this node hasn't removed this diskset (step directly 3884 * before this one), then the master is chosen to be this 3885 * non-owner node. 
3886 */ 3887 nd = sd->sd_nodelist; 3888 while (nd) { 3889 /* Don't consider node that isn't in member list */ 3890 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 3891 nd->nd_flags |= MD_MN_NODE_DEL; 3892 nd = nd->nd_next; 3893 continue; 3894 } 3895 3896 /* Don't consider owner nodes since none are OK */ 3897 if (nd->nd_flags & MD_MN_NODE_OWN) { 3898 nd->nd_flags |= MD_MN_NODE_DEL; 3899 nd = nd->nd_next; 3900 continue; 3901 } 3902 3903 /* 3904 * Don't need to get nodelist from my node since 3905 * this is where sd_nodelist was obtained. 3906 */ 3907 if (sd->sd_mn_mynode->nd_nodeid == nd->nd_nodeid) { 3908 nd = nd->nd_next; 3909 continue; 3910 } 3911 3912 /* 3913 * If node has already been decided against for 3914 * master, then skip it. 3915 */ 3916 if (nd->nd_flags & MD_MN_NODE_DEL) { 3917 nd = nd->nd_next; 3918 continue; 3919 } 3920 3921 /* 3922 * Does node in my nodelist have its own node 3923 * record marked OK on its node? And does node 3924 * in my nodelist exist on all other nodes? 3925 * Don't want to choose a node for master unless 3926 * that node is marked OK on its own node and that 3927 * node exists on all other alive nodes. 3928 * 3929 * This is guarding against the case when several 3930 * nodes are down and one of the downed nodes is 3931 * deleted from the diskset. When the down nodes 3932 * are rebooted into the cluster, you don't want 3933 * any node to pick the deleted node as the master. 3934 */ 3935 if (clnt_mngetset(nd->nd_nodename, sp->setname, 3936 MD_SET_BAD, &mnsr, ep) == -1) { 3937 /* 3938 * If set doesn't exist on non-owner node, 3939 * don't consider this node for master. 3940 */ 3941 if (mdiserror(ep, MDE_NO_SET)) { 3942 nd->nd_flags |= MD_MN_NODE_DEL; 3943 nd = nd->nd_next; 3944 continue; 3945 } else if (mdanyrpcerror(ep)) { 3946 /* RPC failure to another node */ 3947 return (205); 3948 } else { 3949 /* Any other failure */ 3950 return (-1); 3951 } 3952 } 3953 /* 3954 * Is my node in the nodelist gotten from the other 3955 * node? 
If not, then remove the set from my node 3956 * since set was deleted from my node while my node 3957 * was out of the cluster. 3958 */ 3959 nr = mnsr->sr_nodechain; 3960 while (nr) { 3961 if (sd->sd_mn_mynode->nd_nodeid == 3962 nr->nr_nodeid) { 3963 break; 3964 } 3965 nr = nr->nr_next; 3966 } 3967 if (nr == NULL) { 3968 /* my node not found - delete set */ 3969 free_sr((md_set_record *)mnsr); 3970 goto delete_set; 3971 } 3972 3973 /* Is node being checked marked OK on its own node? */ 3974 nr = mnsr->sr_nodechain; 3975 while (nr) { 3976 if (nd->nd_nodeid == nr->nr_nodeid) { 3977 if (!(nr->nr_flags & MD_MN_NODE_OK)) { 3978 nd->nd_flags |= MD_MN_NODE_DEL; 3979 } 3980 break; 3981 } 3982 nr = nr->nr_next; 3983 } 3984 /* 3985 * If node being checked doesn't exist on its 3986 * own node - don't choose it as master. 3987 */ 3988 if (nr == NULL) { 3989 nd->nd_flags |= MD_MN_NODE_DEL; 3990 } 3991 3992 /* 3993 * Check every node in my node's nodelist against 3994 * the nodelist gotten from the other node. 3995 * If a node in my node's nodelist is not found in the 3996 * other node's nodelist, then set the DEL flag. 3997 */ 3998 nd2 = sd->sd_nodelist; 3999 while (nd2) { 4000 nr = mnsr->sr_nodechain; 4001 while (nr) { 4002 if (nd2->nd_nodeid == nr->nr_nodeid) { 4003 break; 4004 } 4005 nr = nr->nr_next; 4006 } 4007 /* nd2 not found in other node's nodelist */ 4008 if (nr == NULL) { 4009 nd2->nd_flags |= MD_MN_NODE_DEL; 4010 } 4011 nd2 = nd2->nd_next; 4012 } 4013 4014 free_sr((md_set_record *)mnsr); 4015 nd = nd->nd_next; 4016 } 4017 4018 /* 4019 * Rescan list look for node that has not been marked DEL. 4020 * First node found is the master. 
4021 */ 4022 nd = sd->sd_nodelist; 4023 while (nd) { 4024 if (!(nd->nd_flags & MD_MN_NODE_DEL)) { 4025 break; 4026 } 4027 nd = nd->nd_next; 4028 continue; 4029 } 4030 if (nd) { 4031 /* Found a master */ 4032 goto found_master; 4033 } 4034 4035 /* 4036 * - If no node can be found that has its own node record on 4037 * its node to be set to OK, then all alive nodes 4038 * were in the process of being added to or deleted 4039 * from set. Each alive node will remove all 4040 * information pertaining to this set from its node. 4041 * 4042 * If all nodes in set are ALIVE, then call sdssc end routines 4043 * since set was truly being initially created or destroyed. 4044 */ 4045 goto delete_set; 4046 } 4047 4048 found_master: 4049 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 4050 "Set %s master chosen %s (%d): %s"), 4051 sp->setname, nd->nd_nodename, nd->nd_nodeid, 4052 meta_print_hrtime(gethrtime() - start_time)); 4053 4054 if (clnt_lock_set(mynode(), sp, ep) == -1) { 4055 return (-1); 4056 } 4057 4058 cl_sk = cl_get_setkey(sp->setno, sp->setname); 4059 4060 if (clnt_mnsetmaster(mynode(), sp, 4061 nd->nd_nodename, nd->nd_nodeid, ep)) { 4062 rval = -1; 4063 } else if (sd->sd_mn_mynode->nd_nodeid == nd->nd_nodeid) { 4064 /* If this node is new master, set flag in this node's kernel */ 4065 (void) memset(&sf, 0, sizeof (sf)); 4066 sf.sf_setno = sp->setno; 4067 sf.sf_setflags = MD_SET_MN_NEWMAS_RC; 4068 /* Use magic to help protect ioctl against attack. */ 4069 sf.sf_magic = MDDB_SETFLAGS_MAGIC; 4070 sf.sf_flags = MDDB_NM_SET; 4071 4072 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 4073 "Setting new master flag for set %s: %s"), 4074 sp->setname, meta_print_hrtime(gethrtime() - start_time)); 4075 4076 /* 4077 * Fail reconfig cycle if ioctl fails since it is critical 4078 * to set new master flag. 
4079 */ 4080 if (metaioctl(MD_MN_SET_SETFLAGS, &sf, &sf.sf_mde, 4081 NULL) != NULL) { 4082 (void) mdstealerror(ep, &sf.sf_mde); 4083 rval = -1; 4084 } 4085 } 4086 4087 if (clnt_unlock_set(mynode(), cl_sk, &xep) == -1) { 4088 if (rval == 0) { 4089 (void) mdstealerror(ep, &xep); 4090 rval = -1; 4091 } 4092 } 4093 4094 cl_set_setkey(NULL); 4095 4096 metaflushsetname(sp); 4097 4098 return (rval); 4099 4100 delete_set: 4101 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 4102 "Master not chosen, deleting set %s: %s"), 4103 sp->setname, meta_print_hrtime(gethrtime() - start_time)); 4104 4105 /* 4106 * Remove all set information from this node: 4107 * - node records for this set 4108 * - drive records for this set 4109 * - set record for this set 4110 * (Only do this on this node since each node 4111 * will do it for its own local mddb.) 4112 * 4113 * If all nodes in set are ALIVE, then 4114 * the lowest numbered ALIVE nodeid in set 4115 * (irregardless of whether an owner node or not) will 4116 * call the DCS service to cleanup for create/delete of set. 4117 * sdssc_create_end(cleanup) if set was being created or 4118 * sdssc_delete_end(cleanup) if set was being deleted. 4119 * A node record with flag ADD denotes a set being 4120 * created. A node record with flag DEL denotes a 4121 * set being deleted. 4122 */ 4123 nd = sd->sd_nodelist; 4124 while (nd) { 4125 /* Found a node that isn't alive */ 4126 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) 4127 break; 4128 4129 /* Is my node the lowest numbered ALIVE node? */ 4130 if (nd->nd_nodeid < sd->sd_mn_mynode->nd_nodeid) { 4131 break; 4132 } 4133 nd = nd->nd_next; 4134 } 4135 if (nd == NULL) { 4136 /* All nodes ALIVE and this is the lowest nodeid */ 4137 lowest_alive_nodeid = 1; 4138 } 4139 4140 if (clnt_lock_set(mynode(), sp, ep) == -1) { 4141 return (-1); 4142 } 4143 4144 4145 /* 4146 * If this node had been joined, withdraw and reset master. 
4147 * 4148 * This could happen if a node was being added to or removed 4149 * from a diskset and the node doing the add/delete operation and 4150 * all other nodes in the diskset have left the cluster. 4151 */ 4152 if (sd->sd_mn_mynode) { 4153 nd = sd->sd_mn_mynode; 4154 if (nd->nd_flags & MD_MN_NODE_OWN) { 4155 if (clnt_withdrawset(mynode(), sp, ep)) { 4156 rval = -1; 4157 goto out; 4158 } 4159 if (clnt_mnsetmaster(mynode(), sp, "", 4160 MD_MN_INVALID_NID, ep)) { 4161 rval = -1; 4162 goto out; 4163 } 4164 } 4165 } 4166 4167 /* 4168 * Remove side records for this node (side) from local mddb 4169 * (clnt_deldrvs does this) if there are drives in the set. 4170 * 4171 * Don't need to mark this node as DEL since already marked as 4172 * ADD or DEL (or this node would have been chosen as master). 4173 * Don't need to mark other node records, drive records or 4174 * set records as DEL. If a panic occurs during clnt_delset, 4175 * these records will be deleted the next time this node 4176 * becomes a member and goes through the reconfig cycle. 4177 */ 4178 /* Get the drive descriptors for this set */ 4179 if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST), 4180 ep)) == NULL) { 4181 if (! mdisok(ep)) { 4182 /* 4183 * Ignore and clear out any failures from 4184 * metaget_drivedesc since a panic could have 4185 * occurred when a node was partially added to a set. 4186 */ 4187 mdclrerror(ep); 4188 } 4189 } else { 4190 if (clnt_deldrvs(mynode(), sp, dd, ep)) { 4191 rval = -1; 4192 goto out; 4193 } 4194 } 4195 4196 /* 4197 * Now, delete the set - this removes the node, drive 4198 * and set records from the local mddb. 4199 */ 4200 if (clnt_delset(mynode(), sp, ep)) { 4201 rval = -1; 4202 goto out; 4203 } 4204 4205 out: 4206 cl_sk = cl_get_setkey(sp->setno, sp->setname); 4207 4208 /* 4209 * Ignore errors from unlock of set since set is no longer 4210 * known (if clnt_delset worked). 
4211 */ 4212 if (clnt_unlock_set(mynode(), cl_sk, &xep) == -1) { 4213 mdclrerror(&xep); 4214 } 4215 4216 cl_set_setkey(NULL); 4217 4218 metaflushsetname(sp); 4219 4220 /* 4221 * If this node is the lowest numbered nodeid then 4222 * call sdssc_create/delete_end depending on whether 4223 * this node is marked as ADD or DEL in the node record. 4224 */ 4225 if (lowest_alive_nodeid) { 4226 if (nd->nd_flags & MD_MN_NODE_ADD) 4227 sdssc_create_end(sp->setname, SDSSC_CLEANUP); 4228 else if (nd->nd_flags & MD_MN_NODE_DEL) 4229 sdssc_delete_end(sp->setname, SDSSC_CLEANUP); 4230 } 4231 4232 /* Finished with this set -- return */ 4233 return (rval); 4234 } 4235 4236 /* 4237 * Reconfig step to choose a new master for all MN disksets. 4238 * Return values: 4239 * 0 - Everything is great. 4240 * 1 - This node failed to reconfig. 4241 * 205 - Cause another reconfig due to a nodelist problem 4242 * or RPC failure to another node 4243 */ 4244 int 4245 meta_reconfig_choose_master( 4246 long timeout, 4247 md_error_t *ep 4248 ) 4249 { 4250 set_t max_sets, setno; 4251 int nodecnt; 4252 mndiskset_membershiplist_t *nl; 4253 md_set_desc *sd; 4254 mdsetname_t *sp; 4255 int rval = 0; 4256 mddb_setflags_config_t sf; 4257 int start_node_delayed = 0; 4258 4259 if ((max_sets = get_max_sets(ep)) == 0) { 4260 mde_perror(ep, dgettext(TEXT_DOMAIN, 4261 "Unable to get number of sets")); 4262 return (1); 4263 } 4264 4265 /* 4266 * Get membershiplist from API routine. If there's 4267 * an error, return a 205 to cause another reconfig. 4268 */ 4269 if (meta_read_nodelist(&nodecnt, &nl, ep) == -1) { 4270 mde_perror(ep, ""); 4271 return (205); 4272 } 4273 4274 for (setno = 1; setno < max_sets; setno++) { 4275 if ((sp = metasetnosetname(setno, ep)) == NULL) { 4276 if (mdiserror(ep, MDE_NO_SET)) { 4277 /* No set for this setno - continue */ 4278 mdclrerror(ep); 4279 continue; 4280 } else { 4281 /* 4282 * If encountered an RPC error from my node, 4283 * then immediately fail. 
4284 */ 4285 if (mdanyrpcerror(ep)) { 4286 mde_perror(ep, ""); 4287 return (1); 4288 } 4289 /* Can't get set information */ 4290 mde_perror(ep, dgettext(TEXT_DOMAIN, 4291 "Unable to get information for " 4292 "set number %d"), setno); 4293 mdclrerror(ep); 4294 continue; 4295 } 4296 } 4297 4298 /* If setname is there, set desc should exist. */ 4299 if ((sd = metaget_setdesc(sp, ep)) == NULL) { 4300 /* 4301 * If encountered an RPC error from my node, 4302 * then immediately fail. 4303 */ 4304 if (mdanyrpcerror(ep)) { 4305 mde_perror(ep, ""); 4306 return (1); 4307 } 4308 mde_perror(ep, dgettext(TEXT_DOMAIN, 4309 "Unable to get set %s desc information"), 4310 sp->setname); 4311 mdclrerror(ep); 4312 continue; 4313 } 4314 4315 /* Only reconfig MN disksets */ 4316 if (!MD_MNSET_DESC(sd)) { 4317 continue; 4318 } 4319 4320 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 4321 "Begin choose master for set %s: %s"), 4322 sp->setname, meta_print_hrtime(gethrtime() - start_time)); 4323 4324 /* Update nodelist with member information. */ 4325 if (meta_reconfig_update_nodelist(sp, nl, sd, ep)) { 4326 /* 4327 * If encountered an RPC error from my node, 4328 * then immediately fail. 4329 */ 4330 if (mdanyrpcerror(ep)) { 4331 mde_perror(ep, ""); 4332 return (1); 4333 } 4334 mde_perror(ep, ""); 4335 mdclrerror(ep); 4336 continue; 4337 } 4338 4339 /* 4340 * If all nodes in a cluster are starting, then 4341 * all nodes will attempt to contact all other nodes 4342 * to determine a master node. This can lead to a 4343 * problem where node 1 is trying to contact the rpc.metad 4344 * node 2 and node 2 is trying to contact the rpc.metad 4345 * on node 1 -- and this causes the rpc call to fail 4346 * on both nodes and causes a new reconfig cycle. 4347 * 4348 * In order to break this problem, a newly starting node 4349 * will delay a small amount of time (nodeid mod 4 seconds) 4350 * and will then run the code to choose a master for the 4351 * first set. 
Delay will only be done once regardless of the 4352 * number of sets. 4353 */ 4354 if (start_node_delayed == 0) { 4355 (void) memset(&sf, 0, sizeof (sf)); 4356 sf.sf_setno = sp->setno; 4357 sf.sf_flags = MDDB_NM_GET; 4358 /* Use magic to help protect ioctl against attack. */ 4359 sf.sf_magic = MDDB_SETFLAGS_MAGIC; 4360 if ((metaioctl(MD_MN_GET_SETFLAGS, &sf, 4361 &sf.sf_mde, NULL) == 0) && 4362 ((sf.sf_setflags & MD_SET_MN_START_RC) == 4363 MD_SET_MN_START_RC)) { 4364 (void) sleep(sd->sd_mn_mynode->nd_nodeid % 4); 4365 } 4366 start_node_delayed = 1; 4367 } 4368 4369 /* Choose master for this set */ 4370 rval = meta_reconfig_choose_master_for_set(sp, sd, ep); 4371 if (rval == -1) { 4372 mde_perror(ep, ""); 4373 return (1); 4374 } else if (rval == 205) { 4375 mde_perror(ep, ""); 4376 return (205); 4377 } 4378 4379 /* reinit rpc.mdcommd with new nodelist */ 4380 if (mdmn_reinit_set(sp->setno, timeout)) { 4381 md_eprintf(dgettext(TEXT_DOMAIN, 4382 "Could not re-initialise rpc.mdcommd for " 4383 "set %s\n"), sp->setname); 4384 return (1); 4385 } 4386 4387 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 4388 "Choose master for set %s completed: %s"), 4389 sp->setname, meta_print_hrtime(gethrtime() - start_time)); 4390 } 4391 4392 /* 4393 * Each node turns on I/Os for all MN disksets. 4394 * This is to recover from the situation where the master died 4395 * during a previous reconfig cycle when I/Os were suspended 4396 * for a MN diskset. 4397 * If a failure occurs return a 1 which will force this node to 4398 * panic. Cannot leave node in the situation where I/Os are 4399 * not resumed. 4400 */ 4401 setno = 0; /* 0 means all MN sets */ 4402 if (metaioctl(MD_MN_RESUME_SET, &setno, ep, NULL)) { 4403 mde_perror(ep, ""); 4404 return (1); 4405 } 4406 4407 /* Free the nodelist */ 4408 if (nodecnt) 4409 meta_free_nodelist(nl); 4410 4411 return (0); 4412 } 4413 4414 /* 4415 * meta_mnsync_user_records will synchronize the diskset user records across 4416 * all nodes in the diskset. 
The diskset user records are stored in 4417 * each node's local set mddb. 4418 * 4419 * This needs to be done even if there is no master change during the 4420 * reconfig cycle since this routine should clean up any mess left by 4421 * the untimely termination of a metaset or metadb command (due to a 4422 * node panic or to user intervention). 4423 * 4424 * Caller is the Master node. 4425 * 4426 * Returns 0 - Success 4427 * 205 - Failure during RPC to another node 4428 * -1 - Any other failure and ep is filled in. 4429 */ 4430 int 4431 meta_mnsync_user_records( 4432 mdsetname_t *sp, 4433 md_error_t *ep 4434 ) 4435 { 4436 md_set_desc *sd; 4437 md_mnnode_desc *master_nodelist, *nd, *nd2, *ndtail; 4438 md_mnset_record *mnsr; 4439 md_mnsr_node_t *master_mnsr_node = NULL, *mnsr_node = NULL; 4440 md_mnnode_record *nr; 4441 md_drive_record *dr; 4442 int dr_cnt, dd_cnt; 4443 int found_my_nr; 4444 md_drive_desc *dd, *dd_prev, *master_dd, *other_dd; 4445 int all_drives_ok; 4446 int rval = 0; 4447 int max_genid = 0; 4448 int num_alive_nodes, num_alive_nodes_del = 0; 4449 int set_locked = 0; 4450 md_setkey_t *cl_sk; 4451 md_error_t xep = mdnullerror; 4452 char *anode[1]; 4453 mddb_setflags_config_t sf; 4454 4455 /* 4456 * Sync up node records first. 4457 * Construct a master nodelist using the nodelist from this 4458 * node's rpc.metad node records and then setting the state of each 4459 * node following these rules: 4460 * - If a node record is marked OK on its node, mark it OK 4461 * in the master nodelist (and later OK on all nodes) 4462 * If a node record is also marked OWN on its node, 4463 * mark it OWN in the master nodelist. 
4464 * - If a node record is not marked OK on its node, then mark 4465 * it as DEL in the master list (later deleting it) 4466 * - If node record doesn't exist on that node, then mark it DEL 4467 * (later deleting it) 4468 * - If set record doesn't exist on that node, mark node as DEL 4469 * - If a node record doesn't exist on all nodes, then mark it DEL 4470 * - If a node is not ALIVE, then 4471 * - If that node marked DEL on any node - mark it DEL 4472 * in master list but leave in nodelist 4473 * - If that node is marked as ADD on any node, mark it 4474 * ADD in the master list but leave in nodelist 4475 * - When that node returns to the living, the DEL 4476 * node record will be removed and the ADD node 4477 * record may be removed if marked ADD on that 4478 * node. 4479 * The key rule is to not remove a node from the nodelist until 4480 * that node record is removed from its own node. Do not want to 4481 * remove a node's record from all other nodes and then have 4482 * that node have its own record marked OK so that a node will pick 4483 * a different master than the other nodes. 4484 * 4485 * Next, 4486 * If node is ALIVE and node record is marked DEL in master nodelist, 4487 * remove node from set. 4488 * If node is ALIVE and node record is marked OK in master nodelist, 4489 * mark it OK on all other nodes. 4490 * If node is not ALIVE and node record is marked DEL in master 4491 * nodelist, mark it DEL on all other nodes. 4492 * If node is not ALIVE and node record is marked ADD in master, 4493 * nodelist, mark it ADD on all other nodes. 4494 */ 4495 if ((sd = metaget_setdesc(sp, ep)) == NULL) { 4496 return (-1); 4497 } 4498 master_nodelist = sd->sd_nodelist; 4499 4500 /* 4501 * Walk through nodelist creating a master nodelist. 
4502 */ 4503 num_alive_nodes = 0; 4504 nd = master_nodelist; 4505 while (nd) { 4506 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 4507 nd = nd->nd_next; 4508 continue; 4509 } 4510 num_alive_nodes++; 4511 if (clnt_mngetset(nd->nd_nodename, sp->setname, 4512 MD_SET_BAD, &mnsr, ep) == -1) { 4513 if (mdiserror(ep, MDE_NO_SET)) { 4514 /* set doesn't exist, mark node as DEL */ 4515 nd->nd_flags &= ~MD_MN_NODE_OK; 4516 nd->nd_flags &= ~MD_MN_NODE_ADD; 4517 nd->nd_flags |= MD_MN_NODE_DEL; 4518 nd->nd_flags |= MD_MN_NODE_NOSET; 4519 nd = nd->nd_next; 4520 continue; 4521 } else { 4522 /* If RPC failure to another node return 205 */ 4523 if ((mdanyrpcerror(ep)) && 4524 (sd->sd_mn_mynode->nd_nodeid != 4525 nd->nd_nodeid)) { 4526 rval = 205; 4527 } else { 4528 /* Any other failure */ 4529 rval = -1; 4530 } 4531 goto out; 4532 } 4533 } 4534 /* Find biggest genid in records for this diskset */ 4535 if (mnsr->sr_genid > max_genid) 4536 max_genid = mnsr->sr_genid; 4537 4538 dr = mnsr->sr_drivechain; 4539 while (dr) { 4540 /* Find biggest genid in records for this diskset */ 4541 if (dr->dr_genid > max_genid) { 4542 max_genid = dr->dr_genid; 4543 } 4544 dr = dr->dr_next; 4545 } 4546 4547 found_my_nr = 0; 4548 nr = mnsr->sr_nodechain; 4549 /* nr is the list of node recs from nd_nodename node */ 4550 while (nr) { 4551 /* Find biggest genid in records for this diskset */ 4552 if (nr->nr_genid > max_genid) 4553 max_genid = nr->nr_genid; 4554 nd2 = master_nodelist; 4555 ndtail = NULL; 4556 /* For each node record, is it in master list? */ 4557 while (nd2) { 4558 if (nd2->nd_nodeid == nr->nr_nodeid) 4559 break; 4560 if (nd2->nd_next == NULL) 4561 ndtail = nd2; 4562 nd2 = nd2->nd_next; 4563 } 4564 /* 4565 * Found node record not in master list -- add it 4566 * to list marking it as DEL since node record 4567 * should exist on all nodes unless a panic occurred 4568 * during addition or deletion of host to diskset. 
4569 */ 4570 if (nd2 == NULL) { 4571 nd2 = Zalloc(sizeof (*nd2)); 4572 (void) strcpy(nd2->nd_nodename, 4573 nr->nr_nodename); 4574 nd2->nd_flags = nr->nr_flags; 4575 nd2->nd_flags |= MD_MN_NODE_DEL; 4576 nd2->nd_nodeid = nr->nr_nodeid; 4577 nd2->nd_next = NULL; 4578 ndtail->nd_next = nd2; 4579 nd2 = NULL; 4580 nr = nr->nr_next; 4581 continue; 4582 } 4583 /* 4584 * Is this the node record for the node that 4585 * we requested the set desc from? 4586 * If so, check if node has its own node record 4587 * marked OK. If marked OK, check for the OWN bit. 4588 */ 4589 if (nr->nr_nodeid == nd->nd_nodeid) { 4590 found_my_nr = 1; 4591 if (nr->nr_flags & MD_MN_NODE_OK) { 4592 /* 4593 * If node record is marked OK 4594 * on its own node, then mark it OK 4595 * in the master list. Node record 4596 * would have to exist on all nodes 4597 * in the ADD state before it could 4598 * be put into the OK state. 4599 */ 4600 nd->nd_flags |= MD_MN_NODE_OK; 4601 nd->nd_flags &= 4602 ~(MD_MN_NODE_ADD | MD_MN_NODE_DEL); 4603 /* 4604 * Mark own in master list as marked 4605 * on own node. 4606 */ 4607 if (nr->nr_flags & MD_MN_NODE_OWN) 4608 nd->nd_flags |= MD_MN_NODE_OWN; 4609 else 4610 nd->nd_flags &= ~MD_MN_NODE_OWN; 4611 } else { 4612 /* Otherwise, mark node as DEL */ 4613 nd->nd_flags &= ~MD_MN_NODE_OK; 4614 nd->nd_flags &= ~MD_MN_NODE_ADD; 4615 nd->nd_flags |= MD_MN_NODE_DEL; 4616 } 4617 } 4618 /* 4619 * If node is not ALIVE and marked DEL 4620 * on any node, make it DEL in master list. 4621 * If node is not ALIVE and marked ADD 4622 * on any node, make it ADD in master list 4623 * unless node record has already been marked DEL. 
4624 */ 4625 if (!(nr->nr_flags & MD_MN_NODE_ALIVE)) { 4626 if (nr->nr_flags & MD_MN_NODE_ADD) { 4627 if (!(nd->nd_flags & MD_MN_NODE_DEL)) { 4628 /* If not DEL - mark it ADD */ 4629 nd->nd_flags |= MD_MN_NODE_ADD; 4630 nd->nd_flags &= ~MD_MN_NODE_OK; 4631 } 4632 } 4633 if (nr->nr_flags & MD_MN_NODE_DEL) { 4634 nd->nd_flags |= MD_MN_NODE_DEL; 4635 nd->nd_flags &= ~MD_MN_NODE_OK; 4636 /* Could already be ADD - make it DEL */ 4637 nd->nd_flags &= ~MD_MN_NODE_ADD; 4638 } 4639 } 4640 nr = nr->nr_next; 4641 } 4642 /* 4643 * If a node record doesn't exist on its own node, 4644 * then mark node as DEL. 4645 */ 4646 if (found_my_nr == 0) { 4647 nd->nd_flags &= ~MD_MN_NODE_OK; 4648 nd->nd_flags |= MD_MN_NODE_DEL; 4649 } 4650 4651 /* 4652 * If node is OK - put mnsr onto master_mnsr_node list for 4653 * later use when syncing up the drive records in the set. 4654 */ 4655 if (nd->nd_flags & MD_MN_NODE_OK) { 4656 mnsr_node = Zalloc(sizeof (*mnsr_node)); 4657 mnsr_node->mmn_mnsr = mnsr; 4658 (void) strncpy(mnsr_node->mmn_nodename, 4659 nd->nd_nodename, MD_MAX_MNNODENAME_PLUS_1); 4660 mnsr_node->mmn_next = master_mnsr_node; 4661 master_mnsr_node = mnsr_node; 4662 } else { 4663 free_sr((struct md_set_record *)mnsr); 4664 } 4665 4666 nd = nd->nd_next; 4667 } 4668 4669 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 4670 "Master nodelist created for set %s: %s"), 4671 sp->setname, meta_print_hrtime(gethrtime() - start_time)); 4672 4673 /* 4674 * Send master nodelist to the rpc.metad on all nodes (including 4675 * myself) and each node will update itself. This will set the 4676 * ADD and DEL flags on each node as setup in the master nodelist. 4677 * Don't send nodelist to node where set doesn't exist. 
4678 */ 4679 nd = master_nodelist; 4680 while (nd) { 4681 if (!(nd->nd_flags & MD_MN_NODE_ALIVE) || 4682 (nd->nd_flags & MD_MN_NODE_NOSET)) { 4683 nd = nd->nd_next; 4684 continue; 4685 } 4686 if (clnt_upd_nr_flags(nd->nd_nodename, sp, 4687 master_nodelist, MD_NR_SET, MNSET_IN_RECONFIG, ep)) { 4688 /* If RPC failure to another node return 205 */ 4689 if ((mdanyrpcerror(ep)) && 4690 (sd->sd_mn_mynode->nd_nodeid != 4691 nd->nd_nodeid)) { 4692 rval = 205; 4693 } else { 4694 /* Any other failure */ 4695 rval = -1; 4696 } 4697 goto out; 4698 } 4699 nd = nd->nd_next; 4700 } 4701 4702 /* 4703 * Now, delete nodes that need to be deleted. 4704 */ 4705 if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST), 4706 ep)) == NULL) { 4707 if (! mdisok(ep)) { 4708 rval = -1; 4709 goto out; 4710 } 4711 } 4712 4713 /* 4714 * May be doing lots of RPC commands to the nodes, so lock the 4715 * ALIVE members of the set since most of the rpc.metad routines 4716 * require this for security reasons. 4717 */ 4718 nd = master_nodelist; 4719 while (nd) { 4720 /* Skip non-alive nodes and node without set */ 4721 if (!(nd->nd_flags & MD_MN_NODE_ALIVE) || 4722 (nd->nd_flags & MD_MN_NODE_NOSET)) { 4723 nd = nd->nd_next; 4724 continue; 4725 } 4726 if (clnt_lock_set(nd->nd_nodename, sp, ep)) { 4727 /* If RPC failure to another node return 205 */ 4728 if ((mdanyrpcerror(ep)) && 4729 (sd->sd_mn_mynode->nd_nodeid != 4730 nd->nd_nodeid)) { 4731 rval = 205; 4732 } else { 4733 /* Any other failure */ 4734 rval = -1; 4735 } 4736 goto out; 4737 } 4738 set_locked = 1; 4739 nd = nd->nd_next; 4740 } 4741 4742 nd = master_nodelist; 4743 while (nd) { 4744 /* Skip non-alive nodes */ 4745 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 4746 nd = nd->nd_next; 4747 continue; 4748 } 4749 if (nd->nd_flags & MD_MN_NODE_DEL) { 4750 num_alive_nodes_del++; 4751 /* 4752 * Delete this node rec from all ALIVE nodes in diskset. 
4753 */ 4754 nd2 = master_nodelist; 4755 while (nd2) { 4756 /* Skip non-alive nodes and node without set */ 4757 if (!(nd2->nd_flags & MD_MN_NODE_ALIVE) || 4758 (nd2->nd_flags & MD_MN_NODE_NOSET)) { 4759 nd2 = nd2->nd_next; 4760 continue; 4761 } 4762 4763 /* This is a node being deleted from set */ 4764 if (nd2->nd_nodeid == nd->nd_nodeid) { 4765 /* Mark set record as DEL */ 4766 if (clnt_upd_sr_flags(nd->nd_nodename, 4767 sp, MD_SR_DEL, ep)) { 4768 /* RPC failure to !my node */ 4769 if ((mdanyrpcerror(ep)) && 4770 (sd->sd_mn_mynode-> 4771 nd_nodeid 4772 != nd->nd_nodeid)) { 4773 rval = 205; 4774 } else { 4775 /* Any other failure */ 4776 rval = -1; 4777 } 4778 goto out; 4779 } 4780 if (clnt_deldrvs(nd->nd_nodename, sp, 4781 dd, ep)) { 4782 /* RPC failure to !my node */ 4783 if ((mdanyrpcerror(ep)) && 4784 (sd->sd_mn_mynode-> 4785 nd_nodeid 4786 != nd->nd_nodeid)) { 4787 rval = 205; 4788 } else { 4789 /* Any other failure */ 4790 rval = -1; 4791 } 4792 goto out; 4793 } 4794 if (clnt_delset(nd->nd_nodename, sp, 4795 ep) == -1) { 4796 /* RPC failure to !my node */ 4797 if ((mdanyrpcerror(ep)) && 4798 (sd->sd_mn_mynode-> 4799 nd_nodeid 4800 != nd->nd_nodeid)) { 4801 rval = 205; 4802 } else { 4803 /* Any other failure */ 4804 rval = -1; 4805 } 4806 goto out; 4807 } 4808 } else { 4809 /* 4810 * Delete host from sets on hosts 4811 * not being deleted. 
4812 */ 4813 anode[0] = Strdup(nd->nd_nodename); 4814 if (clnt_delhosts(nd2->nd_nodename, sp, 4815 1, anode, ep) == -1) { 4816 Free(anode[0]); 4817 /* RPC failure to !my node */ 4818 if ((mdanyrpcerror(ep)) && 4819 (sd->sd_mn_mynode-> 4820 nd_nodeid 4821 != nd2->nd_nodeid)) { 4822 rval = 205; 4823 } else { 4824 /* Any other failure */ 4825 rval = -1; 4826 } 4827 goto out; 4828 } 4829 4830 meta_mc_log(MC_LOG5, 4831 dgettext(TEXT_DOMAIN, 4832 "Deleted node %s (%d) on node %s " 4833 "from set %s: %s"), 4834 nd->nd_nodename, nd->nd_nodeid, 4835 nd2->nd_nodename, 4836 sp->setname, 4837 meta_print_hrtime( 4838 gethrtime() - start_time)); 4839 4840 Free(anode[0]); 4841 } 4842 nd2 = nd2->nd_next; 4843 } 4844 } 4845 nd = nd->nd_next; 4846 } 4847 4848 nd = master_nodelist; 4849 cl_sk = cl_get_setkey(sp->setno, sp->setname); 4850 while (nd) { 4851 /* Skip non-alive nodes and node without set */ 4852 if (!(nd->nd_flags & MD_MN_NODE_ALIVE) || 4853 (nd->nd_flags & MD_MN_NODE_NOSET)) { 4854 nd = nd->nd_next; 4855 continue; 4856 } 4857 if (clnt_unlock_set(nd->nd_nodename, cl_sk, ep)) { 4858 /* If RPC failure to another node return 205 */ 4859 if ((mdanyrpcerror(ep)) && 4860 (sd->sd_mn_mynode->nd_nodeid != 4861 nd->nd_nodeid)) { 4862 rval = 205; 4863 } else { 4864 /* Any other failure */ 4865 rval = -1; 4866 } 4867 goto out; 4868 } 4869 nd = nd->nd_next; 4870 } 4871 cl_set_setkey(NULL); 4872 set_locked = 0; 4873 4874 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 4875 "Nodelist syncronization complete for set %s: %s"), 4876 sp->setname, meta_print_hrtime(gethrtime() - start_time)); 4877 4878 metaflushsetname(sp); 4879 4880 /* 4881 * If all alive nodes have been deleted from set, just 4882 * return since nothing else can be done until non-alive 4883 * nodes (if there are any) rejoin the cluster. 4884 */ 4885 if (num_alive_nodes == num_alive_nodes_del) { 4886 rval = 0; 4887 goto out; 4888 } 4889 4890 /* 4891 * Sync up drive records. 
4892 * 4893 * If a node panic'd (or metaset command was killed) during the 4894 * addition or deletion of a drive to the diskset, the nodes 4895 * may have a different view of the drive list. During cleanup 4896 * of the drive list during reconfig, a drive will be deleted 4897 * from the list if the master node sees that the drive has been 4898 * marked in the ADD state on any node or is marked in the DEL state 4899 * on all nodes. 4900 * This cleanup must occur even if all nodes in the cluster are 4901 * not part of the cluster so that all nodes have the same view 4902 * of the drivelist. 4903 * Then if the entire cluster goes down and comes back up, the 4904 * new master node could be a node that wasn't in the cluster when 4905 * the node was deleted. This could lead to a situation where the 4906 * master node thinks that a drive is OK, but this drive isn't 4907 * known to the other nodes. 4908 * This situation can also occur during the addition of a drive 4909 * where a node has the drive marked OK, but the node executing the 4910 * metaset command encountered a failure before marking that drive OK 4911 * on the rest of the nodes. If the node with the OK drive then 4912 * panics, then the rest of the nodes will remove that drive marked ADD 4913 * and when the node with the OK drive rejoins the cluster, it will 4914 * have a drive marked OK that is unknown by the other nodes. 4915 * 4916 * There are 2 situations to consider: 4917 * A) Master knows about a drive that other nodes don't know about. 4918 * B) At least one slave node knows about a drive that the master 4919 * node doesn't know about. 4920 * 4921 * To handle these situations the following steps are followed: 4922 * 1) Count number of drives known by this master node and the 4923 * other slave nodes. 4924 * If all nodes have the same number of drives and the master has 4925 * all drives marked OK, then skip to step4. 
4926 * 4927 * 2) If a node has fewer drives listed than the master, the master 4928 * must get the drive descriptor list from that node so that 4929 * master can determine which drive it needs to delete from that 4930 * node. Master must get the drive descriptor list since the 4931 * drive record list does not contain the name of the drive, but 4932 * only a key and the key can only be interpreted on that other 4933 * node. 4934 * 4935 * 3) The master will then create the master drive list by doing: 4936 * - Master starts with drive list known by master. 4937 * - Any drive marked ADD will be removed from the list. 4938 * - Any drive not known by another node (from step2) will be 4939 * removed from the drive list. 4940 * - If a drive is marked DEL on the master, the master must 4941 * verify that the drive record is marked DEL on all nodes. 4942 * If any node has the drive record marked OK, mark it OK 4943 * on the master. (The reason why is described below). 4944 * 4945 * 4) The master sends out the master drive list and the slave 4946 * nodes will force their drive lists to match the master 4947 * drive list by deleting drives, if necessary and by changing 4948 * the drive record states from ADD->OK if master has drive 4949 * marked OK and slave has drive marked ADD. 4950 * 4951 * Interesting scenarios: 4952 * 4953 * 1) System has 4 nodes with node 1 as the master. Node 3 starts 4954 * to delete a drive record (drive record on node 1 is marked DEL), 4955 * but is stopped when node 3 panics. Node 1 also panics. 4956 * During reconfig cycle, node 2 is picked as master and the drive 4957 * record is left alone since all nodes in the cluster have it 4958 * marked OK. User now sees drive as part of diskset. 4959 * Now, entire cluster is rebooted and node 1 rejoins the cluster. 4960 * Node 1 is picked as the master and node 1 has drive record 4961 * marked DEL. 
Node 1 contacts all other nodes in the cluster 4962 * and since at least one node has the drive record marked OK, 4963 * the master marks the drive record OK. 4964 * User continues to see the drive as part of the diskset. 4965 */ 4966 4967 /* Reget set descriptor since flushed above */ 4968 if ((sd = metaget_setdesc(sp, ep)) == NULL) { 4969 rval = -1; 4970 goto out; 4971 } 4972 4973 /* Has side effect of setting sd->sd_drvs to same as master_dd */ 4974 if ((master_dd = metaget_drivedesc_sideno(sp, 4975 sd->sd_mn_mynode->nd_nodeid, 4976 (MD_BASICNAME_OK | PRINT_FAST), ep)) == NULL) { 4977 /* No drives in list */ 4978 if (!mdisok(ep)) { 4979 /* 4980 * Can't get drive list for this node, so 4981 * return -1 causing this node to be removed 4982 * cluster config and fixed. 4983 */ 4984 rval = -1; 4985 goto out; 4986 } 4987 } 4988 4989 /* Count the number of drives for all nodes */ 4990 mnsr_node = master_mnsr_node; 4991 while (mnsr_node) { 4992 dr_cnt = 0; 4993 dr = mnsr_node->mmn_mnsr->sr_drivechain; 4994 while (dr) { 4995 dr_cnt++; 4996 dr = dr->dr_next; 4997 } 4998 mnsr_node->mmn_numdrives = dr_cnt; 4999 mnsr_node = mnsr_node->mmn_next; 5000 } 5001 5002 /* Count the number of drives for the master; also check flags */ 5003 all_drives_ok = 1; 5004 dd_cnt = 0; 5005 dd = master_dd; 5006 while (dd) { 5007 dd_cnt++; 5008 if (!(dd->dd_flags & MD_DR_OK)) 5009 all_drives_ok = 0; 5010 dd = dd->dd_next; 5011 } 5012 5013 /* If all drives are ok, do quick check against number of drives */ 5014 if (all_drives_ok) { 5015 /* If all nodes have same number of drives, almost done */ 5016 mnsr_node = master_mnsr_node; 5017 while (mnsr_node) { 5018 if (mnsr_node->mmn_numdrives != dd_cnt) 5019 break; 5020 mnsr_node = mnsr_node->mmn_next; 5021 } 5022 /* All nodes have same number of drives, just send flags */ 5023 if (mnsr_node == NULL) { 5024 goto send_drive_list; 5025 } 5026 } 5027 5028 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 5029 "Begin detailed drive synchronization for set %s: 
%s"), 5030 sp->setname, meta_print_hrtime(gethrtime() - start_time)); 5031 5032 /* Detailed check required */ 5033 mnsr_node = master_mnsr_node; 5034 while (mnsr_node) { 5035 /* Does slave node have less drives than master? */ 5036 if (mnsr_node->mmn_numdrives < dd_cnt) { 5037 /* Yes - must determine which drive is missing */ 5038 if (clnt_getdrivedesc(mnsr_node->mmn_nodename, sp, 5039 &other_dd, ep)) { 5040 /* RPC failure to !my node */ 5041 if ((mdanyrpcerror(ep)) && 5042 (strcmp(mynode(), mnsr_node->mmn_nodename) 5043 != 0)) { 5044 rval = 205; 5045 } else { 5046 /* Any other failure */ 5047 rval = -1; 5048 } 5049 mde_perror(ep, dgettext(TEXT_DOMAIN, 5050 "Master node %s unable to " 5051 "retrieve drive list from node %s"), 5052 mynode(), mnsr_node->mmn_nodename); 5053 goto out; 5054 } 5055 mnsr_node->mmn_dd = other_dd; 5056 dd = master_dd; 5057 while (dd) { 5058 if (!(dd->dd_flags & MD_DR_OK)) { 5059 dd = dd->dd_next; 5060 continue; 5061 } 5062 other_dd = mnsr_node->mmn_dd; 5063 while (other_dd) { 5064 /* Convert to devids, when available */ 5065 if (strcmp(other_dd->dd_dnp->cname, 5066 dd->dd_dnp->cname) == 0) { 5067 break; 5068 } 5069 other_dd = other_dd->dd_next; 5070 } 5071 /* 5072 * dd not found on slave so mark it 5073 * ADD for later deletion (drives in ADD 5074 * state are deleted later in this routine). 
5075 */ 5076 if (other_dd == NULL) { 5077 dd->dd_flags = MD_DR_ADD; 5078 } 5079 dd = dd->dd_next; 5080 } 5081 5082 } 5083 mnsr_node = mnsr_node->mmn_next; 5084 } 5085 5086 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 5087 "Drive check completed for set %s: %s"), 5088 sp->setname, meta_print_hrtime(gethrtime() - start_time)); 5089 5090 dd = master_dd; 5091 dd_prev = 0; 5092 while (dd) { 5093 /* Remove any ADD drives from list */ 5094 if (dd->dd_flags & MD_DR_ADD) { 5095 if (dd_prev) { 5096 dd_prev->dd_next = dd->dd_next; 5097 dd->dd_next = NULL; 5098 metafreedrivedesc(&dd); 5099 dd = dd_prev->dd_next; 5100 } else { 5101 /* 5102 * If removing drive descriptor from head 5103 * of linked list, also change sd->sd_drvs. 5104 */ 5105 master_dd = sd->sd_drvs = dd->dd_next; 5106 dd->dd_next = NULL; 5107 metafreedrivedesc(&dd); 5108 dd = master_dd; 5109 } 5110 /* dd setup in if/else above */ 5111 continue; 5112 } 5113 /* 5114 * If drive is marked DEL, check all other nodes. 5115 * If drive on another node is marked OK, mark drive OK 5116 * in master list. If drive is marked DEL or doesn't exist 5117 * on all nodes, remove drive from list. 
5118 */ 5119 if (dd->dd_flags & MD_DR_DEL) { 5120 mnsr_node = master_mnsr_node; 5121 while (mnsr_node) { 5122 if (mnsr_node->mmn_dd == NULL) { 5123 if (clnt_getdrivedesc( 5124 mnsr_node->mmn_nodename, sp, 5125 &other_dd, ep)) { 5126 /* RPC failure to !my node */ 5127 if ((mdanyrpcerror(ep)) && 5128 (strcmp(mynode(), 5129 mnsr_node->mmn_nodename) 5130 != 0)) { 5131 rval = 205; 5132 } else { 5133 /* Any other failure */ 5134 rval = -1; 5135 } 5136 mde_perror(ep, 5137 dgettext(TEXT_DOMAIN, 5138 "Master node %s unable " 5139 "to retrieve drive list " 5140 "from node %s"), mynode(), 5141 mnsr_node->mmn_nodename); 5142 goto out; 5143 } 5144 mnsr_node->mmn_dd = other_dd; 5145 } 5146 other_dd = mnsr_node->mmn_dd; 5147 while (other_dd) { 5148 /* Found drive (OK) from other node */ 5149 if (strcmp(dd->dd_dnp->cname, 5150 other_dd->dd_dnp->cname) 5151 == 0) { 5152 /* Drive marked OK */ 5153 if (other_dd->dd_flags & 5154 MD_DR_OK) { 5155 dd->dd_flags = MD_DR_OK; 5156 } 5157 break; 5158 } 5159 other_dd = other_dd->dd_next; 5160 } 5161 if (dd->dd_flags == MD_DR_OK) 5162 break; 5163 5164 mnsr_node = mnsr_node->mmn_next; 5165 } 5166 /* 5167 * If no node had this drive marked OK, delete it. 5168 */ 5169 if (dd->dd_flags & MD_DR_DEL) { 5170 if (dd_prev) { 5171 dd_prev->dd_next = dd->dd_next; 5172 dd->dd_next = NULL; 5173 metafreedrivedesc(&dd); 5174 dd = dd_prev->dd_next; 5175 } else { 5176 /* 5177 * If removing drive descriptor from 5178 * head of linked list, also change 5179 * sd->sd_drvs. 
5180 */ 5181 master_dd = sd->sd_drvs = dd->dd_next; 5182 dd->dd_next = NULL; 5183 metafreedrivedesc(&dd); 5184 dd = master_dd; 5185 } 5186 /* dd setup in if/else above */ 5187 continue; 5188 } 5189 } 5190 dd_prev = dd; 5191 dd = dd->dd_next; 5192 } 5193 5194 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 5195 "Setting drive states completed for set %s: %s"), 5196 sp->setname, meta_print_hrtime(gethrtime() - start_time)); 5197 5198 send_drive_list: 5199 /* 5200 * Set genid on all drives to be the highest value seen. 5201 */ 5202 dd = master_dd; 5203 while (dd) { 5204 dd->dd_genid = max_genid; 5205 dd = dd->dd_next; 5206 } 5207 /* 5208 * Send updated drive list to all alive nodes. 5209 * Will also set genid on set and node records to have same 5210 * as the drive records. 5211 */ 5212 nd = sd->sd_nodelist; 5213 while (nd) { 5214 /* Skip non-alive nodes */ 5215 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 5216 nd = nd->nd_next; 5217 continue; 5218 } 5219 if (clnt_upd_dr_reconfig(nd->nd_nodename, sp, master_dd, ep)) { 5220 /* RPC failure to another node */ 5221 if ((mdanyrpcerror(ep)) && 5222 (sd->sd_mn_mynode->nd_nodeid != nd->nd_nodeid)) { 5223 rval = 205; 5224 } else { 5225 /* Any other failure */ 5226 rval = -1; 5227 } 5228 goto out; 5229 } 5230 nd = nd->nd_next; 5231 } 5232 5233 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 5234 "Sent drive list to all nodes for set %s: %s"), 5235 sp->setname, meta_print_hrtime(gethrtime() - start_time)); 5236 5237 /* 5238 * If no drive records left in set and nodes had been joined, 5239 * withdraw the nodes. Always reset the master and mark 5240 * all nodes as withdrawn on all nodes. 5241 */ 5242 if (master_dd == NULL) { 5243 /* Reset new master flag since no longer master */ 5244 (void) memset(&sf, 0, sizeof (sf)); 5245 sf.sf_setno = sp->setno; 5246 sf.sf_setflags = MD_SET_MN_NEWMAS_RC; 5247 sf.sf_flags = MDDB_NM_RESET; 5248 /* Use magic to help protect ioctl against attack. 
*/ 5249 sf.sf_magic = MDDB_SETFLAGS_MAGIC; 5250 /* Ignore failure, failure to reset flag isn't catastrophic */ 5251 (void) metaioctl(MD_MN_SET_SETFLAGS, &sf, 5252 &sf.sf_mde, NULL); 5253 5254 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 5255 "Reset new master flag for " "set %s: %s"), 5256 sp->setname, meta_print_hrtime(gethrtime() - start_time)); 5257 5258 nd = sd->sd_nodelist; 5259 while (nd) { 5260 /* Skip non-alive nodes */ 5261 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 5262 nd = nd->nd_next; 5263 continue; 5264 } 5265 5266 if (clnt_lock_set(nd->nd_nodename, sp, ep)) { 5267 /* RPC failure to another node */ 5268 if ((mdanyrpcerror(ep)) && 5269 (sd->sd_mn_mynode->nd_nodeid != 5270 nd->nd_nodeid)) { 5271 rval = 205; 5272 } else { 5273 /* Any other failure */ 5274 rval = -1; 5275 } 5276 goto out; 5277 } 5278 set_locked = 1; 5279 5280 /* Withdraw node from set if owner */ 5281 if ((nd->nd_flags & MD_MN_NODE_OWN) && 5282 (clnt_withdrawset(nd->nd_nodename, sp, ep))) { 5283 /* RPC failure to another node */ 5284 if ((mdanyrpcerror(ep)) && 5285 (sd->sd_mn_mynode->nd_nodeid != 5286 nd->nd_nodeid)) { 5287 rval = 205; 5288 } else { 5289 /* Any other failure */ 5290 rval = -1; 5291 } 5292 goto out; 5293 } 5294 5295 /* Mark all nodes as withdrawn on this node */ 5296 if (clnt_upd_nr_flags(nd->nd_nodename, sp, 5297 sd->sd_nodelist, MD_NR_WITHDRAW, NULL, ep)) { 5298 /* RPC failure to another node */ 5299 if ((mdanyrpcerror(ep)) && 5300 (sd->sd_mn_mynode->nd_nodeid != 5301 nd->nd_nodeid)) { 5302 rval = 205; 5303 } else { 5304 /* Any other failure */ 5305 rval = -1; 5306 } 5307 goto out; 5308 } 5309 5310 /* Resets master to no-master on this node */ 5311 if (clnt_mnsetmaster(nd->nd_nodename, sp, 5312 "", MD_MN_INVALID_NID, ep)) { 5313 /* RPC failure to another node */ 5314 if ((mdanyrpcerror(ep)) && 5315 (sd->sd_mn_mynode->nd_nodeid != 5316 nd->nd_nodeid)) { 5317 rval = 205; 5318 } else { 5319 /* Any other failure */ 5320 rval = -1; 5321 } 5322 goto out; 5323 } 5324 5325 cl_sk = 
cl_get_setkey(sp->setno, sp->setname); 5326 if (clnt_unlock_set(nd->nd_nodename, cl_sk, ep)) { 5327 /* RPC failure to another node */ 5328 if ((mdanyrpcerror(ep)) && 5329 (sd->sd_mn_mynode->nd_nodeid != 5330 nd->nd_nodeid)) { 5331 rval = 205; 5332 } else { 5333 /* Any other failure */ 5334 rval = -1; 5335 } 5336 goto out; 5337 } 5338 set_locked = 0; 5339 nd = nd->nd_next; 5340 } 5341 } 5342 5343 out: 5344 /* 5345 * If got here and set is still locked, then an error has 5346 * occurred and master_nodelist is still valid. 5347 * If error is not an RPC error, then unlock. 5348 * If error is an RPC error, skip unlocks since this could cause 5349 * yet another RPC timeout if a node has failed. 5350 * Ignore failures in unlock since unlock is just trying to 5351 * clean things up. 5352 */ 5353 if ((set_locked) && !(mdanyrpcerror(ep))) { 5354 nd = master_nodelist; 5355 cl_sk = cl_get_setkey(sp->setno, sp->setname); 5356 while (nd) { 5357 /* Skip non-alive nodes */ 5358 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 5359 nd = nd->nd_next; 5360 continue; 5361 } 5362 /* 5363 * If clnt_unlock fails, just break out since next 5364 * reconfig cycle will reset the locks anyway. 5365 */ 5366 if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) { 5367 break; 5368 } 5369 nd = nd->nd_next; 5370 } 5371 cl_set_setkey(NULL); 5372 } 5373 /* Free master_mnsr and drive descs */ 5374 mnsr_node = master_mnsr_node; 5375 while (mnsr_node) { 5376 master_mnsr_node = mnsr_node->mmn_next; 5377 free_sr((md_set_record *)mnsr_node->mmn_mnsr); 5378 free_rem_dd(mnsr_node->mmn_dd); 5379 Free(mnsr_node); 5380 mnsr_node = master_mnsr_node; 5381 } 5382 5383 /* Frees sd->sd_drvs (which is also master_dd) */ 5384 metaflushsetname(sp); 5385 return (rval); 5386 } 5387 5388 /* 5389 * meta_mnsync_diskset_mddbs 5390 * Calling node is guaranteed to be an owner node. 5391 * Calling node is the master node. 5392 * 5393 * Master node verifies that ondisk mddb format matches its incore format. 
5394 * If no nodes are joined to set, remove the change log entries. 5395 * If a node is joined to set, play the change log. 5396 * 5397 * Returns 0 - Success 5398 * 1 - Master unable to join to set. 5399 * 205 - Failure during RPC to another node 5400 * -1 - Any other failure and ep is filled in. 5401 * -1 return will eventually cause node to panic 5402 * in a SunCluster environment. 5403 */ 5404 int 5405 meta_mnsync_diskset_mddbs( 5406 mdsetname_t *sp, 5407 md_error_t *ep 5408 ) 5409 { 5410 md_set_desc *sd; 5411 mddb_config_t c; 5412 md_mn_msgclass_t class; 5413 mddb_setflags_config_t sf; 5414 md_mnnode_desc *nd, *nd2; 5415 md_error_t xep = mdnullerror; 5416 int stale_set = 0; 5417 5418 /* If setname is there, set desc should exist. */ 5419 if ((sd = metaget_setdesc(sp, ep)) == NULL) { 5420 mde_perror(ep, dgettext(TEXT_DOMAIN, 5421 "Unable to get set %s desc information"), sp->setname); 5422 return (-1); 5423 } 5424 5425 /* Are there drives in the set? */ 5426 if (metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST), 5427 ep) == NULL) { 5428 if (! mdisok(ep)) { 5429 return (-1); 5430 } 5431 /* No drives in set -- nothing to sync up */ 5432 return (0); 5433 } 5434 5435 /* 5436 * Is master node (which is this node) joined to set? 5437 * If master node isn't joined (which means that no nodes 5438 * are joined to diskset), remove the change log entries 5439 * since no need to replay them - all nodes will have same 5440 * view of mddbs since all nodes are reading in the mddbs 5441 * from disk. 5442 * There is also no need to sync up the master and ondisk mddbs 5443 * since master has no incore knowledge. 5444 * Need to join master to set in order to flush the change 5445 * log entries. Don't need to block I/O during join of master 5446 * to set since no other nodes are joined to set and so no I/O 5447 * can be occurring. 
5448 */ 5449 if (!(sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN)) { 5450 /* Join master to set */ 5451 if (clnt_joinset(mynode(), sp, 5452 MNSET_IN_RECONFIG, ep)) { 5453 if (mdismddberror(ep, MDE_DB_STALE)) { 5454 /* 5455 * If STALE, print message and continue on. 5456 * Don't do any writes or reads to mddbs 5457 * so don't clear change log. 5458 */ 5459 mde_perror(ep, dgettext(TEXT_DOMAIN, 5460 "Join of master node to STALE set %s"), 5461 sp->setname); 5462 stale_set = 1; 5463 mdclrerror(ep); 5464 } else if (mdismddberror(ep, MDE_DB_ACCOK)) { 5465 /* ACCOK means mediator provided extra vote */ 5466 mdclrerror(ep); 5467 } else { 5468 /* 5469 * If master is unable to join set, print an 5470 * error message. Don't return failure or node 5471 * will panic during cluster reconfig cycle. 5472 * Also, withdraw node from set in order to 5473 * cleanup from failed join attempt. 5474 */ 5475 mde_perror(ep, dgettext(TEXT_DOMAIN, 5476 "Join of master node in set %s failed"), 5477 sp->setname); 5478 if (clnt_withdrawset(mynode(), sp, &xep)) 5479 mdclrerror(&xep); 5480 return (1); 5481 } 5482 } 5483 /* 5484 * Master node successfully joined. 5485 * Set local copy of flags to OWN and 5486 * send owner flag to rpc.metad. If not stale, 5487 * flush the change log. 
5488 */ 5489 sd->sd_mn_mynode->nd_flags |= MD_MN_NODE_OWN; 5490 if (clnt_upd_nr_flags(mynode(), sp, sd->sd_nodelist, MD_NR_SET, 5491 MNSET_IN_RECONFIG, ep)) { 5492 mde_perror(ep, dgettext(TEXT_DOMAIN, 5493 "Flag update of master node join in set %s failed"), 5494 sp->setname); 5495 return (-1); 5496 } 5497 5498 if (!stale_set) { 5499 if (mdmn_reset_changelog(sp, ep, 5500 MDMN_CLF_RESETLOG) != 0) { 5501 mde_perror(ep, dgettext(TEXT_DOMAIN, 5502 "Unable to reset changelog.")); 5503 return (-1); 5504 } 5505 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 5506 "Removed changelog entries for set %s: %s"), 5507 sp->setname, 5508 meta_print_hrtime(gethrtime() - start_time)); 5509 } 5510 /* Reset new master flag before return */ 5511 (void) memset(&sf, 0, sizeof (sf)); 5512 sf.sf_setno = sp->setno; 5513 sf.sf_setflags = MD_SET_MN_NEWMAS_RC; 5514 sf.sf_flags = MDDB_NM_RESET; 5515 /* Use magic to help protect ioctl against attack. */ 5516 sf.sf_magic = MDDB_SETFLAGS_MAGIC; 5517 /* Ignore failure, failure to reset flag isn't catastrophic */ 5518 (void) metaioctl(MD_MN_SET_SETFLAGS, &sf, 5519 &sf.sf_mde, NULL); 5520 5521 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 5522 "Reset new master flag for set %s: %s"), 5523 sp->setname, meta_print_hrtime(gethrtime() - start_time)); 5524 5525 return (0); 5526 } 5527 5528 /* 5529 * Is master already joined to STALE set (< 50% mddbs avail)? 5530 * If so, can make no config changes to mddbs so don't check or play 5531 * changelog and don't sync master node to ondisk mddbs. 5532 * To get out of the stale state all nodes must be withdrawn 5533 * from set. Then as nodes are re-joined, all nodes will 5534 * have same view of mddbs since all nodes are reading the 5535 * mddbs from disk. 
5536 */ 5537 (void) memset(&c, 0, sizeof (c)); 5538 c.c_id = 0; 5539 c.c_setno = sp->setno; 5540 if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) { 5541 (void) mdstealerror(ep, &c.c_mde); 5542 return (-1); 5543 } 5544 if (c.c_flags & MDDB_C_STALE) { 5545 return (0); 5546 } 5547 5548 /* 5549 * If this node is NOT a newly chosen master, then there's 5550 * nothing else to do since the change log should be empty and 5551 * the ondisk and incore mddbs are already consistent. 5552 * 5553 * A newly chosen master is a node that was not the master 5554 * at the beginning of the reconfig cycle. If a node is a new 5555 * master, then the new master state is reset after the ondisk 5556 * and incore mddbs are consistent and the change log has 5557 * been replayed. 5558 */ 5559 (void) memset(&sf, 0, sizeof (sf)); 5560 sf.sf_setno = sp->setno; 5561 sf.sf_flags = MDDB_NM_GET; 5562 /* Use magic to help protect ioctl against attack. */ 5563 sf.sf_magic = MDDB_SETFLAGS_MAGIC; 5564 if ((metaioctl(MD_MN_GET_SETFLAGS, &sf, &sf.sf_mde, NULL) == 0) && 5565 ((sf.sf_setflags & MD_SET_MN_NEWMAS_RC) == 0)) { 5566 return (0); 5567 } 5568 5569 /* 5570 * Now, sync up incore master view to ondisk mddbs. 5571 * This is needed in the case where a master node 5572 * had made a change to the mddb, but this change 5573 * may not have been relayed to the slaves yet. 5574 * So, the new master needs to verify that the ondisk 5575 * mddbs match what the new master has incore - 5576 * if different, new master rewrites all of the mddbs. 5577 * Then the new master will replay the changelog and the 5578 * new master will then execute what the old master had 5579 * done. 5580 * 5581 * Block all I/Os to disks in this diskset on all nodes in 5582 * the diskset. This will allow the rewriting of the mddbs 5583 * (if needed), to proceed in a timely manner. 5584 * 5585 * If block of I/Os fail, return a -1. 
5586 */ 5587 5588 nd = sd->sd_nodelist; 5589 while (nd) { 5590 /* Skip non-alive and non-owner nodes */ 5591 if ((!(nd->nd_flags & MD_MN_NODE_ALIVE)) || 5592 (!(nd->nd_flags & MD_MN_NODE_OWN))) { 5593 nd = nd->nd_next; 5594 continue; 5595 } 5596 if (clnt_mn_susp_res_io(nd->nd_nodename, sp->setno, 5597 MN_SUSP_IO, ep)) { 5598 mde_perror(ep, dgettext(TEXT_DOMAIN, 5599 "Unable to suspend I/O on node %s in set %s"), 5600 nd->nd_nodename, sp->setname); 5601 5602 /* 5603 * Resume all other nodes that had been suspended. 5604 * (Reconfig return step also resumes I/Os 5605 * for all sets.) 5606 */ 5607 nd2 = sd->sd_nodelist; 5608 while (nd2) { 5609 /* Stop when reaching failed node */ 5610 if (nd2->nd_nodeid == nd->nd_nodeid) 5611 break; 5612 /* Skip non-alive and non-owner nodes */ 5613 if ((!(nd2->nd_flags & MD_MN_NODE_ALIVE)) || 5614 (!(nd2->nd_flags & MD_MN_NODE_OWN))) { 5615 nd2 = nd2->nd_next; 5616 continue; 5617 } 5618 (void) (clnt_mn_susp_res_io(nd2->nd_nodename, 5619 sp->setno, MN_RES_IO, &xep)); 5620 nd2 = nd2->nd_next; 5621 } 5622 5623 /* 5624 * If an RPC failure on another node, return a 205. 5625 * Otherwise, exit with failure. 5626 */ 5627 if ((mdanyrpcerror(ep)) && 5628 (sd->sd_mn_mynode->nd_nodeid != 5629 nd->nd_nodeid)) { 5630 return (205); 5631 } else { 5632 return (-1); 5633 } 5634 5635 } 5636 nd = nd->nd_next; 5637 } 5638 5639 (void) memset(&c, 0, sizeof (c)); 5640 c.c_id = 0; 5641 c.c_setno = sp->setno; 5642 /* Master can't sync up to ondisk mddbs? Kick it out of cluster */ 5643 if (metaioctl(MD_MN_CHK_WRT_MDDB, &c, &c.c_mde, NULL) != 0) 5644 return (-1); 5645 5646 /* 5647 * Resume I/Os that were suspended above. 
5648 */ 5649 nd = sd->sd_nodelist; 5650 while (nd) { 5651 /* Skip non-alive and non-owner nodes */ 5652 if ((!(nd->nd_flags & MD_MN_NODE_ALIVE)) || 5653 (!(nd->nd_flags & MD_MN_NODE_OWN))) { 5654 nd = nd->nd_next; 5655 continue; 5656 } 5657 if (clnt_mn_susp_res_io(nd->nd_nodename, sp->setno, 5658 MN_RES_IO, ep)) { 5659 mde_perror(ep, dgettext(TEXT_DOMAIN, 5660 "Unable to resume I/O on node %s in set %s"), 5661 nd->nd_nodename, sp->setname); 5662 5663 /* 5664 * If an RPC failure then don't do any 5665 * more RPC calls, since one timeout is enough 5666 * to endure. If RPC failure to another node, return 5667 * 205. If RPC failure to my node, return -1. 5668 * If not an RPC failure, continue resuming the 5669 * rest of the nodes and then return -1. 5670 */ 5671 if (mdanyrpcerror(ep)) { 5672 if (sd->sd_mn_mynode->nd_nodeid == 5673 nd->nd_nodeid) { 5674 return (-1); 5675 } else { 5676 return (205); 5677 } 5678 } 5679 5680 /* 5681 * If not an RPC error, continue resuming rest of 5682 * nodes, ignoring any failures except for an 5683 * RPC failure which constitutes an immediate exit. 5684 * Start in middle of list with failing node. 5685 */ 5686 nd2 = nd->nd_next; 5687 while (nd2) { 5688 /* Skip non-alive and non-owner nodes */ 5689 if ((!(nd2->nd_flags & MD_MN_NODE_ALIVE)) || 5690 (!(nd2->nd_flags & MD_MN_NODE_OWN))) { 5691 nd2 = nd2->nd_next; 5692 continue; 5693 } 5694 (void) (clnt_mn_susp_res_io(nd2->nd_nodename, 5695 sp->setno, MN_RES_IO, &xep)); 5696 if (mdanyrpcerror(&xep)) { 5697 return (-1); 5698 } 5699 nd2 = nd2->nd_next; 5700 } 5701 } 5702 nd = nd->nd_next; 5703 } 5704 5705 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, "Master node has completed " 5706 "checking/writing the mddb for set %s: %s"), sp->setname, 5707 meta_print_hrtime(gethrtime() - start_time)); 5708 5709 /* 5710 * Send (aka replay) all messages we find in the changelog. 
5711 * Flag the messages with 5712 * MD_MSGF_REPLAY_MSG, so no new message ID is generated for them 5713 * MD_MSGF_OVERRIDE_SUSPEND so they can pass the suspended commd. 5714 */ 5715 for (class = MD_MN_NCLASSES - 1; class > 0; class--) { 5716 mdmn_changelog_record_t *lr; 5717 md_error_t xep = mdnullerror; 5718 md_mn_result_t *resultp = NULL; 5719 int ret; 5720 5721 lr = mdmn_get_changelogrec(sp->setno, class); 5722 if ((lr->lr_flags & MD_MN_LR_INUSE) == 0) { 5723 /* no entry for this class */ 5724 continue; 5725 } 5726 5727 meta_mc_log(MC_LOG1, dgettext(TEXT_DOMAIN, 5728 "replaying message ID=(%d, 0x%llx-%d)\n"), 5729 MSGID_ELEMS(lr->lr_msg.msg_msgid)); 5730 5731 ret = mdmn_send_message_with_msgid( 5732 lr->lr_msg.msg_setno, 5733 lr->lr_msg.msg_type, 5734 lr->lr_msg.msg_flags | MD_MSGF_REPLAY_MSG | 5735 MD_MSGF_OVERRIDE_SUSPEND, 5736 lr->lr_msg.msg_event_data, 5737 lr->lr_msg.msg_event_size, 5738 &resultp, 5739 &lr->lr_msg.msg_msgid, 5740 &xep); 5741 5742 meta_mc_log(MC_LOG1, dgettext(TEXT_DOMAIN, 5743 "mdmn_send_message returned %d\n"), ret); 5744 5745 if (resultp) 5746 free_result(resultp); 5747 } 5748 5749 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 5750 "Playing changelog completed for set %s: %s"), 5751 sp->setname, meta_print_hrtime(gethrtime() - start_time)); 5752 5753 /* 5754 * Now that new master has ondisk and incore mddbs in sync, reset 5755 * this node's new master kernel flag (for this set). If this node 5756 * re-enters another reconfig cycle before the completion of this 5757 * reconfig cycle, this master node won't need to check if the ondisk 5758 * and incore mddbs are in sync since this node won't be considered 5759 * a new master (since this flag is being reset here in the middle of 5760 * step2). This will save time during any subsequent reconfig 5761 * cycles as long as this node continues to be master. 
5762 */ 5763 (void) memset(&sf, 0, sizeof (sf)); 5764 sf.sf_setno = sp->setno; 5765 sf.sf_setflags = MD_SET_MN_NEWMAS_RC; 5766 sf.sf_flags = MDDB_NM_RESET; 5767 /* Use magic to help protect ioctl against attack. */ 5768 sf.sf_magic = MDDB_SETFLAGS_MAGIC; 5769 /* Ignore failure, since failure to reset flag isn't catastrophic */ 5770 (void) metaioctl(MD_MN_SET_SETFLAGS, &sf, &sf.sf_mde, NULL); 5771 5772 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 5773 "Reset new master flag for set %s: %s"), 5774 sp->setname, meta_print_hrtime(gethrtime() - start_time)); 5775 5776 return (0); 5777 } 5778 5779 /* 5780 * meta_mnjoin_all will join all starting nodes in the diskset. 5781 * A starting node is considered to be any node that is not 5782 * an owner of the set but is a member of the cluster. 5783 * Master node is already joined to set (done in meta_mnsync_diskset_mddbs). 5784 * 5785 * Caller is the Master node. 5786 * 5787 * Returns 0 - Success 5788 * 205 - Failure during RPC to another node 5789 * -1 - Any other failure and ep is filled in. 5790 */ 5791 int 5792 meta_mnjoin_all( 5793 mdsetname_t *sp, 5794 md_error_t *ep 5795 ) 5796 { 5797 md_set_desc *sd; 5798 md_mnnode_desc *nd, *nd2; 5799 int rval = 0; 5800 int stale_flag = 0; 5801 mddb_config_t c; 5802 int susp_res_flag = 0; 5803 md_error_t xep = mdnullerror; 5804 5805 /* If setname is there, set desc should exist. */ 5806 if ((sd = metaget_setdesc(sp, ep)) == NULL) { 5807 mde_perror(ep, dgettext(TEXT_DOMAIN, 5808 "Unable to get set %s desc information"), sp->setname); 5809 return (-1); 5810 } 5811 5812 /* Are there drives in the set? */ 5813 if (metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST), 5814 ep) == NULL) { 5815 if (! mdisok(ep)) { 5816 return (-1); 5817 } 5818 /* No drives in set -- nothing to join */ 5819 return (0); 5820 } 5821 5822 /* 5823 * Is set currently stale? 
5824 */ 5825 (void) memset(&c, 0, sizeof (c)); 5826 c.c_id = 0; 5827 c.c_setno = sp->setno; 5828 /* Ignore failure since master node may not be joined yet */ 5829 (void) metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL); 5830 if (c.c_flags & MDDB_C_STALE) { 5831 stale_flag = MNSET_IS_STALE; 5832 } 5833 5834 /* 5835 * If any nodes are going to be joined to diskset, then 5836 * suspend I/O to all disks in diskset so that nodes can join 5837 * (read in mddbs) in a reasonable amount of time even under 5838 * high I/O load. Don't need to do this if set is STALE since 5839 * no I/O can be occurring to a STALE set. 5840 */ 5841 if (stale_flag != MNSET_IS_STALE) { 5842 nd = sd->sd_nodelist; 5843 while (nd) { 5844 /* Found a node that will be joined to diskset */ 5845 if ((nd->nd_flags & MD_MN_NODE_ALIVE) && 5846 (!(nd->nd_flags & MD_MN_NODE_OWN))) { 5847 /* Set flag that diskset should be suspended */ 5848 susp_res_flag = 1; 5849 break; 5850 } 5851 nd = nd->nd_next; 5852 } 5853 } 5854 5855 if (susp_res_flag) { 5856 /* 5857 * Block all I/Os to disks in this diskset on all joined 5858 * nodes in the diskset. 5859 * If block of I/Os fails due to an RPC failure on another 5860 * node, return 205; otherwise, return -1. 5861 */ 5862 nd = sd->sd_nodelist; 5863 while (nd) { 5864 /* Skip non-alive and non-owner nodes */ 5865 if ((!(nd->nd_flags & MD_MN_NODE_ALIVE)) || 5866 (!(nd->nd_flags & MD_MN_NODE_OWN))) { 5867 nd = nd->nd_next; 5868 continue; 5869 } 5870 if (clnt_mn_susp_res_io(nd->nd_nodename, sp->setno, 5871 MN_SUSP_IO, ep)) { 5872 mde_perror(ep, dgettext(TEXT_DOMAIN, 5873 "Unable to suspend I/O on node %s" 5874 " in set %s"), nd->nd_nodename, 5875 sp->setname); 5876 /* 5877 * Resume other nodes that had been suspended. 5878 * (Reconfig return step also resumes I/Os 5879 * for all sets.) 
5880 */ 5881 nd2 = sd->sd_nodelist; 5882 while (nd2) { 5883 /* Stop when reaching failed node */ 5884 if (nd2->nd_nodeid == nd->nd_nodeid) 5885 break; 5886 /* Skip non-alive/non-owner nodes */ 5887 if ((!(nd2->nd_flags & 5888 MD_MN_NODE_ALIVE)) || 5889 (!(nd2->nd_flags & 5890 MD_MN_NODE_OWN))) { 5891 nd2 = nd2->nd_next; 5892 continue; 5893 } 5894 (void) (clnt_mn_susp_res_io( 5895 nd2->nd_nodename, sp->setno, 5896 MN_RES_IO, &xep)); 5897 nd2 = nd2->nd_next; 5898 } 5899 5900 /* 5901 * If the suspend failed due to an 5902 * RPC failure on another node, return 5903 * a 205. 5904 * Otherwise, exit with failure. 5905 * The return reconfig step will resume 5906 * I/Os for all disksets. 5907 */ 5908 if ((mdanyrpcerror(ep)) && 5909 (sd->sd_mn_mynode->nd_nodeid != 5910 nd->nd_nodeid)) { 5911 return (205); 5912 } else { 5913 return (-1); 5914 } 5915 } 5916 nd = nd->nd_next; 5917 } 5918 } 5919 5920 nd = sd->sd_nodelist; 5921 while (nd) { 5922 /* 5923 * If a node is in the membership list but isn't joined 5924 * to the set, try to join the node. 5925 */ 5926 if ((nd->nd_flags & MD_MN_NODE_ALIVE) && 5927 (!(nd->nd_flags & MD_MN_NODE_OWN))) { 5928 if (clnt_joinset(nd->nd_nodename, sp, 5929 (MNSET_IN_RECONFIG | stale_flag), ep)) { 5930 /* 5931 * If RPC failure to another node 5932 * then exit without attempting anything else. 5933 * (Reconfig return step will resume I/Os 5934 * for all sets.) 5935 */ 5936 if (mdanyrpcerror(ep)) { 5937 mde_perror(ep, ""); 5938 return (205); 5939 } 5940 /* 5941 * STALE and ACCOK failures aren't true 5942 * failures. STALE means that <50% mddbs 5943 * are available. ACCOK means that the 5944 * mediator provided the extra vote. 5945 * If a true failure, then print messasge 5946 * and withdraw node from set in order to 5947 * cleanup from failed join attempt. 
5948 */ 5949 if ((!mdismddberror(ep, MDE_DB_STALE)) && 5950 (!mdismddberror(ep, MDE_DB_ACCOK))) { 5951 mde_perror(ep, 5952 "WARNING: Unable to join node %s " 5953 "to set %s", nd->nd_nodename, 5954 sp->setname); 5955 mdclrerror(ep); 5956 if (clnt_withdrawset(nd->nd_nodename, 5957 sp, &xep)) 5958 mdclrerror(&xep); 5959 nd = nd->nd_next; 5960 continue; 5961 } 5962 } 5963 /* Set owner flag even if STALE or ACCOK */ 5964 nd->nd_flags |= MD_MN_NODE_OWN; 5965 } 5966 nd = nd->nd_next; 5967 } 5968 /* 5969 * Resume I/Os if suspended above. 5970 */ 5971 if (susp_res_flag) { 5972 nd = sd->sd_nodelist; 5973 while (nd) { 5974 /* 5975 * Skip non-alive and non-owner nodes 5976 * (this list doesn't include any of 5977 * the nodes that were joined). 5978 */ 5979 if ((!(nd->nd_flags & MD_MN_NODE_ALIVE)) || 5980 (!(nd->nd_flags & MD_MN_NODE_OWN))) { 5981 nd = nd->nd_next; 5982 continue; 5983 } 5984 if (clnt_mn_susp_res_io(nd->nd_nodename, sp->setno, 5985 MN_RES_IO, ep)) { 5986 mde_perror(ep, dgettext(TEXT_DOMAIN, 5987 "Unable to resume I/O on node %s" 5988 " in set %s"), nd->nd_nodename, 5989 sp->setname); 5990 5991 /* 5992 * If an RPC failure then don't do any 5993 * more RPC calls, since one timeout is enough 5994 * to endure. If RPC failure to another node, 5995 * return 205. If RPC failure to my node, 5996 * return -1. 5997 * (Reconfig return step will resume I/Os 5998 * for all sets.) 5999 * If not an RPC failure, continue resuming the 6000 * rest of the nodes and then return -1. 6001 */ 6002 if (mdanyrpcerror(ep)) { 6003 if (sd->sd_mn_mynode->nd_nodeid == 6004 nd->nd_nodeid) { 6005 return (-1); 6006 } else { 6007 return (205); 6008 } 6009 } 6010 6011 /* 6012 * If not an RPC error, continue resuming rest 6013 * of nodes, ignoring any failures except for 6014 * an RPC failure which constitutes an 6015 * immediate exit. 6016 * Start in middle of list with failing node. 
6017 */ 6018 nd2 = nd->nd_next; 6019 while (nd2) { 6020 /* Skip non-owner nodes */ 6021 if ((!(nd2->nd_flags & 6022 MD_MN_NODE_ALIVE)) || 6023 (!(nd2->nd_flags & 6024 MD_MN_NODE_OWN))) { 6025 nd2 = nd2->nd_next; 6026 continue; 6027 } 6028 (void) (clnt_mn_susp_res_io( 6029 nd2->nd_nodename, sp->setno, 6030 MN_RES_IO, &xep)); 6031 if (mdanyrpcerror(&xep)) { 6032 return (-1); 6033 } 6034 nd2 = nd2->nd_next; 6035 } 6036 } 6037 nd = nd->nd_next; 6038 } 6039 } 6040 6041 nd = sd->sd_nodelist; 6042 while (nd) { 6043 if (!(nd->nd_flags & MD_MN_NODE_OWN)) { 6044 nd = nd->nd_next; 6045 continue; 6046 } 6047 /* 6048 * If 1 node fails - go ahead and update the rest except 6049 * in the case of an RPC failure, fail immediately. 6050 */ 6051 if (clnt_upd_nr_flags(nd->nd_nodename, sp, 6052 sd->sd_nodelist, MD_NR_SET, MNSET_IN_RECONFIG, ep)) { 6053 /* RPC failure to another node */ 6054 if (mdanyrpcerror(ep)) { 6055 return (205); 6056 } 6057 nd = nd->nd_next; 6058 rval = -1; 6059 continue; 6060 } 6061 nd = nd->nd_next; 6062 } 6063 6064 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 6065 "Join of all nodes completed for set %s: %s"), 6066 sp->setname, meta_print_hrtime(gethrtime() - start_time)); 6067 6068 return (rval); 6069 } 6070