1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #pragma ident "%Z%%M% %I% %E% SMI" 27 28 /* 29 * Just in case we're not in a build environment, make sure that 30 * TEXT_DOMAIN gets set to something. 
31 */ 32 #if !defined(TEXT_DOMAIN) 33 #define TEXT_DOMAIN "SYS_TEST" 34 #endif 35 36 /* 37 * Metadevice diskset interfaces 38 */ 39 40 #include "meta_set_prv.h" 41 #include <meta.h> 42 #include <metad.h> 43 #include <mdmn_changelog.h> 44 #include <sys/lvm/md_crc.h> 45 #include <sys/utsname.h> 46 #include <sdssc.h> 47 48 #include <sys/sysevent/eventdefs.h> 49 #include <sys/sysevent/svm.h> 50 extern char *blkname(char *); 51 52 static md_drive_desc * 53 dr2drivedesc( 54 mdsetname_t *sp, 55 side_t sideno, 56 int flags, 57 md_error_t *ep 58 ) 59 { 60 md_set_record *sr; 61 md_drive_record *dr; 62 mddrivename_t *dnp; 63 md_drive_desc *dd_head = NULL; 64 md_set_desc *sd; 65 66 if (flags & MD_BYPASS_DAEMON) { 67 if ((sr = metad_getsetbynum(sp->setno, ep)) == NULL) 68 return (NULL); 69 sd = metaget_setdesc(sp, ep); 70 sideno = getnodeside(mynode(), sd); 71 sp = metafakesetname(sp->setno, sr->sr_setname); 72 } else { 73 if ((sr = getsetbyname(sp->setname, ep)) == NULL) 74 return (NULL); 75 } 76 77 assert(sideno != MD_SIDEWILD); 78 79 /* 80 * WARNING: 81 * The act of getting the dnp from the namespace means that we 82 * will get the devid of the disk as recorded in the namespace. 83 * This devid has the potential to be stale if the disk is being 84 * replaced via a rebind, this means that any code that relies 85 * on any of the dnp information should take the appropriate action 86 * to preserve that information. For example in the rebind code the 87 * devid of the new disk is saved off and then copied back in once 88 * the code that has called this function has completed. 
89 */ 90 for (dr = sr->sr_drivechain; dr != NULL; dr = dr->dr_next) { 91 if ((dnp = metadrivename_withdrkey(sp, sideno, dr->dr_key, 92 flags, ep)) == NULL) { 93 if (!(flags & MD_BYPASS_DAEMON)) 94 free_sr(sr); 95 metafreedrivedesc(&dd_head); 96 return (NULL); 97 } 98 99 (void) metadrivedesc_append(&dd_head, dnp, dr->dr_dbcnt, 100 dr->dr_dbsize, dr->dr_ctime, dr->dr_genid, dr->dr_flags); 101 } 102 103 if (!(flags & MD_BYPASS_DAEMON)) { 104 free_sr(sr); 105 } 106 return (dd_head); 107 } 108 109 static int 110 get_sidenmlist( 111 mdsetname_t *sp, 112 mddrivename_t *dnp, 113 md_error_t *ep 114 ) 115 { 116 md_set_desc *sd; 117 mdsidenames_t *sn, **sn_next; 118 int i; 119 120 if ((sd = metaget_setdesc(sp, ep)) == NULL) 121 return (-1); 122 123 metaflushsidenames(dnp); 124 sn_next = &dnp->side_names; 125 if (MD_MNSET_DESC(sd)) { 126 /* 127 * Only get sidenames for this node since 128 * that is the only side information stored in 129 * the local mddb for a multi-node diskset. 130 */ 131 if (sd->sd_mn_mynode) { 132 sn = Zalloc(sizeof (*sn)); 133 sn->sideno = sd->sd_mn_mynode->nd_nodeid; 134 if ((sn->cname = meta_getnmentbykey(MD_LOCAL_SET, 135 sn->sideno, dnp->side_names_key, &sn->dname, 136 &sn->mnum, NULL, ep)) == NULL) { 137 if (sn->dname != NULL) 138 Free(sn->dname); 139 Free(sn); 140 return (-1); 141 } 142 143 /* Add to the end of the linked list */ 144 assert(*sn_next == NULL); 145 *sn_next = sn; 146 sn_next = &sn->next; 147 } 148 } else { 149 for (i = 0; i < MD_MAXSIDES; i++) { 150 /* Skip empty slots */ 151 if (sd->sd_nodes[i][0] == '\0') 152 continue; 153 154 sn = Zalloc(sizeof (*sn)); 155 sn->sideno = i; 156 if ((sn->cname = meta_getnmentbykey(MD_LOCAL_SET, 157 i+SKEW, dnp->side_names_key, &sn->dname, 158 &sn->mnum, NULL, ep)) == NULL) { 159 /* 160 * It is possible that during the add of a 161 * host to have a 'missing' side as the side 162 * for this disk will be added later. So ignore 163 * the error. 
The 'missing' side will be added 164 * once the addhosts process has completed. 165 */ 166 if (mdissyserror(ep, ENOENT)) { 167 mdclrerror(ep); 168 Free(sn); 169 continue; 170 } 171 172 if (sn->dname != NULL) 173 Free(sn->dname); 174 Free(sn); 175 return (-1); 176 } 177 178 /* Add to the end of the linked list */ 179 assert(*sn_next == NULL); 180 *sn_next = sn; 181 sn_next = &sn->next; 182 } 183 } 184 185 return (0); 186 } 187 188 static md_drive_desc * 189 rl_to_dd( 190 mdsetname_t *sp, 191 md_replicalist_t *rlp, 192 md_error_t *ep 193 ) 194 { 195 md_replicalist_t *rl; 196 md_replica_t *r; 197 md_drive_desc *dd = NULL; 198 md_drive_desc *d; 199 int found; 200 md_set_desc *sd; 201 daddr_t nblks = 0; 202 203 if ((sd = metaget_setdesc(sp, ep)) == NULL) 204 return (NULL); 205 206 /* find the smallest existing replica */ 207 for (rl = rlp; rl != NULL; rl = rl->rl_next) { 208 r = rl->rl_repp; 209 nblks = ((nblks == 0) ? r->r_nblk : min(r->r_nblk, nblks)); 210 } 211 212 if (nblks <= 0) 213 nblks = (MD_MNSET_DESC(sd)) ? MD_MN_DBSIZE : MD_DBSIZE; 214 215 for (rl = rlp; rl != NULL; rl = rl->rl_next) { 216 r = rl->rl_repp; 217 218 found = 0; 219 for (d = dd; d != NULL; d = d->dd_next) { 220 if (strcmp(r->r_namep->drivenamep->cname, 221 d->dd_dnp->cname) == 0) { 222 found = 1; 223 dd->dd_dbcnt++; 224 break; 225 } 226 } 227 228 if (! 
found) 229 (void) metadrivedesc_append(&dd, r->r_namep->drivenamep, 230 1, nblks, sd->sd_ctime, sd->sd_genid, MD_DR_OK); 231 } 232 233 return (dd); 234 } 235 236 /* 237 * Exported Entry Points 238 */ 239 240 set_t 241 get_max_sets(md_error_t *ep) 242 { 243 244 static set_t max_sets = 0; 245 246 if (max_sets == 0) 247 if (metaioctl(MD_IOCGETNSET, &max_sets, ep, NULL) != 0) 248 return (0); 249 250 return (max_sets); 251 } 252 253 int 254 get_max_meds(md_error_t *ep) 255 { 256 static int max_meds = 0; 257 258 if (max_meds == 0) 259 if (metaioctl(MD_MED_GET_NMED, &max_meds, ep, NULL) != 0) 260 return (0); 261 262 return (max_meds); 263 } 264 265 side_t 266 getmyside(mdsetname_t *sp, md_error_t *ep) 267 { 268 md_set_desc *sd; 269 char *node = NULL; 270 side_t sideno; 271 272 if (sp->setno == 0) 273 return (0); 274 275 if ((sd = metaget_setdesc(sp, ep)) == NULL) 276 return (MD_SIDEWILD); 277 278 node = mynode(); 279 280 assert(node != NULL); 281 282 sideno = getnodeside(node, sd); 283 284 if (sideno != MD_SIDEWILD) 285 return (sideno); 286 287 return (mddserror(ep, MDE_DS_HOSTNOSIDE, sp->setno, node, NULL, node)); 288 } 289 290 /* 291 * get set info from name 292 */ 293 md_set_record * 294 getsetbyname(char *setname, md_error_t *ep) 295 { 296 md_set_record *sr = NULL; 297 md_mnset_record *mnsr = NULL; 298 char *p; 299 size_t len; 300 301 /* get set info from daemon */ 302 if (clnt_getset(mynode(), setname, MD_SET_BAD, &sr, ep) == -1) 303 return (NULL); 304 if (sr != NULL) { 305 /* 306 * Returned record could be for a multi-node set or a 307 * non-multi-node set. 308 */ 309 if (MD_MNSET_REC(sr)) { 310 /* 311 * Record is for a multi-node set. Reissue call 312 * to get mnset information. Need to free 313 * record as if a non-multi-node set record since 314 * that is what clnt_getset gave us. If in 315 * the daemon, don't free since this is a pointer 316 * into the setrecords array. 317 */ 318 if (! 
md_in_daemon) { 319 sr->sr_flags &= ~MD_SR_MN; 320 free_sr(sr); 321 } 322 if (clnt_mngetset(mynode(), setname, MD_SET_BAD, &mnsr, 323 ep) == -1) 324 return (NULL); 325 if (mnsr != NULL) 326 return ((struct md_set_record *)mnsr); 327 } else { 328 return (sr); 329 } 330 } 331 332 /* no such set */ 333 len = strlen(setname) + 30; 334 p = Malloc(len); 335 (void) snprintf(p, len, "setname \"%s\"", setname); 336 (void) mderror(ep, MDE_NO_SET, p); 337 Free(p); 338 return (NULL); 339 } 340 341 /* 342 * get set info from number 343 */ 344 md_set_record * 345 getsetbynum(set_t setno, md_error_t *ep) 346 { 347 md_set_record *sr; 348 md_mnset_record *mnsr = NULL; 349 char buf[100]; 350 351 if (clnt_getset(mynode(), NULL, setno, &sr, ep) == -1) 352 return (NULL); 353 354 if (sr != NULL) { 355 /* 356 * Record is for a multi-node set. Reissue call 357 * to get mnset information. Need to free 358 * record as if a non-multi-node set record since 359 * that is what clnt_getset gave us. If in 360 * the daemon, don't free since this is a pointer 361 * into the setrecords array. 362 */ 363 if (MD_MNSET_REC(sr)) { 364 /* 365 * Record is for a multi-node set. Reissue call 366 * to get mnset information. 367 */ 368 if (! 
md_in_daemon) { 369 sr->sr_flags &= ~MD_SR_MN; 370 free_sr(sr); 371 } 372 if (clnt_mngetset(mynode(), NULL, setno, &mnsr, 373 ep) == -1) 374 return (NULL); 375 if (mnsr != NULL) 376 return ((struct md_set_record *)mnsr); 377 } else { 378 return (sr); 379 } 380 } 381 382 (void) sprintf(buf, "setno %u", setno); 383 (void) mderror(ep, MDE_NO_SET, buf); 384 return (NULL); 385 } 386 387 int 388 meta_check_drive_inuse( 389 mdsetname_t *sp, 390 mddrivename_t *dnp, 391 int check_db, 392 md_error_t *ep 393 ) 394 { 395 mdnamelist_t *nlp = NULL; 396 mdnamelist_t *p; 397 int rval = 0; 398 399 /* get all underlying partitions */ 400 if (meta_getalldevs(sp, &nlp, check_db, ep) != 0) 401 return (-1); 402 403 /* search for drive */ 404 for (p = nlp; (p != NULL); p = p->next) { 405 mdname_t *np = p->namep; 406 407 if (strcmp(dnp->cname, np->drivenamep->cname) == 0) { 408 rval = (mddserror(ep, MDE_DS_DRIVEINUSE, sp->setno, 409 NULL, dnp->cname, sp->setname)); 410 break; 411 } 412 } 413 414 /* cleanup, return success */ 415 metafreenamelist(nlp); 416 return (rval); 417 } 418 419 /* 420 * simple check for ownership 421 */ 422 int 423 meta_check_ownership(mdsetname_t *sp, md_error_t *ep) 424 { 425 int ownset; 426 md_set_desc *sd; 427 md_drive_desc *dd; 428 md_replicalist_t *rlp = NULL; 429 md_error_t xep = mdnullerror; 430 431 if (metaislocalset(sp)) 432 return (0); 433 434 ownset = own_set(sp, NULL, TRUE, ep); 435 if (! mdisok(ep)) 436 return (-1); 437 438 if ((sd = metaget_setdesc(sp, ep)) == NULL) 439 return (-1); 440 441 dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST), ep); 442 if (! 
mdisok(ep)) 443 return (-1); 444 445 /* If we have no drive descriptors, check for no ownership */ 446 if (dd == NULL) { 447 if (ownset == MD_SETOWNER_NONE) 448 return (0); 449 450 /* If ownership somehow has come to exist, we must clean up */ 451 452 if (metareplicalist(sp, (MD_BASICNAME_OK | PRINT_FAST), &rlp, 453 &xep) < 0) 454 mdclrerror(&xep); 455 456 if ((dd = rl_to_dd(sp, rlp, &xep)) == NULL) 457 if (! mdisok(&xep)) 458 mdclrerror(&xep); 459 460 if (!(MD_MNSET_DESC(sd)) && !MD_ATSET_DESC(sd)) { 461 if (rel_own_bydd(sp, dd, TRUE, &xep)) 462 mdclrerror(&xep); 463 } 464 465 if (halt_set(sp, &xep)) 466 mdclrerror(&xep); 467 468 metafreereplicalist(rlp); 469 470 metafreedrivedesc(&dd); 471 472 return (0); 473 } 474 475 metafreedrivedesc(&sd->sd_drvs); 476 477 if (ownset == MD_SETOWNER_YES) 478 return (0); 479 480 return (mddserror(ep, MDE_DS_NOOWNER, sp->setno, NULL, NULL, 481 sp->setname)); 482 } 483 484 /* 485 * simple check for ownership 486 */ 487 int 488 meta_check_ownership_on_host(mdsetname_t *sp, char *hostname, md_error_t *ep) 489 { 490 md_set_desc *sd; 491 md_drive_desc *dd; 492 int bool; 493 494 if (metaislocalset(sp)) 495 return (0); 496 497 if ((sd = metaget_setdesc(sp, ep)) == NULL) 498 return (-1); 499 500 if (getnodeside(hostname, sd) == MD_SIDEWILD) 501 return (mddserror(ep, MDE_DS_NODENOTINSET, sp->setno, 502 hostname, NULL, sp->setname)); 503 504 dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST), ep); 505 if (! mdisok(ep)) 506 return (-1); 507 508 if (clnt_ownset(hostname, sp, &bool, ep) == -1) 509 return (-1); 510 511 if (dd == NULL) 512 return (0); 513 514 metafreedrivedesc(&sd->sd_drvs); 515 516 if (bool == TRUE) 517 return (0); 518 519 return (mddserror(ep, MDE_DS_NODEISNOTOWNER, sp->setno, hostname, NULL, 520 sp->setname)); 521 } 522 523 /* 524 * Function that determines if a node is in the multinode diskset 525 * membership list. Calling node passes in node to be checked and 526 * the nodelist as returned from meta_read_nodelist. 
This routine 527 * anticipates being called many times using the same diskset membership 528 * list which is why the alloc and free of the diskset membership list 529 * is left to the calling routine. 530 * Returns: 531 * 1 - if a member 532 * 0 - not a member 533 */ 534 int 535 meta_is_member( 536 char *node_name, 537 md_mn_nodeid_t node_id, 538 mndiskset_membershiplist_t *nl 539 ) 540 { 541 mndiskset_membershiplist_t *nl2; 542 int flag_check_name; 543 544 if (node_id != 0) 545 flag_check_name = 0; 546 else if (node_name != NULL) 547 flag_check_name = 1; 548 else 549 return (0); 550 551 nl2 = nl; 552 while (nl2) { 553 if (flag_check_name) { 554 /* Compare given name against name in member list */ 555 if (strcmp(nl2->msl_node_name, node_name) == 0) 556 break; 557 } else { 558 /* Compare given nodeid against nodeid in member list */ 559 if (nl2->msl_node_id == node_id) 560 break; 561 } 562 nl2 = nl2->next; 563 } 564 /* No match found in member list */ 565 if (nl2 == NULL) { 566 return (0); 567 } 568 /* Return 1 if node is in member list */ 569 return (1); 570 } 571 572 /* 573 * meta_getnext_devinfo should go to the host that 574 * has the device, to return the device name, driver name, minor num. 575 * We can take the big cheat for now, since it is a requirement 576 * that the device names and device numbers are the same, and 577 * just get the info locally. 578 * 579 * This routine is very similar to meta_getnextside_devinfo except 580 * that the specific side to be used is being passed in. 
581 * 582 * Exit status: 583 * 0 - No more side info to return 584 * 1 - More side info's to return 585 * -1 - An error has been detected 586 */ 587 /*ARGSUSED*/ 588 int 589 meta_getside_devinfo( 590 mdsetname_t *sp, /* for this set */ 591 char *bname, /* local block name (myside) */ 592 side_t sideno, /* sideno */ 593 char **ret_bname, /* block device name of returned side */ 594 char **ret_dname, /* driver name of returned side */ 595 minor_t *ret_mnum, /* minor number of returned side */ 596 md_error_t *ep 597 ) 598 { 599 mdname_t *np; 600 601 if (ret_bname != NULL) 602 *ret_bname = NULL; 603 if (ret_dname != NULL) 604 *ret_dname = NULL; 605 if (ret_mnum != NULL) 606 *ret_mnum = NODEV32; 607 608 609 if ((np = metaname(&sp, bname, LOGICAL_DEVICE, ep)) == NULL) 610 return (-1); 611 612 /* 613 * NOTE (future) - There will be more work here once devids are integrated 614 * into disksets. Then the side should be used to find the correct 615 * host and the b/d names should be gotten from that host. 616 */ 617 618 /* 619 * Return the side info. 620 */ 621 if (ret_bname != NULL) 622 *ret_bname = Strdup(np->bname); 623 624 if (ret_dname != NULL) { 625 mdcinfo_t *cinfo; 626 627 if ((cinfo = metagetcinfo(np, ep)) == NULL) 628 return (-1); 629 630 *ret_dname = Strdup(cinfo->dname); 631 } 632 633 if (ret_mnum != NULL) 634 *ret_mnum = meta_getminor(np->dev); 635 636 return (1); 637 } 638 639 /* 640 * Get the information on the device from the remote node using the devid 641 * of the disk. 
642 * 643 * Exit status: 644 * 0 - No more side info to return 645 * 1 - More side info's to return 646 * -1 - An error has been detected 647 */ 648 int 649 meta_getnextside_devinfo( 650 mdsetname_t *sp, /* for this set */ 651 char *bname, /* local block name (myside) */ 652 side_t *sideno, /* previous sideno & returned sideno */ 653 char **ret_bname, /* block device name of returned side */ 654 char **ret_dname, /* driver name of returned side */ 655 minor_t *ret_mnum, /* minor number of returned side */ 656 md_error_t *ep 657 ) 658 { 659 md_set_desc *sd; 660 int i; 661 mdname_t *np; 662 mddrivename_t *dnp; 663 char *devidstr = NULL; 664 int devidstrlen; 665 md_dev64_t retdev = NODEV64; 666 char *ret_devname = NULL; 667 char *ret_blkdevname = NULL; 668 char *ret_driver = NULL; 669 char *nodename; 670 int fd; 671 int ret = -1; 672 char *minor_name = NULL; 673 md_mnnode_desc *nd; 674 675 676 if (ret_bname != NULL) 677 *ret_bname = NULL; 678 if (ret_dname != NULL) 679 *ret_dname = NULL; 680 if (ret_mnum != NULL) 681 *ret_mnum = NODEV32; 682 683 if (metaislocalset(sp)) { 684 /* no more sides - we are done */ 685 if (*sideno != MD_SIDEWILD) 686 return (0); 687 688 /* First time through - set up return sideno */ 689 *sideno = 0; 690 } else { 691 692 /* 693 * Find the next sideno, starting after the one given. 694 */ 695 if ((sd = metaget_setdesc(sp, ep)) == NULL) 696 return (-1); 697 698 if (MD_MNSET_DESC(sd)) { 699 nd = sd->sd_nodelist; 700 if ((*sideno == MD_SIDEWILD) && 701 (nd != (struct md_mnnode_desc *)NULL)) { 702 *sideno = nd->nd_nodeid; 703 } else { 704 while (nd) { 705 /* 706 * Found given sideno, now find 707 * next sideno, if there is one. 
708 */ 709 if ((*sideno == nd->nd_nodeid) && 710 (nd->nd_next != 711 (struct md_mnnode_desc *)NULL)) { 712 *sideno = 713 nd->nd_next->nd_nodeid; 714 break; 715 } 716 nd = nd->nd_next; 717 } 718 if (nd == NULL) { 719 return (0); 720 } 721 } 722 if (*sideno == MD_SIDEWILD) 723 return (0); 724 } else { 725 for (i = (*sideno)+1; i < MD_MAXSIDES; i++) 726 /* Find next full slot */ 727 if (sd->sd_nodes[i][0] != '\0') 728 break; 729 730 /* No more sides - we are done */ 731 if (i == MD_MAXSIDES) 732 return (0); 733 734 /* Set up the return sideno */ 735 *sideno = i; 736 nodename = (char *)sd->sd_nodes[i]; 737 } 738 } 739 740 /* 741 * Need to pass the node the devid of the disk and get it to 742 * send back the details of the disk from that side. 743 */ 744 if ((np = metaname(&sp, bname, UNKNOWN, ep)) == NULL) 745 return (-1); 746 747 dnp = np->drivenamep; 748 749 /* 750 * By default, set up the parameters so that they are copied out. 751 */ 752 if (ret_bname != NULL) 753 *ret_bname = Strdup(np->bname); 754 755 if (ret_dname != NULL) { 756 mdcinfo_t *cinfo; 757 758 if ((cinfo = metagetcinfo(np, ep)) == NULL) 759 return (-1); 760 761 *ret_dname = Strdup(cinfo->dname); 762 } 763 764 if (ret_mnum != NULL) 765 *ret_mnum = meta_getminor(np->dev); 766 767 /* 768 * Try some optimization. If this is the local set or the device 769 * is a metadevice then just copy the information. If the device 770 * does not have a devid (due to not having a minor name) then 771 * fall back to the pre-devid behaviour of copying the information 772 * on the device: this is okay because the sanity checks before this 773 * call would have found any issues with the device. If it's a 774 * multi-node diskset also just return ie. copy. 775 */ 776 if (metaislocalset(sp) || metaismeta(np) || (dnp->devid == NULL) || 777 (MD_MNSET_DESC(sd))) 778 return (1); 779 780 if (np->minor_name == (char *)NULL) { 781 /* 782 * Have to get the minor name then. 
The slice should exist 783 * on the disk because it will have already been repartitioned 784 * up prior to getting to this point. 785 */ 786 if ((fd = open(np->bname, (O_RDONLY|O_NDELAY), 0)) < 0) { 787 (void) mdsyserror(ep, errno, np->bname); 788 return (-1); 789 } 790 (void) devid_get_minor_name(fd, &minor_name); 791 np->minor_name = Strdup(minor_name); 792 devid_str_free(minor_name); 793 (void) close(fd); 794 } 795 796 /* allocate extra space for "/" and NULL hence +2 */ 797 devidstrlen = strlen(dnp->devid) + strlen(np->minor_name) + 2; 798 devidstr = (char *)Malloc(devidstrlen); 799 800 /* 801 * As a minor name is supplied then the ret_devname will be 802 * appropriate to that minor_name and in this case it will be 803 * a block device ie /dev/dsk. 804 */ 805 (void) snprintf(devidstr, devidstrlen, 806 "%s/%s", dnp->devid, np->minor_name); 807 808 ret = clnt_devinfo_by_devid(nodename, sp, devidstr, &retdev, 809 np->bname, &ret_devname, &ret_driver, ep); 810 811 Free(devidstr); 812 813 /* 814 * If the other side is not running device id in disksets, 815 * 'ret' is set to ENOTSUP in which case we fallback to 816 * the existing behaviour 817 */ 818 if (ret == ENOTSUP) 819 return (1); 820 else if (ret == -1) 821 return (-1); 822 823 /* 824 * ret_devname comes from the rpc call and is a 825 * raw device name. We need to make this into a 826 * block device via blkname for further processing. 827 * Unfortunately, when our device id isn't found in 828 * the system, the rpc call will return a " " in 829 * ret_devname in which case we need to fill that in 830 * as ret_blkname because blkname of " " returns NULL. 
831 */ 832 if (ret_bname != NULL && ret_devname != NULL) { 833 ret_blkdevname = blkname(ret_devname); 834 if (ret_blkdevname == NULL) 835 *ret_bname = Strdup(ret_devname); 836 else 837 *ret_bname = Strdup(ret_blkdevname); 838 } 839 840 if (ret_dname != NULL && ret_driver != NULL) 841 *ret_dname = Strdup(ret_driver); 842 843 if (ret_mnum != NULL) 844 *ret_mnum = meta_getminor(retdev); 845 846 return (1); 847 } 848 849 int 850 meta_is_drive_in_anyset( 851 mddrivename_t *dnp, 852 mdsetname_t **spp, 853 int bypass_daemon, 854 md_error_t *ep 855 ) 856 { 857 set_t setno; 858 mdsetname_t *this_sp; 859 int is_it; 860 set_t max_sets; 861 862 if ((max_sets = get_max_sets(ep)) == 0) 863 return (-1); 864 865 assert(spp != NULL); 866 *spp = NULL; 867 868 for (setno = 1; setno < max_sets; setno++) { 869 if (!bypass_daemon) { 870 if ((this_sp = metasetnosetname(setno, ep)) == NULL) { 871 if (mdismddberror(ep, MDE_DB_NODB)) { 872 mdclrerror(ep); 873 return (0); 874 } 875 if (mdiserror(ep, MDE_NO_SET)) { 876 mdclrerror(ep); 877 continue; 878 } 879 return (-1); 880 } 881 } else 882 this_sp = metafakesetname(setno, NULL); 883 884 if ((is_it = meta_is_drive_in_thisset(this_sp, dnp, 885 bypass_daemon, ep)) == -1) { 886 if (mdiserror(ep, MDE_NO_SET)) { 887 mdclrerror(ep); 888 continue; 889 } 890 return (-1); 891 } 892 if (is_it) { 893 *spp = this_sp; 894 return (0); 895 } 896 } 897 return (0); 898 } 899 900 int 901 meta_is_drive_in_thisset( 902 mdsetname_t *sp, 903 mddrivename_t *dnp, 904 int bypass_daemon, 905 md_error_t *ep 906 ) 907 { 908 md_drive_desc *dd, *p; 909 910 if (bypass_daemon) 911 dd = dr2drivedesc(sp, MD_SIDEWILD, 912 (MD_BASICNAME_OK | MD_BYPASS_DAEMON), ep); 913 else 914 dd = metaget_drivedesc(sp, MD_BASICNAME_OK, ep); 915 916 if (dd == NULL) { 917 if (! 
mdisok(ep)) 918 return (-1); 919 return (0); 920 } 921 922 923 for (p = dd; p != NULL; p = p->dd_next) 924 if (strcmp(p->dd_dnp->cname, dnp->cname) == 0) 925 return (1); 926 return (0); 927 } 928 929 int 930 meta_set_balance( 931 mdsetname_t *sp, 932 md_error_t *ep 933 ) 934 { 935 md_set_desc *sd; 936 md_drive_desc *dd, *curdd; 937 daddr_t dbsize; 938 daddr_t nblks; 939 int i; 940 int rval = 0; 941 sigset_t oldsigs; 942 md_setkey_t *cl_sk; 943 md_error_t xep = mdnullerror; 944 md_mnnode_desc *nd; 945 int suspend1_flag = 0; 946 947 if ((sd = metaget_setdesc(sp, ep)) == NULL) 948 return (-1); 949 950 dbsize = (MD_MNSET_DESC(sd)) ? MD_MN_DBSIZE : MD_DBSIZE; 951 952 /* Make sure we own the set */ 953 if (meta_check_ownership(sp, ep) != 0) 954 return (-1); 955 956 /* END CHECK CODE */ 957 958 /* 959 * Get drive descriptors for the drives that are currently in the set. 960 */ 961 curdd = metaget_drivedesc(sp, MD_FULLNAME_ONLY, ep); 962 963 if (! mdisok(ep)) 964 return (-1); 965 966 /* Find the minimum replica size in use is or use the default */ 967 if ((nblks = meta_db_minreplica(sp, ep)) < 0) 968 mdclrerror(ep); 969 else 970 dbsize = nblks; /* adjust replica size */ 971 972 /* Make sure we are blocking all signals */ 973 if (procsigs(TRUE, &oldsigs, &xep) < 0) 974 mdclrerror(&xep); 975 976 /* 977 * Lock the set on current set members. 978 * For MN diskset lock_set and SUSPEND are used to protect against 979 * other meta* commands running on the other nodes. 980 */ 981 if (MD_MNSET_DESC(sd)) { 982 nd = sd->sd_nodelist; 983 while (nd) { 984 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 985 nd = nd->nd_next; 986 continue; 987 } 988 if (clnt_lock_set(nd->nd_nodename, sp, ep)) { 989 rval = -1; 990 goto out; 991 } 992 nd = nd->nd_next; 993 } 994 /* 995 * Lock out other meta* commands by suspending 996 * class 1 messages across the diskset. 
997 */ 998 nd = sd->sd_nodelist; 999 while (nd) { 1000 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 1001 nd = nd->nd_next; 1002 continue; 1003 } 1004 if (clnt_mdcommdctl(nd->nd_nodename, 1005 COMMDCTL_SUSPEND, sp, MD_MSG_CLASS1, 1006 MD_MSCF_NO_FLAGS, ep)) { 1007 rval = -1; 1008 goto out; 1009 } 1010 suspend1_flag = 1; 1011 nd = nd->nd_next; 1012 } 1013 } else { 1014 for (i = 0; i < MD_MAXSIDES; i++) { 1015 /* Skip empty slots */ 1016 if (sd->sd_nodes[i][0] == '\0') continue; 1017 1018 if (clnt_lock_set(sd->sd_nodes[i], sp, ep)) { 1019 rval = -1; 1020 goto out; 1021 } 1022 } 1023 } 1024 1025 /* We are not adding or deleting any drives, just balancing */ 1026 dd = NULL; 1027 1028 /* 1029 * Balance the DB's according to the list of existing drives and the 1030 * list of added drives. 1031 */ 1032 if ((rval = meta_db_balance(sp, dd, curdd, dbsize, ep)) == -1) 1033 goto out; 1034 1035 out: 1036 /* 1037 * Unlock diskset by resuming class 1 messages across the diskset. 1038 * Just resume all classes so that resume is the same whether 1039 * just one class was locked or all classes were locked. 1040 */ 1041 if (suspend1_flag) { 1042 nd = sd->sd_nodelist; 1043 while (nd) { 1044 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 1045 nd = nd->nd_next; 1046 continue; 1047 } 1048 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME, 1049 sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) { 1050 /* 1051 * We are here because we failed to resume 1052 * rpc.mdcommd. However we potentially have 1053 * an error from the previous call 1054 * (meta_db_balance). If the previous call 1055 * did fail, we capture that error and 1056 * generate a perror withthe string, 1057 * "Unable to resume...". 1058 * Setting rval to -1 ensures that in the 1059 * next iteration of the loop, ep is not 1060 * clobbered. 
1061 */ 1062 if (rval == 0) 1063 (void) mdstealerror(ep, &xep); 1064 else 1065 mdclrerror(&xep); 1066 rval = -1; 1067 mde_perror(ep, dgettext(TEXT_DOMAIN, 1068 "Unable to resume rpc.mdcommd.")); 1069 } 1070 nd = nd->nd_next; 1071 } 1072 } 1073 1074 /* Unlock the set */ 1075 cl_sk = cl_get_setkey(sp->setno, sp->setname); 1076 if (MD_MNSET_DESC(sd)) { 1077 nd = sd->sd_nodelist; 1078 while (nd) { 1079 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 1080 nd = nd->nd_next; 1081 continue; 1082 } 1083 if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) { 1084 if (rval == 0) 1085 (void) mdstealerror(ep, &xep); 1086 else 1087 mdclrerror(&xep); 1088 rval = -1; 1089 } 1090 nd = nd->nd_next; 1091 } 1092 } else { 1093 for (i = 0; i < MD_MAXSIDES; i++) { 1094 /* Skip empty slots */ 1095 if (sd->sd_nodes[i][0] == '\0') 1096 continue; 1097 1098 if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, &xep)) { 1099 if (rval == 0) 1100 (void) mdstealerror(ep, &xep); 1101 rval = -1; 1102 } 1103 } 1104 } 1105 1106 /* release signals back to what they were on entry */ 1107 if (procsigs(FALSE, &oldsigs, &xep) < 0) 1108 mdclrerror(&xep); 1109 1110 cl_set_setkey(NULL); 1111 1112 metaflushsetname(sp); 1113 1114 return (rval); 1115 } 1116 1117 int 1118 meta_set_destroy( 1119 mdsetname_t *sp, 1120 int lock_set, 1121 md_error_t *ep 1122 ) 1123 { 1124 int i; 1125 med_rec_t medr; 1126 md_set_desc *sd; 1127 md_drive_desc *dd, *p, *p1; 1128 mddrivename_t *dnp; 1129 mdname_t *np; 1130 mdnamelist_t *nlp = NULL; 1131 int num_users = 0; 1132 int has_set; 1133 side_t mysideno; 1134 sigset_t oldsigs; 1135 md_error_t xep = mdnullerror; 1136 md_setkey_t *cl_sk; 1137 int rval = 0; 1138 int delete_end = 1; 1139 1140 /* Make sure we are blocking all signals */ 1141 if (procsigs(TRUE, &oldsigs, ep) < 0) 1142 return (-1); 1143 1144 if ((sd = metaget_setdesc(sp, ep)) == NULL) { 1145 if (! mdisok(ep)) 1146 rval = -1; 1147 goto out; 1148 } 1149 1150 /* 1151 * meta_set_destroy should not be called for a MN diskset. 
1152 * This routine destroys a set without communicating this information 1153 * to the other nodes which would lead to an inconsistency in 1154 * the MN diskset. 1155 */ 1156 if (MD_MNSET_DESC(sd)) { 1157 rval = -1; 1158 goto out; 1159 } 1160 1161 /* Continue if a traditional diskset */ 1162 1163 /* 1164 * Check to see who has the set. If we are not the last user of the 1165 * set, we will not touch the replicas. 1166 */ 1167 for (i = 0; i < MD_MAXSIDES; i++) { 1168 /* Skip empty slots */ 1169 if (sd->sd_nodes[i][0] == '\0') 1170 continue; 1171 1172 has_set = nodehasset(sp, sd->sd_nodes[i], NHS_NST_EQ, 1173 ep); 1174 1175 if (has_set < 0) { 1176 mdclrerror(ep); 1177 } else 1178 num_users++; 1179 } 1180 1181 if ((dd = metaget_drivedesc(sp, MD_BASICNAME_OK, ep)) == NULL) { 1182 if (! mdisok(ep)) { 1183 rval = -1; 1184 goto out; 1185 } 1186 } 1187 1188 if (setup_db_bydd(sp, dd, TRUE, ep) == -1) { 1189 rval = -1; 1190 goto out; 1191 } 1192 1193 if (lock_set == TRUE) { 1194 /* Lock the set on our side */ 1195 if (clnt_lock_set(mynode(), sp, ep)) { 1196 rval = -1; 1197 goto out; 1198 } 1199 } 1200 1201 /* 1202 * A traditional diskset has no diskset stale information to send 1203 * since there can only be one owner node at a time. 1204 */ 1205 if (snarf_set(sp, FALSE, ep)) 1206 mdclrerror(ep); 1207 1208 if (dd != NULL) { 1209 /* 1210 * Make sure that no drives are in use as parts of metadrives 1211 * or hot spare pools, this is one of the few error conditions 1212 * that will stop this routine, unless the environment has 1213 * META_DESTROY_SET_OK set, in which case, the operation will 1214 * proceed. 
1215 */ 1216 if (getenv("META_DESTROY_SET_OK") == NULL) { 1217 for (p = dd; p != NULL; p = p->dd_next) { 1218 dnp = p->dd_dnp; 1219 1220 i = meta_check_drive_inuse(sp, dnp, FALSE, ep); 1221 if (i == -1) { 1222 /* need xep - wire calls clear error */ 1223 i = metaget_setownership(sp, &xep); 1224 if (i == -1) { 1225 rval = -1; 1226 goto out; 1227 } 1228 1229 mysideno = getmyside(sp, &xep); 1230 1231 if (mysideno == MD_SIDEWILD) { 1232 rval = -1; 1233 goto out; 1234 } 1235 1236 if (sd->sd_isown[mysideno] == FALSE) 1237 if (halt_set(sp, &xep)) { 1238 rval = -1; 1239 goto out; 1240 } 1241 1242 rval = -1; 1243 goto out; 1244 } 1245 } 1246 } 1247 1248 for (i = 0; i < MD_MAXSIDES; i++) { 1249 /* Skip empty slots */ 1250 if (sd->sd_nodes[i][0] == '\0') 1251 continue; 1252 1253 /* Skip non local nodes */ 1254 if (strcmp(mynode(), sd->sd_nodes[i]) != 0) 1255 continue; 1256 1257 if (clnt_deldrvs(sd->sd_nodes[i], sp, dd, ep)) 1258 mdclrerror(ep); 1259 } 1260 1261 /* 1262 * Go thru each drive and individually delete the replicas. 1263 * This way we can ignore individual errors. 1264 */ 1265 for (p = dd; p != NULL; p = p->dd_next) { 1266 uint_t rep_slice; 1267 1268 dnp = p->dd_dnp; 1269 if ((meta_replicaslice(dnp, &rep_slice, ep) != 0) || 1270 (((np = metaslicename(dnp, rep_slice, ep)) 1271 == NULL) && 1272 ((np = metaslicename(dnp, MD_SLICE0, ep)) 1273 == NULL))) { 1274 rval = -1; 1275 goto out; 1276 } 1277 1278 if ((np = metaslicename(dnp, 1279 rep_slice, ep)) == NULL) { 1280 if ((np = metaslicename(dnp, 1281 MD_SLICE0, ep)) == NULL) { 1282 rval = -1; 1283 goto out; 1284 } 1285 mdclrerror(ep); 1286 } 1287 1288 /* Yes this is UGLY!!! 
*/ 1289 p1 = p->dd_next; 1290 p->dd_next = NULL; 1291 if (rel_own_bydd(sp, p, FALSE, ep)) 1292 mdclrerror(ep); 1293 p->dd_next = p1; 1294 1295 if (p->dd_dbcnt == 0) 1296 continue; 1297 1298 /* 1299 * Skip the replica removal if we are not the last user 1300 */ 1301 if (num_users != 1) 1302 continue; 1303 1304 nlp = NULL; 1305 (void) metanamelist_append(&nlp, np); 1306 if (meta_db_detach(sp, nlp, 1307 (MDFORCE_DS | MDFORCE_SET_LOCKED), NULL, ep)) 1308 mdclrerror(ep); 1309 metafreenamelist(nlp); 1310 } 1311 } 1312 1313 if (halt_set(sp, ep)) { 1314 rval = -1; 1315 goto out; 1316 } 1317 1318 /* Setup the mediator record */ 1319 (void) memset(&medr, '\0', sizeof (med_rec_t)); 1320 medr.med_rec_mag = MED_REC_MAGIC; 1321 medr.med_rec_rev = MED_REC_REV; 1322 medr.med_rec_fl = 0; 1323 medr.med_rec_sn = sp->setno; 1324 (void) strcpy(medr.med_rec_snm, sp->setname); 1325 medr.med_rec_meds = sd->sd_med; /* structure assigment */ 1326 (void) memset(&medr.med_rec_data, '\0', sizeof (med_data_t)); 1327 medr.med_rec_foff = 0; 1328 1329 /* 1330 * If we are the last remaining user, then remove the mediator hosts 1331 */ 1332 if (num_users == 1) { 1333 for (i = 0; i < MED_MAX_HOSTS; i++) { 1334 if (medr.med_rec_meds.n_lst[i].a_cnt != 0) 1335 SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_REMOVE, 1336 SVM_TAG_MEDIATOR, sp->setno, i); 1337 (void) memset(&medr.med_rec_meds.n_lst[i], '\0', 1338 sizeof (md_h_t)); 1339 } 1340 medr.med_rec_meds.n_cnt = 0; 1341 } else { /* Remove this host from the mediator node list. 
*/ 1342 for (i = 0; i < MD_MAXSIDES; i++) { 1343 /* Skip empty slots */ 1344 if (sd->sd_nodes[i][0] == '\0') 1345 continue; 1346 1347 /* Copy non local node */ 1348 if (strcmp(mynode(), sd->sd_nodes[i]) != 0) { 1349 (void) strcpy(medr.med_rec_nodes[i], 1350 sd->sd_nodes[i]); 1351 continue; 1352 } 1353 1354 /* Clear local node */ 1355 (void) memset(&medr.med_rec_nodes[i], '\0', 1356 sizeof (md_node_nm_t)); 1357 } 1358 } 1359 1360 crcgen(&medr, &medr.med_rec_cks, sizeof (med_rec_t), NULL); 1361 1362 /* 1363 * If the client is part of a cluster put the DCS service 1364 * into a deleteing state. 1365 */ 1366 if (sdssc_delete_begin(sp->setname) == SDSSC_ERROR) { 1367 if (metad_isautotakebyname(sp->setname)) { 1368 delete_end = 0; 1369 } else { 1370 mdclrerror(ep); 1371 goto out; 1372 } 1373 } 1374 1375 /* Inform the mediator hosts of the new information */ 1376 for (i = 0; i < MED_MAX_HOSTS; i++) { 1377 if (sd->sd_med.n_lst[i].a_cnt == 0) 1378 continue; 1379 1380 if (clnt_med_upd_rec(&sd->sd_med.n_lst[i], sp, &medr, ep)) 1381 mdclrerror(ep); 1382 } 1383 1384 /* Delete the set locally */ 1385 for (i = 0; i < MD_MAXSIDES; i++) { 1386 /* Skip empty slots */ 1387 if (sd->sd_nodes[i][0] == '\0') 1388 continue; 1389 1390 /* Skip non local nodes */ 1391 if (strcmp(mynode(), sd->sd_nodes[i]) != 0) 1392 continue; 1393 1394 if (clnt_delset(sd->sd_nodes[i], sp, ep) == -1) 1395 mdclrerror(ep); 1396 } 1397 if (delete_end && 1398 sdssc_delete_end(sp->setname, SDSSC_COMMIT) == SDSSC_ERROR) 1399 rval = -1; 1400 1401 out: 1402 /* release signals back to what they were on entry */ 1403 if (procsigs(FALSE, &oldsigs, &xep) < 0) { 1404 if (rval == 0) 1405 (void) mdstealerror(ep, &xep); 1406 rval = -1; 1407 } 1408 1409 if (lock_set == TRUE) { 1410 cl_sk = cl_get_setkey(sp->setno, sp->setname); 1411 if (clnt_unlock_set(mynode(), cl_sk, &xep)) { 1412 if (rval == 0) 1413 (void) mdstealerror(ep, &xep); 1414 rval = -1; 1415 } 1416 cl_set_setkey(NULL); 1417 } 1418 1419 metaflushsetname(sp); 1420 
return (rval); 1421 } 1422 1423 int 1424 meta_set_purge( 1425 mdsetname_t *sp, 1426 int bypass_cluster, 1427 int forceflg, 1428 md_error_t *ep 1429 ) 1430 { 1431 char *thishost = mynode(); 1432 md_set_desc *sd; 1433 md_setkey_t *cl_sk; 1434 md_error_t xep = mdnullerror; 1435 int rval = 0; 1436 int i, num_hosts = 0; 1437 int has_set = 0; 1438 int max_node = 0; 1439 int delete_end = 1; 1440 md_mnnode_desc *nd; 1441 1442 if ((sd = metaget_setdesc(sp, ep)) == NULL) { 1443 /* unable to find set description */ 1444 rval = 1; 1445 return (rval); 1446 } 1447 1448 if (MD_MNSET_DESC(sd)) { 1449 /* 1450 * Get a count of the hosts in the set and also lock the set 1451 * on those hosts that know about it. 1452 */ 1453 nd = sd->sd_nodelist; 1454 while (nd) { 1455 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 1456 nd = nd->nd_next; 1457 continue; 1458 } 1459 has_set = nodehasset(sp, nd->nd_nodename, 1460 NHS_NST_EQ, ep); 1461 1462 /* 1463 * The host is not aware of this set (has_set < 0) or 1464 * the set does not match (has_set == 0). This check 1465 * prevents the code getting confused by an apparent 1466 * inconsistancy in the set's state, this is in the 1467 * purge code so something is broken in any case and 1468 * this is just trying to fix the brokeness. 1469 */ 1470 if (has_set <= 0) { 1471 mdclrerror(ep); 1472 nd->nd_flags |= MD_MN_NODE_NOSET; 1473 } else { 1474 num_hosts++; 1475 if (clnt_lock_set(nd->nd_nodename, sp, ep)) { 1476 /* 1477 * If the force flag is set then 1478 * ignore any RPC failures because we 1479 * are only really interested with 1480 * the set on local node. 1481 */ 1482 if (forceflg && mdanyrpcerror(ep)) { 1483 mdclrerror(ep); 1484 } else { 1485 /* 1486 * set max_node so that in the 1487 * unlock code nodes in the 1488 * set that have not been 1489 * locked are not unlocked. 
1490 */ 1491 max_node = nd->nd_nodeid; 1492 rval = 2; 1493 goto out1; 1494 } 1495 } 1496 1497 } 1498 nd = nd->nd_next; 1499 } 1500 max_node = 0; 1501 } else { 1502 /* 1503 * Get a count of the hosts in the set and also lock the set 1504 * on those hosts that know about it. 1505 */ 1506 for (i = 0; i < MD_MAXSIDES; i++) { 1507 /* Skip empty slots */ 1508 if (sd->sd_nodes[i][0] == '\0') 1509 continue; 1510 1511 has_set = nodehasset(sp, sd->sd_nodes[i], 1512 NHS_NST_EQ, ep); 1513 1514 /* 1515 * The host is not aware of this set (has_set < 0) or 1516 * the set does not match (has_set == 0). This check 1517 * prevents the code getting confused by an apparent 1518 * inconsistancy in the set's state, this is in the 1519 * purge code so something is broken in any case and 1520 * this is just trying to fix the brokeness. 1521 */ 1522 if (has_set <= 0) { 1523 mdclrerror(ep); 1524 /* 1525 * set the node to NULL to prevent further 1526 * requests to this unresponsive node. 1527 */ 1528 sd->sd_nodes[i][0] = '\0'; 1529 } else { 1530 num_hosts++; 1531 if (clnt_lock_set(sd->sd_nodes[i], sp, ep)) { 1532 /* 1533 * If the force flag is set then 1534 * ignore any RPC failures because we 1535 * are only really interested with 1536 * the set on local node. 1537 */ 1538 if (forceflg && mdanyrpcerror(ep)) { 1539 mdclrerror(ep); 1540 } else { 1541 rval = 2; 1542 /* 1543 * set max_node so that in the 1544 * unlock code nodes in the 1545 * set that have not been 1546 * locked are not unlocked. 1547 */ 1548 max_node = i; 1549 goto out1; 1550 } 1551 } 1552 } 1553 } 1554 max_node = i; /* now MD_MAXSIDES */ 1555 } 1556 if (!bypass_cluster) { 1557 /* 1558 * If there is only one host associated with the 1559 * set then remove the set from the cluster. 
1560 */ 1561 if (num_hosts == 1) { 1562 if (sdssc_delete_begin(sp->setname) == SDSSC_ERROR) { 1563 if (metad_isautotakebyname(sp->setname)) { 1564 delete_end = 0; 1565 } else { 1566 mdclrerror(ep); 1567 rval = 3; 1568 goto out1; 1569 } 1570 } 1571 } 1572 } 1573 1574 if (MD_MNSET_DESC(sd)) { 1575 /* 1576 * Get a count of the hosts in the set and also lock the set 1577 * on those hosts that know about it. 1578 */ 1579 nd = sd->sd_nodelist; 1580 while (nd) { 1581 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 1582 nd = nd->nd_next; 1583 continue; 1584 } 1585 if (nd->nd_nodeid != sd->sd_mn_mynode->nd_nodeid) { 1586 /* 1587 * Tell the remote node to remove this node 1588 */ 1589 if (clnt_delhosts(nd->nd_nodename, sp, 1, 1590 &thishost, ep) == -1) { 1591 /* 1592 * If we fail to delete ourselves 1593 * from the remote host it does not 1594 * really matter because the set is 1595 * being "purged" from this node. The 1596 * set can be purged from the other 1597 * node at a later time. 1598 */ 1599 mdclrerror(ep); 1600 } 1601 nd = nd->nd_next; 1602 continue; 1603 } 1604 /* remove the set from this host */ 1605 if (clnt_delset(nd->nd_nodename, sp, ep) == -1) { 1606 md_perror(dgettext(TEXT_DOMAIN, "delset")); 1607 if (!bypass_cluster && num_hosts == 1) 1608 (void) sdssc_delete_end(sp->setname, 1609 SDSSC_CLEANUP); 1610 mdclrerror(ep); 1611 goto out1; 1612 } 1613 nd = nd->nd_next; 1614 } 1615 } else { 1616 for (i = 0; i < MD_MAXSIDES; i++) { 1617 /* Skip empty slots */ 1618 if (sd->sd_nodes[i][0] == '\0') 1619 continue; 1620 if (strcmp(thishost, sd->sd_nodes[i]) != 0) { 1621 /* 1622 * Tell the remote node to remove this node 1623 */ 1624 if (clnt_delhosts(sd->sd_nodes[i], sp, 1, 1625 &thishost, ep) == -1) { 1626 /* 1627 * If we fail to delete ourselves 1628 * from the remote host it does not 1629 * really matter because the set is 1630 * being "purged" from this node. The 1631 * set can be purged from the other 1632 * node at a later time. 
1633 */ 1634 mdclrerror(ep); 1635 } 1636 continue; 1637 } 1638 1639 /* remove the set from this host */ 1640 if (clnt_delset(sd->sd_nodes[i], sp, ep) == -1) { 1641 md_perror(dgettext(TEXT_DOMAIN, "delset")); 1642 if (!bypass_cluster && num_hosts == 1) 1643 (void) sdssc_delete_end(sp->setname, 1644 SDSSC_CLEANUP); 1645 mdclrerror(ep); 1646 goto out1; 1647 } 1648 } 1649 } 1650 1651 if (!bypass_cluster && num_hosts == 1) { 1652 if (delete_end && sdssc_delete_end(sp->setname, SDSSC_COMMIT) == 1653 SDSSC_ERROR) { 1654 rval = 4; 1655 } 1656 } 1657 1658 out1: 1659 1660 cl_sk = cl_get_setkey(sp->setno, sp->setname); 1661 1662 /* 1663 * Remove the set lock on those nodes that had the set locked 1664 * max_node will either be MD_MAXSIDES or array index of the last 1665 * node contacted (or rather failed to contact) for traditional 1666 * diskset. For a MN diskset, max_node is the node_id of the node 1667 * that failed the lock. 1668 */ 1669 if (MD_MNSET_DESC(sd)) { 1670 nd = sd->sd_nodelist; 1671 while (nd) { 1672 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 1673 nd = nd->nd_next; 1674 continue; 1675 } 1676 if (nd->nd_nodeid == max_node) 1677 break; 1678 if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) { 1679 if (forceflg && mdanyrpcerror(&xep)) { 1680 mdclrerror(&xep); 1681 nd = nd->nd_next; 1682 continue; 1683 } 1684 if (rval == 0) 1685 (void) mdstealerror(ep, &xep); 1686 rval = 5; 1687 } 1688 nd = nd->nd_next; 1689 } 1690 } else { 1691 for (i = 0; i < max_node; i++) { 1692 /* Skip empty slots */ 1693 if (sd->sd_nodes[i][0] == '\0') 1694 continue; 1695 1696 if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, &xep)) { 1697 if (forceflg && mdanyrpcerror(&xep)) { 1698 mdclrerror(&xep); 1699 continue; 1700 } 1701 if (rval == 0) 1702 (void) mdstealerror(ep, &xep); 1703 rval = 5; 1704 } 1705 } 1706 } 1707 1708 cl_set_setkey(NULL); 1709 1710 return (rval); 1711 } 1712 1713 int 1714 meta_set_query( 1715 mdsetname_t *sp, 1716 mddb_dtag_lst_t **dtlpp, 1717 md_error_t *ep 1718 ) 1719 { 1720 
mddb_dtag_get_parm_t dtgp; 1721 1722 (void) memset(&dtgp, '\0', sizeof (mddb_dtag_get_parm_t)); 1723 dtgp.dtgp_setno = sp->setno; 1724 1725 /*CONSTCOND*/ 1726 while (1) { 1727 if (metaioctl(MD_MED_GET_TAG, &dtgp, &dtgp.dtgp_mde, NULL) != 0) 1728 if (! mdismddberror(&dtgp.dtgp_mde, MDE_DB_NOTAG) || 1729 *dtlpp == NULL) 1730 return (mdstealerror(ep, &dtgp.dtgp_mde)); 1731 else 1732 break; 1733 1734 /* 1735 * Run to the end of the list 1736 */ 1737 for (/* void */; (*dtlpp != NULL); dtlpp = &(*dtlpp)->dtl_nx) 1738 /* void */; 1739 1740 *dtlpp = Zalloc(sizeof (mddb_dtag_lst_t)); 1741 1742 (void) memmove(&(*dtlpp)->dtl_dt, &dtgp.dtgp_dt, 1743 sizeof (mddb_dtag_t)); 1744 1745 dtgp.dtgp_dt.dt_id++; 1746 } 1747 return (0); 1748 } 1749 1750 /* 1751 * return drivename get by key 1752 */ 1753 mddrivename_t * 1754 metadrivename_withdrkey( 1755 mdsetname_t *sp, 1756 side_t sideno, 1757 mdkey_t key, 1758 int flags, 1759 md_error_t *ep 1760 ) 1761 { 1762 char *nm; 1763 mdname_t *np; 1764 mddrivename_t *dnp; 1765 ddi_devid_t devidp; 1766 md_set_desc *sd; 1767 1768 if ((sd = metaget_setdesc(sp, ep)) == NULL) { 1769 return (NULL); 1770 } 1771 1772 /* get namespace info */ 1773 if (MD_MNSET_DESC(sd)) { 1774 if ((nm = meta_getnmbykey(MD_LOCAL_SET, sideno, 1775 key, ep)) == NULL) 1776 return (NULL); 1777 } else { 1778 if ((nm = meta_getnmbykey(MD_LOCAL_SET, sideno+SKEW, 1779 key, ep)) == NULL) 1780 return (NULL); 1781 } 1782 1783 /* get device name */ 1784 if (flags & PRINT_FAST) { 1785 if ((np = metaname_fast(&sp, nm, LOGICAL_DEVICE, ep)) == NULL) { 1786 Free(nm); 1787 return (NULL); 1788 } 1789 } else { 1790 if ((np = metaname(&sp, nm, LOGICAL_DEVICE, ep)) == NULL) { 1791 Free(nm); 1792 return (NULL); 1793 } 1794 } 1795 Free(nm); 1796 1797 /* make sure it's OK */ 1798 if ((! 
(flags & MD_BASICNAME_OK)) && (metachkcomp(np, ep) != 0)) 1799 return (NULL); 1800 1801 /* get drivename */ 1802 dnp = np->drivenamep; 1803 dnp->side_names_key = key; 1804 1805 /* 1806 * Skip the following devid check if dnp is did device 1807 * The device id is disabled for did device due to the 1808 * lack of minor name support in the did driver. The following 1809 * devid code path can set and propagate the error and 1810 * eventually prevent did disks from being added to the 1811 * diskset under SunCluster systems 1812 */ 1813 if (strncmp(dnp->rname, "/dev/did/", strlen("/dev/did/")) == 0) { 1814 goto out; 1815 } 1816 1817 /* Also, Skip the check if MN diskset, no devid's */ 1818 if (MD_MNSET_DESC(sd)) { 1819 goto out; 1820 } 1821 1822 /* 1823 * Get the devid associated with the key. 1824 * 1825 * If a devid was returned, it MUST be valid even in 1826 * the case where a device id has been "updated". The 1827 * "update" of the device id may have occured due to 1828 * a firmware upgrade. 1829 */ 1830 if ((devidp = meta_getdidbykey(MD_LOCAL_SET, sideno+SKEW, key, ep)) 1831 != NULL) { 1832 dnp->devid = devid_str_encode(devidp, NULL); 1833 free(devidp); 1834 } else { 1835 /* 1836 * It is okay if replica is not in devid mode 1837 */ 1838 if (mdissyserror(ep, MDDB_F_NODEVID)) { 1839 mdclrerror(ep); 1840 goto out; 1841 } 1842 1843 /* 1844 * devid is missing so this means that we have 1845 * just upgraded from a configuration where 1846 * devid's were not used so try to add in 1847 * the devid and requery. 
1848 */ 1849 if (meta_setdid(MD_LOCAL_SET, sideno + SKEW, key, 1850 ep) < 0) 1851 return (NULL); 1852 if ((devidp = (ddi_devid_t)meta_getdidbykey(MD_LOCAL_SET, 1853 sideno+SKEW, key, ep)) == NULL) 1854 return (NULL); 1855 dnp->devid = devid_str_encode(devidp, NULL); 1856 devid_free(devidp); 1857 } 1858 1859 out: 1860 if (flags & MD_BYPASS_DAEMON) 1861 return (dnp); 1862 1863 if (get_sidenmlist(sp, dnp, ep)) 1864 return (NULL); 1865 1866 /* return success */ 1867 return (dnp); 1868 } 1869 1870 void 1871 metafreedrivedesc(md_drive_desc **dd) 1872 { 1873 md_drive_desc *p, *next = NULL; 1874 1875 for (p = *dd; p != NULL; p = next) { 1876 next = p->dd_next; 1877 Free(p); 1878 } 1879 *dd = NULL; 1880 } 1881 1882 md_drive_desc * 1883 metaget_drivedesc( 1884 mdsetname_t *sp, 1885 int flags, 1886 md_error_t *ep 1887 ) 1888 { 1889 side_t sideno = MD_SIDEWILD; 1890 1891 assert(! (flags & MD_BYPASS_DAEMON)); 1892 1893 if ((sideno = getmyside(sp, ep)) == MD_SIDEWILD) 1894 return (NULL); 1895 1896 return (metaget_drivedesc_sideno(sp, sideno, flags, ep)); 1897 } 1898 1899 md_drive_desc * 1900 metaget_drivedesc_fromnamelist( 1901 mdsetname_t *sp, 1902 mdnamelist_t *nlp, 1903 md_error_t *ep 1904 ) 1905 { 1906 md_set_desc *sd; 1907 mdnamelist_t *p; 1908 md_drive_desc *dd = NULL; 1909 1910 if ((sd = metaget_setdesc(sp, ep)) == NULL) 1911 return (NULL); 1912 1913 for (p = nlp; p != NULL; p = p->next) 1914 (void) metadrivedesc_append(&dd, p->namep->drivenamep, 0, 0, 1915 sd->sd_ctime, sd->sd_genid, MD_DR_ADD); 1916 1917 return (dd); 1918 } 1919 1920 md_drive_desc * 1921 metaget_drivedesc_sideno( 1922 mdsetname_t *sp, 1923 side_t sideno, 1924 int flags, 1925 md_error_t *ep 1926 ) 1927 { 1928 md_set_desc *sd = NULL; 1929 1930 assert(! 
(flags & MD_BYPASS_DAEMON)); 1931 1932 if ((sd = metaget_setdesc(sp, ep)) == NULL) 1933 return (NULL); 1934 1935 if (sd->sd_drvs) 1936 return (sd->sd_drvs); 1937 1938 if ((sd->sd_drvs = dr2drivedesc(sp, sideno, flags, ep)) == NULL) 1939 return (NULL); 1940 1941 return (sd->sd_drvs); 1942 } 1943 1944 int 1945 metaget_setownership( 1946 mdsetname_t *sp, 1947 md_error_t *ep 1948 ) 1949 { 1950 md_set_desc *sd; 1951 int bool; 1952 int i; 1953 md_mnnode_desc *nd; 1954 1955 if ((sd = metaget_setdesc(sp, ep)) == NULL) 1956 return (-1); 1957 1958 if (MD_MNSET_DESC(sd)) { 1959 nd = sd->sd_nodelist; 1960 while (nd) { 1961 /* If node isn't alive, can't own diskset */ 1962 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 1963 nd->nd_flags &= ~MD_MN_NODE_OWN; 1964 nd = nd->nd_next; 1965 continue; 1966 } 1967 /* 1968 * If can't communicate with rpc.metad, then mark 1969 * this node as not an owner. That node may 1970 * in fact, be an owner, but without rpc.metad running 1971 * that node can't do much. 1972 */ 1973 if (clnt_ownset(nd->nd_nodename, sp, &bool, ep) == -1) { 1974 nd->nd_flags &= ~MD_MN_NODE_OWN; 1975 } else if (bool == TRUE) { 1976 nd->nd_flags |= MD_MN_NODE_OWN; 1977 } else { 1978 nd->nd_flags &= ~MD_MN_NODE_OWN; 1979 } 1980 nd = nd->nd_next; 1981 } 1982 return (0); 1983 } 1984 1985 /* Rest of code handles traditional disksets */ 1986 1987 for (i = 0; i < MD_MAXSIDES; i++) 1988 sd->sd_isown[i] = 0; 1989 1990 if (clnt_ownset(mynode(), sp, &bool, ep) == -1) 1991 return (-1); 1992 1993 if (bool == TRUE) 1994 sd->sd_isown[getmyside(sp, ep)] = 1; 1995 1996 return (0); 1997 } 1998 1999 char * 2000 mynode(void) 2001 { 2002 static struct utsname myuname; 2003 static int done = 0; 2004 2005 if (! 
done) { 2006 if (uname(&myuname) == -1) { 2007 md_perror(dgettext(TEXT_DOMAIN, "uname")); 2008 assert(0); 2009 } 2010 done = 1; 2011 } 2012 return (myuname.nodename); 2013 } 2014 2015 int 2016 strinlst(char *str, int cnt, char **lst) 2017 { 2018 int i; 2019 2020 for (i = 0; i < cnt; i++) 2021 if (strcmp(lst[i], str) == 0) 2022 return (TRUE); 2023 2024 return (FALSE); 2025 } 2026 2027 /* 2028 * meta_get_reserved_names 2029 * returns an mdnamelist_t of reserved slices 2030 * reserved slices are those that are used but don't necessarily 2031 * show up as metadevices (ex. reserved slice for db in sets, logs) 2032 */ 2033 2034 /*ARGSUSED*/ 2035 int 2036 meta_get_reserved_names( 2037 mdsetname_t *sp, 2038 mdnamelist_t **nlpp, 2039 int options, 2040 md_error_t *ep) 2041 { 2042 int count = 0; 2043 mdname_t *np = NULL; 2044 mdnamelist_t *transnlp = NULL; 2045 mdnamelist_t **tailpp = nlpp; 2046 mdnamelist_t *nlp; 2047 md_drive_desc *dd, *di; 2048 2049 if (metaislocalset(sp)) 2050 goto out; 2051 2052 if (!(dd = metaget_drivedesc(sp, MD_BASICNAME_OK, ep)) && !mdisok(ep)) { 2053 count = -1; 2054 goto out; 2055 } 2056 2057 /* db in for sets on reserved slice */ 2058 for (di = dd; di && count >= 0; di = di->dd_next) { 2059 uint_t rep_slice; 2060 2061 /* 2062 * Add the name struct to the end of the 2063 * namelist but keep a pointer to the last 2064 * element so that we don't incur the overhead 2065 * of traversing the list each time 2066 */ 2067 if (di->dd_dnp && 2068 (meta_replicaslice(di->dd_dnp, &rep_slice, ep) == 0) && 2069 (np = metaslicename(di->dd_dnp, rep_slice, ep)) && 2070 (tailpp = meta_namelist_append_wrapper(tailpp, np))) 2071 count++; 2072 else 2073 count = -1; 2074 } 2075 2076 /* now find logs */ 2077 if (meta_get_trans_names(sp, &transnlp, 0, ep) < 0) { 2078 count = -1; 2079 goto out; 2080 } 2081 2082 for (nlp = transnlp; (nlp != NULL); nlp = nlp->next) { 2083 mdname_t *transnp = nlp->namep; 2084 md_trans_t *transp; 2085 2086 if ((transp = meta_get_trans(sp, 
transnp, ep)) == NULL) { 2087 count = -1; 2088 goto out; 2089 } 2090 if (transp->lognamep) { 2091 /* 2092 * Add the name struct to the end of the 2093 * namelist but keep a pointer to the last 2094 * element so that we don't incur the overhead 2095 * of traversing the list each time 2096 */ 2097 tailpp = meta_namelist_append_wrapper( 2098 tailpp, transp->lognamep); 2099 } 2100 } 2101 out: 2102 metafreenamelist(transnlp); 2103 return (count); 2104 } 2105 2106 /* 2107 * Entry point to join a node to MultiNode diskset. 2108 * 2109 * Validate host in diskset. 2110 * - Should be in membership list from API 2111 * - Should not already be joined into diskset. 2112 * - Set must have drives 2113 * Assume valid configuration is stored in the set/drive/node records 2114 * in the local mddb since no node or drive can be added to the MNset 2115 * unless all drives and nodes are available. Reconfig steps will 2116 * resync all ALIVE nodes in case of panic in critical areas. 2117 * 2118 * Lock down the set. 2119 * Verify host is a member of this diskset. 2120 * If drives exist in the configuration, load the mddbs. 2121 * Set this node to active by notifying master if one exists. 2122 * If this is the first node active in the diskset, this node 2123 * becomes the master. 2124 * Unlock the set. 2125 * 2126 * Mirror Resync: 2127 * If this node is the last node to join the set and clustering 2128 * isn't running, then start the 'metasync -r' type resync 2129 * on all mirrors in this diskset. 2130 * If clustering is running, this resync operation will 2131 * be handled by the reconfig steps and should NOT 2132 * be handled during a join operation. 2133 * 2134 * There are multiple return values in order to assist 2135 * the join operation of all sets in the metaset command. 2136 * 2137 * Return values: 2138 * 0 - Node successfully joined to set. 
2139 * -1 - Join attempted but failed 2140 * - any failure from libmeta calls 2141 * - node not in the member list 2142 * -2 - Join not attempted since 2143 * - this set had no drives in set 2144 * - this node already joined to set 2145 * - set is not a multinode set 2146 * -3 - Node joined to STALE set. 2147 */ 2148 extern int 2149 meta_set_join( 2150 mdsetname_t *sp, 2151 md_error_t *ep 2152 ) 2153 { 2154 md_set_desc *sd; 2155 md_drive_desc *dd; 2156 md_mnnode_desc *nd, *nd2, my_nd; 2157 int rval = 0; 2158 md_setkey_t *cl_sk; 2159 md_error_t xep = mdnullerror; 2160 md_error_t ep_snarf = mdnullerror; 2161 int master_flag = 0; 2162 md_mnset_record *mas_mnsr = NULL; 2163 int clear_nr_flags = 0; 2164 md_mnnode_record *nr; 2165 int stale_set = 0; 2166 int rb_flags = 0; 2167 int stale_bool = FALSE; 2168 int suspendall_flag = 0; 2169 int suspend1_flag = 0; 2170 sigset_t oldsigs; 2171 int send_reinit = 0; 2172 2173 if ((sd = metaget_setdesc(sp, ep)) == NULL) { 2174 return (-1); 2175 } 2176 2177 /* Must be a multinode diskset */ 2178 if (!MD_MNSET_DESC(sd)) { 2179 (void) mderror(ep, MDE_NOT_MN, sp->setname); 2180 return (-2); 2181 } 2182 2183 /* Verify that the node is ALIVE (i.e. is in the API membership list) */ 2184 if (!(sd->sd_mn_mynode->nd_flags & MD_MN_NODE_ALIVE)) { 2185 (void) mddserror(ep, MDE_DS_NOTINMEMBERLIST, sp->setno, 2186 sd->sd_mn_mynode->nd_nodename, NULL, 2187 sp->setname); 2188 return (-1); 2189 } 2190 2191 /* Make sure we are blocking all signals */ 2192 if (procsigs(TRUE, &oldsigs, &xep) < 0) 2193 mdclrerror(&xep); 2194 2195 /* 2196 * Lock the set on current set members. 2197 * For MN diskset lock_set and SUSPEND are used to protect against 2198 * other meta* commands running on the other nodes. 
2199 */ 2200 nd = sd->sd_nodelist; 2201 while (nd) { 2202 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 2203 nd = nd->nd_next; 2204 continue; 2205 } 2206 if (clnt_lock_set(nd->nd_nodename, sp, ep)) { 2207 rval = -1; 2208 goto out; 2209 } 2210 nd = nd->nd_next; 2211 } 2212 2213 /* 2214 * Lock out other meta* commands by suspending 2215 * class 1 messages across the diskset. 2216 */ 2217 nd = sd->sd_nodelist; 2218 while (nd) { 2219 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 2220 nd = nd->nd_next; 2221 continue; 2222 } 2223 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND, 2224 sp, MD_MSG_CLASS1, MD_MSCF_NO_FLAGS, ep)) { 2225 rval = -1; 2226 goto out; 2227 } 2228 suspend1_flag = 1; 2229 nd = nd->nd_next; 2230 } 2231 2232 /* 2233 * Verify that this host is a member (in the host list) of the set. 2234 */ 2235 nd = sd->sd_nodelist; 2236 while (nd) { 2237 if (strcmp(mynode(), nd->nd_nodename) == 0) { 2238 break; 2239 } 2240 nd = nd->nd_next; 2241 } 2242 if (!nd) { 2243 (void) mddserror(ep, MDE_DS_NODENOTINSET, sp->setno, 2244 sd->sd_mn_mynode->nd_nodename, NULL, 2245 sp->setname); 2246 rval = -1; 2247 goto out; 2248 } 2249 2250 /* 2251 * Need to return failure if host is already 'joined' 2252 * into the set. This is done so that if later the user 2253 * issues a command to join all sets and a failure is 2254 * encountered - that the resulting cleanup effort 2255 * (withdrawing from all sets that were joined 2256 * during that command) won't withdraw from this set. 2257 */ 2258 if (nd->nd_flags & MD_MN_NODE_OWN) { 2259 rval = -2; 2260 goto out2; 2261 } 2262 2263 /* 2264 * Call metaget_setownership that calls each node in diskset and 2265 * marks in set descriptor if node is an owner of the set or not. 2266 * metaget_setownership checks to see if a node is an owner by 2267 * checking to see if that node's kernel has the mddb loaded. 
2268 * If a node had panic'd during a reconfig or an 2269 * add/delete/join/withdraw operation, the other nodes' node 2270 * records may not reflect the current state of the diskset, 2271 * so calling metaget_setownership is the safest thing to do. 2272 */ 2273 if (metaget_setownership(sp, ep) == -1) { 2274 rval = -1; 2275 goto out; 2276 } 2277 2278 /* If first active member of diskset, become the master. */ 2279 nd = sd->sd_nodelist; 2280 while (nd) { 2281 if (nd->nd_flags & MD_MN_NODE_OWN) 2282 break; 2283 nd = nd->nd_next; 2284 } 2285 if (nd == NULL) 2286 master_flag = 1; 2287 2288 /* 2289 * If not first active member of diskset, then get the 2290 * master information from a node that is already joined 2291 * and set the master information for this node. Be sure 2292 * that this node (the already joined node) has its own 2293 * join flag set. If not, then this diskset isn't currently 2294 * consistent and shouldn't allow a node to join. This diskset 2295 * inconsistency should only occur when a node has panic'd in 2296 * the set while doing a metaset operation and the sysadmin is 2297 * attempting to join a node into the set. This inconsistency 2298 * will be fixed during a reconfig cycle which should be occurring 2299 * soon since a node panic'd. 2300 * 2301 * If unable to get this information from an owning node, then 2302 * this diskset isn't currently consistent and shouldn't 2303 * allow a node to join. 
2304 */ 2305 if (!master_flag) { 2306 /* get master information from an owner (joined) node */ 2307 if (clnt_mngetset(nd->nd_nodename, sp->setname, 2308 sp->setno, &mas_mnsr, ep) == -1) { 2309 rval = -1; 2310 goto out; 2311 } 2312 2313 /* Verify that owner (joined) node has its own JOIN flag set */ 2314 nr = mas_mnsr->sr_nodechain; 2315 while (nr) { 2316 if ((nd->nd_nodeid == nr->nr_nodeid) && 2317 ((nr->nr_flags & MD_MN_NODE_OWN) == NULL)) { 2318 (void) mddserror(ep, MDE_DS_NODENOSET, 2319 sp->setno, nd->nd_nodename, NULL, 2320 nd->nd_nodename); 2321 free_sr((md_set_record *)mas_mnsr); 2322 rval = -1; 2323 goto out; 2324 } 2325 nr = nr->nr_next; 2326 } 2327 2328 /* 2329 * Does master have set marked as STALE? 2330 * If so, need to pass this down to kernel when 2331 * this node snarfs the set. 2332 */ 2333 if (clnt_mn_is_stale(nd->nd_nodename, sp, 2334 &stale_bool, ep) == -1) { 2335 rval = -1; 2336 goto out; 2337 } 2338 2339 /* set master information in my rpc.metad's set record */ 2340 if (clnt_mnsetmaster(mynode(), sp, mas_mnsr->sr_master_nodenm, 2341 mas_mnsr->sr_master_nodeid, ep)) { 2342 free_sr((md_set_record *)mas_mnsr); 2343 rval = -1; 2344 goto out; 2345 } 2346 2347 /* set master information in my cached set desc */ 2348 (void) strcpy(sd->sd_mn_master_nodenm, 2349 mas_mnsr->sr_master_nodenm); 2350 sd->sd_mn_master_nodeid = mas_mnsr->sr_master_nodeid; 2351 nd2 = sd->sd_nodelist; 2352 while (nd2) { 2353 if (nd2->nd_nodeid == mas_mnsr->sr_master_nodeid) { 2354 sd->sd_mn_masternode = nd2; 2355 break; 2356 } 2357 nd2 = nd2->nd_next; 2358 } 2359 free_sr((md_set_record *)mas_mnsr); 2360 2361 /* 2362 * Set the node flags in mynode's rpc.metad node records for 2363 * the nodes that are in the diskset. Can use my sd 2364 * since earlier call to metaget_setownership set the 2365 * owner flags based on whether that node had snarfed 2366 * the MN diskset mddb. 
Reconfig steps guarantee that 2367 * return of metaget_setownership will match the owning 2368 * node's owner list except in the case where a node 2369 * has just panic'd and in this case, a reconfig will 2370 * be starting immediately and the owner lists will 2371 * be sync'd up by the reconfig. 2372 * 2373 * Flag of SET means to take no action except to 2374 * set the node flags as given in the nodelist linked list. 2375 */ 2376 if (clnt_upd_nr_flags(mynode(), sp, sd->sd_nodelist, 2377 MD_NR_SET, NULL, ep)) { 2378 rval = -1; 2379 goto out; 2380 } 2381 } 2382 2383 /* 2384 * Read in the mddb if there are drives in the set. 2385 */ 2386 if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST), 2387 ep)) == NULL) { 2388 /* No drives in list */ 2389 if (! mdisok(ep)) { 2390 rval = -1; 2391 goto out; 2392 } 2393 rval = -2; 2394 goto out; 2395 } 2396 2397 /* 2398 * Notify rpc.mdcommd on all nodes of a nodelist change. 2399 * Start by suspending rpc.mdcommd (which drains it of all messages), 2400 * then change the nodelist followed by a reinit and resume. 2401 */ 2402 nd = sd->sd_nodelist; 2403 while (nd) { 2404 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 2405 nd = nd->nd_next; 2406 continue; 2407 } 2408 2409 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND, sp, 2410 MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, ep)) { 2411 rval = -1; 2412 goto out; 2413 } 2414 suspendall_flag = 1; 2415 nd = nd->nd_next; 2416 } 2417 2418 /* Set master in my set record in rpc.metad */ 2419 if (master_flag) { 2420 if (clnt_mnsetmaster(mynode(), sp, 2421 sd->sd_mn_mynode->nd_nodename, 2422 sd->sd_mn_mynode->nd_nodeid, ep)) { 2423 rval = -1; 2424 goto out; 2425 } 2426 } 2427 /* 2428 * Causes mddbs to be loaded into the kernel. 2429 * Set the force flag so that replica locations can be 2430 * loaded into the kernel even if a mediator node was 2431 * unavailable. This allows a node to join an MO 2432 * diskset when there are sufficient replicas available, 2433 * but a mediator node in unavailable. 
2434 */ 2435 if (setup_db_bydd(sp, dd, TRUE, ep) == -1) { 2436 mde_perror(ep, dgettext(TEXT_DOMAIN, 2437 "Host not able to start diskset.")); 2438 rval = -1; 2439 goto out; 2440 } 2441 2442 if (! mdisok(ep)) { 2443 rval = -1; 2444 goto out; 2445 } 2446 2447 /* 2448 * Set rollback flags to 1 so that halt_set is called if a failure 2449 * is seen after this point. If snarf_set fails, still need to 2450 * call halt_set to cleanup the diskset. 2451 */ 2452 rb_flags = 1; 2453 2454 /* Starts the set */ 2455 if (snarf_set(sp, stale_bool, ep) != 0) { 2456 if (mdismddberror(ep, MDE_DB_STALE)) { 2457 /* 2458 * Don't fail join, STALE means that set has 2459 * < 50% mddbs. 2460 */ 2461 (void) mdstealerror(&ep_snarf, ep); 2462 stale_set = 1; 2463 } else if (mdisok(ep)) { 2464 /* If snarf failed, but no error was set - set it */ 2465 (void) mdmddberror(ep, MDE_DB_NOTNOW, (minor_t)NODEV64, 2466 sp->setno, 0, NULL); 2467 rval = -1; 2468 goto out; 2469 } else if (!(mdismddberror(ep, MDE_DB_ACCOK))) { 2470 /* 2471 * Don't fail join if ACCOK; ACCOK means that mediator 2472 * provided extra vote. 2473 */ 2474 rval = -1; 2475 goto out; 2476 } 2477 } 2478 2479 /* Did set really get snarfed? 
*/ 2480 if (own_set(sp, NULL, TRUE, ep) == MD_SETOWNER_NO) { 2481 if (mdisok(ep)) { 2482 /* If snarf failed, but no error was set - set it */ 2483 (void) mdmddberror(ep, MDE_DB_NOTNOW, (minor_t)NODEV64, 2484 sp->setno, 0, NULL); 2485 } 2486 mde_perror(ep, dgettext(TEXT_DOMAIN, 2487 "Host not able to start diskset.")); 2488 rval = -1; 2489 goto out; 2490 } 2491 2492 /* Change to nodelist so need to send reinit to rpc.mdcommd */ 2493 send_reinit = 1; 2494 2495 /* If first node to enter set, setup master and clear change log */ 2496 if (master_flag) { 2497 /* Set master in my locally cached set descriptor */ 2498 (void) strcpy(sd->sd_mn_master_nodenm, 2499 sd->sd_mn_mynode->nd_nodename); 2500 sd->sd_mn_master_nodeid = sd->sd_mn_mynode->nd_nodeid; 2501 sd->sd_mn_am_i_master = 1; 2502 2503 /* 2504 * If first node to join set, then clear out change log 2505 * entries. Change log entries are only needed when a 2506 * change of master is occurring in a diskset that has 2507 * multiple owners. Since this node is the first owner 2508 * of the diskset, clear the entries. 2509 * 2510 * Only do this if we are in a single node non-SC3.x 2511 * situation. 2512 */ 2513 if (meta_mn_singlenode() && 2514 mdmn_reset_changelog(sp, ep, MDMN_CLF_RESETLOG) != 0) { 2515 mde_perror(ep, dgettext(TEXT_DOMAIN, 2516 "Unable to reset changelog.")); 2517 rval = -1; 2518 goto out; 2519 } 2520 } 2521 2522 /* Set my locally cached flag */ 2523 sd->sd_mn_mynode->nd_flags |= MD_MN_NODE_OWN; 2524 2525 /* 2526 * Set this node's own flag on all joined nodes in the set 2527 * (including my node). 
2528 */ 2529 clear_nr_flags = 1; 2530 2531 my_nd = *(sd->sd_mn_mynode); 2532 my_nd.nd_next = NULL; 2533 nd = sd->sd_nodelist; 2534 while (nd) { 2535 if (!(nd->nd_flags & MD_MN_NODE_OWN)) { 2536 nd = nd->nd_next; 2537 continue; 2538 } 2539 if (clnt_upd_nr_flags(nd->nd_nodename, sp, &my_nd, 2540 MD_NR_JOIN, NULL, ep)) { 2541 rval = -1; 2542 goto out; 2543 } 2544 nd = nd->nd_next; 2545 } 2546 2547 out: 2548 if (rval != NULL) { 2549 /* 2550 * If rollback flag is 1, then node was joined to set. 2551 * Since an error occurred, withdraw node from set in 2552 * order to rollback to before command was run. 2553 * Need to preserve ep so that calling function can 2554 * get error information. 2555 */ 2556 if (rb_flags == 1) { 2557 if (halt_set(sp, &xep)) { 2558 mdclrerror(&xep); 2559 } 2560 } 2561 2562 /* 2563 * If error, reset master to INVALID. 2564 * Ignore error since (next) first node to successfully join 2565 * will set master on all nodes. 2566 */ 2567 (void) clnt_mnsetmaster(mynode(), sp, "", 2568 MD_MN_INVALID_NID, &xep); 2569 mdclrerror(&xep); 2570 /* Reset master in my locally cached set descriptor */ 2571 sd->sd_mn_master_nodeid = MD_MN_INVALID_NID; 2572 sd->sd_mn_am_i_master = 0; 2573 2574 /* 2575 * If nr flags set on other nodes, reset them. 2576 */ 2577 if (clear_nr_flags) { 2578 nd = sd->sd_nodelist; 2579 while (nd) { 2580 if (!(nd->nd_flags & MD_MN_NODE_OWN)) { 2581 nd = nd->nd_next; 2582 continue; 2583 } 2584 (void) clnt_upd_nr_flags(nd->nd_nodename, sp, 2585 &my_nd, MD_NR_WITHDRAW, NULL, &xep); 2586 mdclrerror(&xep); 2587 nd = nd->nd_next; 2588 } 2589 /* Reset my locally cached flag */ 2590 sd->sd_mn_mynode->nd_flags &= ~MD_MN_NODE_OWN; 2591 } 2592 } 2593 2594 /* 2595 * Notify rpc.mdcommd on all nodes of a nodelist change. 2596 * Send reinit command to mdcommd which forces it to get 2597 * fresh set description. 
2598 */ 2599 if (send_reinit) { 2600 /* Send reinit */ 2601 nd = sd->sd_nodelist; 2602 while (nd) { 2603 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 2604 nd = nd->nd_next; 2605 continue; 2606 } 2607 2608 /* Class is ignored for REINIT */ 2609 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT, 2610 sp, NULL, MD_MSCF_NO_FLAGS, &xep)) { 2611 /* 2612 * We are here because we failed to resume 2613 * rpc.mdcommd. However we potentially have 2614 * an error from the previous call 2615 * If the previous call did fail, we capture 2616 * that error and generate a perror with 2617 * the string, "Unable to resume...". 2618 * Setting rval to -1 ensures that in the 2619 * next iteration of the loop, ep is not 2620 * clobbered. 2621 */ 2622 if (rval == 0) 2623 (void) mdstealerror(ep, &xep); 2624 else 2625 mdclrerror(&xep); 2626 rval = -1; 2627 mde_perror(ep, dgettext(TEXT_DOMAIN, 2628 "Unable to reinit rpc.mdcommd.")); 2629 } 2630 nd = nd->nd_next; 2631 } 2632 2633 } 2634 2635 out2: 2636 /* 2637 * Unlock diskset by resuming messages across the diskset. 2638 * Just resume all classes so that resume is the same whether 2639 * just one class was locked or all classes were locked. 2640 */ 2641 if ((suspend1_flag) || (suspendall_flag)) { 2642 nd = sd->sd_nodelist; 2643 while (nd) { 2644 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 2645 nd = nd->nd_next; 2646 continue; 2647 } 2648 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME, 2649 sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) { 2650 /* 2651 * We are here because we failed to resume 2652 * rpc.mdcommd. However we potentially have 2653 * an error from the previous call 2654 * If the previous call did fail, we capture 2655 * that error and generate a perror with 2656 * the string, "Unable to resume...". 2657 * Setting rval to -1 ensures that in the 2658 * next iteration of the loop, ep is not 2659 * clobbered. 
2660 */ 2661 if (rval == 0) 2662 (void) mdstealerror(ep, &xep); 2663 else 2664 mdclrerror(&xep); 2665 rval = -1; 2666 mde_perror(ep, dgettext(TEXT_DOMAIN, 2667 "Unable to resume rpc.mdcommd.")); 2668 } 2669 nd = nd->nd_next; 2670 } 2671 meta_ping_mnset(sp->setno); 2672 } 2673 2674 /* 2675 * Unlock set. This flushes the caches on the servers. 2676 */ 2677 cl_sk = cl_get_setkey(sp->setno, sp->setname); 2678 nd = sd->sd_nodelist; 2679 while (nd) { 2680 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 2681 nd = nd->nd_next; 2682 continue; 2683 } 2684 if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) { 2685 if (rval == 0) 2686 (void) mdstealerror(ep, &xep); 2687 else 2688 mdclrerror(&xep); 2689 rval = -1; 2690 } 2691 nd = nd->nd_next; 2692 } 2693 2694 /* 2695 * If this node is the last to join the diskset and clustering isn't 2696 * running, then resync the mirrors in the diskset. We have to wait 2697 * until all nodes are joined so that the status gets propagated to 2698 * all of the members of the set. 2699 * Ignore any error from the resync as the join function shouldn't fail 2700 * because the mirror resync had a problem. 2701 * 2702 * Don't start resync if set is stale. 2703 */ 2704 if ((rval == 0) && (sdssc_bind_library() != SDSSC_OKAY) && 2705 (stale_set != 1)) { 2706 nd = sd->sd_nodelist; 2707 while (nd) { 2708 if (!(nd->nd_flags & MD_MN_NODE_OWN)) 2709 break; 2710 nd = nd->nd_next; 2711 } 2712 /* 2713 * nd set to NULL means that we have no nodes in the set that 2714 * haven't joined. In this case we start the resync. 2715 */ 2716 if (nd == NULL) { 2717 (void) meta_mirror_resync_all(sp, 0, &xep); 2718 mdclrerror(&xep); 2719 } 2720 } 2721 2722 /* Update ABR state for all soft partitions */ 2723 (void) meta_sp_update_abr(sp, &xep); 2724 mdclrerror(&xep); 2725 2726 /* 2727 * call metaflushsetnames to reset local cache for master and 2728 * node information. 
2729 */ 2730 metaflushsetname(sp); 2731 2732 /* release signals back to what they were on entry */ 2733 if (procsigs(FALSE, &oldsigs, &xep) < 0) 2734 mdclrerror(&xep); 2735 2736 /* 2737 * If no error and stale_set is set, then set ep back 2738 * to ep from snarf_set call and return -3. If another error 2739 * occurred and rval is not 0, then that error would have 2740 * caused the node to be withdrawn from the set and would 2741 * have set ep to that error information. 2742 */ 2743 if ((rval == 0) && (stale_set)) { 2744 (void) mdstealerror(ep, &ep_snarf); 2745 return (-3); 2746 } 2747 2748 return (rval); 2749 } 2750 2751 /* 2752 * Entry point to withdraw a node from MultiNode diskset. 2753 * 2754 * Validate host in diskset. 2755 * - Should be joined into diskset. 2756 * Assume valid configuration is stored in the set/drive/node records 2757 * in the local mddb since no node or drive can be added to the MNset 2758 * unless all drives and nodes are available. Reconfig steps will 2759 * resync all ALIVE nodes in case of panic in critical areas. 2760 * 2761 * Lock down the set. 2762 * Verify that drives exist in configuration. 2763 * Verify host is a member of this diskset. 2764 * Verify host is an owner of the diskset (host is joined to diskset). 2765 * Only allow withdrawal of master node if master node is the only joined 2766 * in the diskset. 2767 * Halt the diskset on this node. 2768 * Reset Master on this node. 2769 * Updated node flags that this node with withdrawn. 2770 * Unlock the set. 2771 * 2772 * Return values: 2773 * 0 - Node successfully withdrew from set. 
2774 * -1 - Withdrawal attempted but failed 2775 * - any failure from libmeta calls 2776 * - node not in the member list 2777 * -2 - Withdrawal not attempted since 2778 * - this set had no drives in set 2779 * - this node not joined to set 2780 * - set is not a multinode set 2781 */ 2782 extern int 2783 meta_set_withdraw( 2784 mdsetname_t *sp, 2785 md_error_t *ep 2786 ) 2787 { 2788 md_set_desc *sd; 2789 md_drive_desc *dd = 0; 2790 md_mnnode_desc *nd, my_nd; 2791 int rval = 0; 2792 md_setkey_t *cl_sk; 2793 md_error_t xep = mdnullerror; 2794 int set_halted = 0; 2795 int suspendall_flag = 0; 2796 int suspend1_flag = 0; 2797 bool_t stale_bool = FALSE; 2798 mddb_config_t c; 2799 int node_id_list[1]; 2800 sigset_t oldsigs; 2801 int send_reinit = 0; 2802 2803 if ((sd = metaget_setdesc(sp, ep)) == NULL) { 2804 return (-1); 2805 } 2806 2807 /* Must be a multinode diskset */ 2808 if (!MD_MNSET_DESC(sd)) { 2809 (void) mderror(ep, MDE_NOT_MN, sp->setname); 2810 return (-1); 2811 } 2812 2813 /* Make sure we are blocking all signals */ 2814 if (procsigs(TRUE, &oldsigs, &xep) < 0) 2815 mdclrerror(&xep); 2816 2817 /* 2818 * Lock the set on current set members. 2819 * For MN diskset lock_set and SUSPEND are used to protect against 2820 * other meta* commands running on the other nodes. 2821 */ 2822 nd = sd->sd_nodelist; 2823 while (nd) { 2824 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 2825 nd = nd->nd_next; 2826 continue; 2827 } 2828 if (clnt_lock_set(nd->nd_nodename, sp, ep)) { 2829 rval = -1; 2830 goto out; 2831 } 2832 nd = nd->nd_next; 2833 } 2834 /* 2835 * Lock out other meta* commands by suspending 2836 * class 1 messages across the diskset. 
2837 */ 2838 nd = sd->sd_nodelist; 2839 while (nd) { 2840 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 2841 nd = nd->nd_next; 2842 continue; 2843 } 2844 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND, 2845 sp, MD_MSG_CLASS1, MD_MSCF_NO_FLAGS, ep)) { 2846 rval = -1; 2847 goto out; 2848 } 2849 suspend1_flag = 1; 2850 nd = nd->nd_next; 2851 } 2852 2853 /* Get list of drives - needed in case of failure */ 2854 if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST), 2855 ep)) == NULL) { 2856 /* Error getting drives in list */ 2857 if (! mdisok(ep)) { 2858 rval = -1; 2859 goto out2; 2860 } 2861 /* no drives in list */ 2862 rval = -2; 2863 goto out2; 2864 } 2865 2866 /* 2867 * Verify that this host is a member (in the host list) of the set. 2868 */ 2869 nd = sd->sd_nodelist; 2870 while (nd) { 2871 if (strcmp(mynode(), nd->nd_nodename) == 0) { 2872 break; 2873 } 2874 nd = nd->nd_next; 2875 } 2876 if (!nd) { 2877 (void) mddserror(ep, MDE_DS_NODENOTINSET, sp->setno, 2878 sd->sd_mn_mynode->nd_nodename, NULL, 2879 sp->setname); 2880 rval = -1; 2881 goto out2; 2882 } 2883 2884 /* 2885 * Call metaget_setownership that calls each node in diskset and 2886 * marks in set descriptor if node is an owner of the set or not. 2887 * metaget_setownership checks to see if a node is an owner by 2888 * checking to see if that node's kernel has the mddb loaded. 2889 * If a node had panic'd during a reconfig or an 2890 * add/delete/join/withdraw operation, the other nodes' node 2891 * records may not reflect the current state of the diskset, 2892 * so calling metaget_setownership is the safest thing to do. 2893 */ 2894 if (metaget_setownership(sp, ep) == -1) { 2895 rval = -1; 2896 goto out2; 2897 } 2898 2899 /* 2900 * Verify that this node is joined 2901 * to diskset (i.e. is an owner of the diskset). 
2902 */ 2903 if (!(sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN)) { 2904 rval = -2; 2905 goto out2; 2906 } 2907 2908 /* 2909 * For a MN diskset, only withdraw master if it is 2910 * the only joined node. 2911 */ 2912 if (sd->sd_mn_master_nodeid == sd->sd_mn_mynode->nd_nodeid) { 2913 nd = sd->sd_nodelist; 2914 while (nd) { 2915 /* Skip my node since checking for other owners */ 2916 if (nd->nd_nodeid == sd->sd_mn_master_nodeid) { 2917 nd = nd->nd_next; 2918 continue; 2919 } 2920 /* If another owner node if found, error */ 2921 if (nd->nd_flags & MD_MN_NODE_OWN) { 2922 (void) mddserror(ep, MDE_DS_WITHDRAWMASTER, 2923 sp->setno, 2924 sd->sd_mn_mynode->nd_nodename, NULL, 2925 sp->setname); 2926 rval = -1; 2927 goto out2; 2928 } 2929 nd = nd->nd_next; 2930 } 2931 } 2932 2933 /* 2934 * Is current set STALE? 2935 */ 2936 (void) memset(&c, 0, sizeof (c)); 2937 c.c_id = 0; 2938 c.c_setno = sp->setno; 2939 if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) { 2940 (void) mdstealerror(ep, &c.c_mde); 2941 rval = -1; 2942 goto out; 2943 } 2944 if (c.c_flags & MDDB_C_STALE) { 2945 stale_bool = TRUE; 2946 } 2947 2948 /* 2949 * Notify rpc.mdcommd on all nodes of a nodelist change. 2950 * Start by suspending rpc.mdcommd (which drains it of all messages), 2951 * then change the nodelist followed by a reinit and resume. 2952 */ 2953 nd = sd->sd_nodelist; 2954 while (nd) { 2955 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 2956 nd = nd->nd_next; 2957 continue; 2958 } 2959 2960 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND, 2961 sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, ep)) { 2962 rval = -1; 2963 goto out; 2964 } 2965 suspendall_flag = 1; 2966 nd = nd->nd_next; 2967 } 2968 2969 /* 2970 * Withdraw the set - halt set. 2971 * This will fail if any I/O is occuring to any metadevice which 2972 * includes a resync to a mirror metadevice. 2973 */ 2974 set_halted = 1; 2975 if (halt_set(sp, ep)) { 2976 /* Was set actually halted? 
*/ 2977 if (own_set(sp, NULL, TRUE, ep) == MD_SETOWNER_YES) { 2978 set_halted = 0; 2979 } 2980 rval = -1; 2981 goto out; 2982 } 2983 2984 /* Change to nodelist so need to send reinit to rpc.mdcommd */ 2985 send_reinit = 1; 2986 2987 /* Reset master on withdrawn node */ 2988 if (clnt_mnsetmaster(sd->sd_mn_mynode->nd_nodename, sp, "", 2989 MD_MN_INVALID_NID, ep)) { 2990 rval = -1; 2991 goto out; 2992 } 2993 2994 /* Mark my node as withdrawn and send to other nodes */ 2995 nd = sd->sd_nodelist; 2996 my_nd = *(sd->sd_mn_mynode); /* structure copy */ 2997 my_nd.nd_next = NULL; 2998 while (nd) { 2999 if (!(nd->nd_flags & MD_MN_NODE_OWN)) { 3000 nd = nd->nd_next; 3001 continue; 3002 } 3003 if (clnt_upd_nr_flags(nd->nd_nodename, sp, &my_nd, 3004 MD_NR_WITHDRAW, NULL, ep)) { 3005 rval = -1; 3006 goto out; 3007 } 3008 nd = nd->nd_next; 3009 } 3010 3011 /* 3012 * If withdrawn node is a mirror owner, reset mirror owner 3013 * to NULL. If an error occurs, print a warning and continue. 3014 * Don't fail metaset because of mirror owner reset problem since 3015 * next node to grab mirror will resolve this issue. 3016 * Before next node grabs mirrors, metaset will show the withdrawn 3017 * node as owner which is why an attempt to reset the mirror owner 3018 * is made. 
3019 */ 3020 node_id_list[0] = sd->sd_mn_mynode->nd_nodeid; /* Setup my nodeid */ 3021 nd = sd->sd_nodelist; 3022 while (nd) { 3023 if (!(nd->nd_flags & MD_MN_NODE_OWN)) { 3024 nd = nd->nd_next; 3025 continue; 3026 } 3027 if (clnt_reset_mirror_owner(nd->nd_nodename, sp, 3028 1, &node_id_list[0], &xep) == 01) { 3029 mde_perror(&xep, dgettext(TEXT_DOMAIN, 3030 "Unable to reset mirror owner on node %s"), 3031 nd->nd_nodename); 3032 mdclrerror(&xep); 3033 } 3034 nd = nd->nd_next; 3035 } 3036 3037 out: 3038 if (rval == -1) { 3039 /* Rejoin node - Mark node as joined and send to other nodes */ 3040 nd = sd->sd_nodelist; 3041 my_nd = *(sd->sd_mn_mynode); /* structure copy */ 3042 my_nd.nd_next = NULL; 3043 while (nd) { 3044 if (!(nd->nd_flags & MD_MN_NODE_OWN)) { 3045 nd = nd->nd_next; 3046 continue; 3047 } 3048 if (clnt_upd_nr_flags(nd->nd_nodename, sp, &my_nd, 3049 MD_NR_JOIN, NULL, &xep)) { 3050 mdclrerror(&xep); 3051 } 3052 nd = nd->nd_next; 3053 } 3054 3055 /* Set master on withdrawn node */ 3056 if (clnt_mnsetmaster(sd->sd_mn_mynode->nd_nodename, sp, 3057 sd->sd_mn_master_nodenm, 3058 sd->sd_mn_master_nodeid, &xep)) { 3059 mdclrerror(&xep); 3060 } 3061 3062 /* Join set if halt_set had succeeded */ 3063 if (set_halted) { 3064 /* 3065 * Causes mddbs to be loaded into the kernel. 3066 * Set the force flag so that replica locations can be 3067 * loaded into the kernel even if a mediator node was 3068 * unavailable. This allows a node to join an MO 3069 * diskset when there are sufficient replicas available, 3070 * but a mediator node in unavailable. 3071 */ 3072 if (setup_db_bydd(sp, dd, TRUE, &xep) == -1) { 3073 mdclrerror(&xep); 3074 } 3075 /* If set previously stale - make it so at re-join */ 3076 if (snarf_set(sp, stale_bool, &xep) != 0) { 3077 mdclrerror(&xep); 3078 (void) halt_set(sp, &xep); 3079 mdclrerror(&xep); 3080 } 3081 } 3082 } 3083 3084 /* 3085 * Notify rpc.mdcommd on all nodes of a nodelist change. 
3086 * Send reinit command to mdcommd which forces it to get 3087 * fresh set description. 3088 */ 3089 if (send_reinit) { 3090 /* Send reinit */ 3091 nd = sd->sd_nodelist; 3092 while (nd) { 3093 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 3094 nd = nd->nd_next; 3095 continue; 3096 } 3097 3098 /* Class is ignored for REINIT */ 3099 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT, 3100 sp, NULL, MD_MSCF_NO_FLAGS, &xep)) { 3101 /* 3102 * We are here because we failed to resume 3103 * rpc.mdcommd. However we potentially have 3104 * an error from the previous call. 3105 * If the previous call did fail, we 3106 * capture that error and generate a perror 3107 * withthe string, "Unable to resume...". 3108 * Setting rval to -1 ensures that in the 3109 * next iteration of the loop, ep is not 3110 * clobbered. 3111 */ 3112 if (rval == 0) 3113 (void) mdstealerror(ep, &xep); 3114 else 3115 mdclrerror(&xep); 3116 rval = -1; 3117 mde_perror(ep, dgettext(TEXT_DOMAIN, 3118 "Unable to reinit rpc.mdcommd.")); 3119 } 3120 nd = nd->nd_next; 3121 } 3122 } 3123 3124 out2: 3125 /* 3126 * Unlock diskset by resuming messages across the diskset. 3127 * Just resume all classes so that resume is the same whether 3128 * just one class was locked or all classes were locked. 3129 */ 3130 if ((suspend1_flag) || (suspendall_flag)) { 3131 nd = sd->sd_nodelist; 3132 while (nd) { 3133 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 3134 nd = nd->nd_next; 3135 continue; 3136 } 3137 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME, 3138 sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) { 3139 /* 3140 * We are here because we failed to resume 3141 * rpc.mdcommd. However we potentially have 3142 * an error from the previous call 3143 * If the previous call did fail, we capture 3144 * that error and generate a perror with 3145 * the string, "Unable to resume...". 3146 * Setting rval to -1 ensures that in the 3147 * next iteration of the loop, ep is not 3148 * clobbered. 
3149 */ 3150 if (rval == 0) 3151 (void) mdstealerror(ep, &xep); 3152 else 3153 mdclrerror(&xep); 3154 rval = -1; 3155 mde_perror(ep, dgettext(TEXT_DOMAIN, 3156 "Unable to resume rpc.mdcommd.")); 3157 } 3158 nd = nd->nd_next; 3159 } 3160 meta_ping_mnset(sp->setno); 3161 } 3162 3163 /* 3164 * Unlock set. This flushes the caches on the servers. 3165 */ 3166 cl_sk = cl_get_setkey(sp->setno, sp->setname); 3167 nd = sd->sd_nodelist; 3168 while (nd) { 3169 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 3170 nd = nd->nd_next; 3171 continue; 3172 } 3173 if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) { 3174 if (rval == 0) 3175 (void) mdstealerror(ep, &xep); 3176 else 3177 mdclrerror(&xep); 3178 rval = -1; 3179 } 3180 nd = nd->nd_next; 3181 } 3182 3183 /* 3184 * call metaflushsetnames to reset local cache for master and 3185 * node information. 3186 */ 3187 metaflushsetname(sp); 3188 3189 /* release signals back to what they were on entry */ 3190 if (procsigs(FALSE, &oldsigs, &xep) < 0) 3191 mdclrerror(&xep); 3192 3193 return (rval); 3194 3195 } 3196 3197 /* 3198 * Update nodelist with cluster member information. 3199 * A node not in the member list will be marked 3200 * as not ALIVE and not OWN. 3201 * A node in the member list will be marked ALIVE, but 3202 * the OWN bit will not be changed. 3203 * 3204 * If mynode isn't in the membership list, fail causing 3205 * another reconfig cycle to be started since a non-member 3206 * node shouldn't be taking part in the reconfig cycle. 3207 * 3208 * Return values: 3209 * 0 - No problem. 3210 * 1 - Any failure including RPC failure to my node. 3211 */ 3212 int 3213 meta_reconfig_update_nodelist( 3214 mdsetname_t *sp, 3215 mndiskset_membershiplist_t *nl, 3216 md_set_desc *sd, 3217 md_error_t *ep 3218 ) 3219 { 3220 mndiskset_membershiplist_t *nl2; 3221 md_mnnode_desc *nd; 3222 md_error_t xep = mdnullerror; 3223 int rval = 0; 3224 3225 /* 3226 * Walk through nodelist, checking to see if each 3227 * node is in the member list. 
3228 * If node is not a member, reset ALIVE and OWN node flag. 3229 * If node is a member, set ALIVE. 3230 * If mynode's OWN flag gets reset, then halt the diskset on this node. 3231 */ 3232 nd = sd->sd_nodelist; 3233 while (nd) { 3234 nl2 = nl; 3235 while (nl2) { 3236 /* If node is in member list, set ALIVE */ 3237 if (nl2->msl_node_id == nd->nd_nodeid) { 3238 nd->nd_flags |= MD_MN_NODE_ALIVE; 3239 break; 3240 } else { 3241 nl2 = nl2->next; 3242 } 3243 /* node is not in member list, mark !ALIVE and !OWN */ 3244 if (nl2 == NULL) { 3245 /* If node is mynode, then halt set if needed */ 3246 if (strcmp(mynode(), nd->nd_nodename) == 0) { 3247 /* 3248 * This shouldn't happen, but just 3249 * in case... Any node not in the 3250 * membership list should be dead and 3251 * not running reconfig step1. 3252 */ 3253 if (nd->nd_flags & MD_MN_NODE_OWN) { 3254 if (halt_set(sp, &xep)) { 3255 mde_perror(&xep, ""); 3256 mdclrerror(&xep); 3257 } 3258 } 3259 /* 3260 * Return failure since this node 3261 * (mynode) is not in the membership 3262 * list, but process the rest of the 3263 * nodelist first so that rpc.metad 3264 * can be updated with the latest 3265 * membership information. 3266 */ 3267 (void) mddserror(ep, 3268 MDE_DS_NOTINMEMBERLIST, 3269 sp->setno, nd->nd_nodename, NULL, 3270 sp->setname); 3271 rval = 1; 3272 } 3273 nd->nd_flags &= ~MD_MN_NODE_ALIVE; 3274 nd->nd_flags &= ~MD_MN_NODE_OWN; 3275 } 3276 } 3277 nd = nd->nd_next; 3278 } 3279 3280 /* Send this information to rpc.metad */ 3281 if (clnt_upd_nr_flags(mynode(), sp, sd->sd_nodelist, 3282 MD_NR_SET, MNSET_IN_RECONFIG, &xep)) { 3283 /* Return failure if can't send node flags to rpc.metad */ 3284 if (rval == 0) { 3285 (void) mdstealerror(ep, &xep); 3286 rval = 1; 3287 } 3288 } 3289 return (rval); 3290 } 3291 3292 /* 3293 * Choose master determines the master for a diskset. 
3294 * Each node determines the master on its own and 3295 * adds this information to its local rpc.metad nodelist 3296 * and also sends it to the kernel. 3297 * 3298 * Nodelist in set descriptor (sd) is sorted in 3299 * monotonically increasing sequence of nodeid. 3300 * 3301 * Return values: 3302 * 0 - No problem. 3303 * 205 - There was an RPC problem to another node. 3304 * -1 - There was an error. This could be an RPC error to my node. 3305 * This is a catastrophic failure causing node to panic. 3306 */ 3307 int 3308 meta_reconfig_choose_master_for_set( 3309 mdsetname_t *sp, 3310 md_set_desc *sd, 3311 md_error_t *ep 3312 ) 3313 { 3314 int is_owner; 3315 md_mnset_record *mnsr = NULL; 3316 int lowest_alive_nodeid = 0; 3317 uint_t master_nodeid; 3318 md_mnnode_desc *nd, *nd2; 3319 md_mnnode_record *nr; 3320 md_drive_desc *dd; 3321 md_setkey_t *cl_sk; 3322 int rval = 0; 3323 md_error_t xep = mdnullerror; 3324 mddb_setflags_config_t sf; 3325 3326 /* 3327 * Is current node joined to diskset? 3328 * Don't trust flags, really check to see if mddb is snarfed. 3329 */ 3330 if (s_ownset(sp->setno, ep) == MD_SETOWNER_YES) { 3331 /* 3332 * If a node is joined to the diskset, this node checks 3333 * to see if the current master of the diskset is valid and 3334 * is still in the membership list (ALIVE) and is 3335 * still joined (OWN). Need to verify if master is 3336 * really joined - don't trust the flags. (Can trust 3337 * ALIVE since set during earlier part of reconfig cycle.) 3338 * If the current master is valid, still in the membership 3339 * list and joined, then master is not changed on this node. 3340 * Just return. 3341 * 3342 * Verify that nodeid is valid before accessing masternode. 
3343 */ 3344 if ((sd->sd_mn_master_nodeid != MD_MN_INVALID_NID) && 3345 (sd->sd_mn_masternode->nd_flags & MD_MN_NODE_ALIVE)) { 3346 if (clnt_ownset(sd->sd_mn_master_nodenm, sp, 3347 &is_owner, ep) == -1) { 3348 /* If RPC failure to another node return 205 */ 3349 if ((mdanyrpcerror(ep)) && 3350 (sd->sd_mn_mynode->nd_nodeid != 3351 sd->sd_mn_master_nodeid)) { 3352 return (205); 3353 } else { 3354 /* Any other failure */ 3355 return (-1); 3356 } 3357 } else { 3358 if (is_owner == TRUE) { 3359 3360 meta_mc_log(MC_LOG5, dgettext( 3361 TEXT_DOMAIN, "Set %s previous " 3362 "master chosen %s (%d): %s"), 3363 sp->setname, 3364 sd->sd_mn_master_nodenm, 3365 sd->sd_mn_master_nodeid, 3366 meta_print_hrtime(gethrtime() - 3367 start_time)); 3368 3369 /* Previous master is ok - done */ 3370 return (0); 3371 } 3372 } 3373 } 3374 3375 /* 3376 * If current master is no longer in the membership list or 3377 * is no longer joined, then this node uses the following 3378 * algorithm: 3379 * - node calls RPC routine clnt_ownset to get latest 3380 * information on which nodes are owners of diskset. 3381 * clnt_ownset checks on each node to see if its kernel 3382 * has that diskset snarfed. 3383 */ 3384 nd = sd->sd_nodelist; 3385 while (nd) { 3386 /* Don't consider node that isn't in member list */ 3387 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 3388 nd = nd->nd_next; 3389 continue; 3390 } 3391 3392 if (clnt_ownset(nd->nd_nodename, sp, 3393 &is_owner, ep) == -1) { 3394 /* If RPC failure to another node return 205 */ 3395 if ((mdanyrpcerror(ep)) && 3396 (sd->sd_mn_mynode->nd_nodeid != 3397 nd->nd_nodeid)) { 3398 return (205); 3399 } else { 3400 /* Any other failure */ 3401 return (-1); 3402 } 3403 } 3404 3405 /* 3406 * Set owner flag for each node based on whether 3407 * that node really has a diskset mddb snarfed in 3408 * or not. 
3409 */ 3410 if (is_owner == TRUE) 3411 nd->nd_flags |= MD_MN_NODE_OWN; 3412 else 3413 nd->nd_flags &= ~MD_MN_NODE_OWN; 3414 3415 nd = nd->nd_next; 3416 } 3417 3418 /* 3419 * - node walks through nodelist looking for nodes that are 3420 * owners of the diskset that are in the membership list. 3421 * - for each owner, node calls RPC routine clnt_getset to 3422 * see if that node has its node record set to OK. 3423 * - If so, master is chosen to be this owner node. 3424 */ 3425 nd = sd->sd_nodelist; 3426 while (nd) { 3427 /* Don't consider node that isn't in member list */ 3428 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 3429 nd = nd->nd_next; 3430 continue; 3431 } 3432 3433 /* Don't consider a node that isn't an owner */ 3434 if (!(nd->nd_flags & MD_MN_NODE_OWN)) { 3435 nd = nd->nd_next; 3436 continue; 3437 } 3438 3439 /* Does node has its own node record set to OK? */ 3440 if (clnt_mngetset(nd->nd_nodename, sp->setname, 3441 MD_SET_BAD, &mnsr, ep) == -1) { 3442 /* If RPC failure to another node return 205 */ 3443 if ((mdanyrpcerror(ep)) && 3444 (sd->sd_mn_mynode->nd_nodeid != 3445 nd->nd_nodeid)) { 3446 return (205); 3447 } else { 3448 /* Any other failure */ 3449 return (-1); 3450 } 3451 } 3452 nr = mnsr->sr_nodechain; 3453 while (nr) { 3454 if (nd->nd_nodeid == nr->nr_nodeid) { 3455 if (nr->nr_flags & MD_MN_NODE_OK) { 3456 /* Found a master */ 3457 free_sr( 3458 (md_set_record *)mnsr); 3459 goto found_master; 3460 } 3461 } 3462 nr = nr->nr_next; 3463 } 3464 free_sr((md_set_record *)mnsr); 3465 nd = nd->nd_next; 3466 } 3467 3468 /* 3469 * - If no owner node has its own node record on its own node 3470 * set to OK, then this node checks all of the non-owner 3471 * nodes that are in the membership list. 3472 * - for each non-owner, node calls RPC routine clnt_getset to 3473 * see if that node has its node record set to OK. 3474 * - If set doesn't exist, don't choose node for master. 3475 * - If so, master is chosen to be this non-owner node. 
3476 * 3477 */ 3478 nd = sd->sd_nodelist; 3479 while (nd) { 3480 /* Don't consider node that isn't in member list */ 3481 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 3482 nd = nd->nd_next; 3483 continue; 3484 } 3485 3486 /* Only checking non-owner nodes this time around */ 3487 if (nd->nd_flags & MD_MN_NODE_OWN) { 3488 nd = nd->nd_next; 3489 continue; 3490 } 3491 3492 /* Does node has its own node record set to OK? */ 3493 if (clnt_mngetset(nd->nd_nodename, sp->setname, 3494 MD_SET_BAD, &mnsr, ep) == -1) { 3495 /* 3496 * If set doesn't exist on non-owner node, 3497 * don't consider this node for master. 3498 */ 3499 if (mdiserror(ep, MDE_NO_SET)) { 3500 nd = nd->nd_next; 3501 continue; 3502 } else if ((mdanyrpcerror(ep)) && 3503 (sd->sd_mn_mynode->nd_nodeid != 3504 nd->nd_nodeid)) { 3505 /* RPC failure to another node */ 3506 return (205); 3507 } else { 3508 /* Any other failure */ 3509 return (-1); 3510 } 3511 } 3512 nr = mnsr->sr_nodechain; 3513 while (nr) { 3514 if (nd->nd_nodeid == nr->nr_nodeid) { 3515 if (nr->nr_flags & MD_MN_NODE_OK) { 3516 /* Found a master */ 3517 free_sr( 3518 (md_set_record *)mnsr); 3519 goto found_master; 3520 } 3521 } 3522 nr = nr->nr_next; 3523 } 3524 free_sr((md_set_record *)mnsr); 3525 nd = nd->nd_next; 3526 } 3527 3528 /* 3529 * - If no node can be found that has its own node record on 3530 * its node to be set to OK, then all alive nodes 3531 * were in the process of being added to or deleted 3532 * from set. Each alive node will remove all 3533 * information pertaining to this set from its node. 3534 * 3535 * If all nodes in set are ALIVE, then call sdssc end routines 3536 * since set was truly being initially created or destroyed. 3537 */ 3538 goto delete_set; 3539 } else { 3540 3541 /* 3542 * If node is not joined to diskset, then this 3543 * node uses the following algorithm: 3544 * - If unjoined node doesn't have a node record for itself, 3545 * just delete the diskset since diskset was in the 3546 * process of being created. 
3547 * - node needs to find master of diskset before 3548 * reconfig cycle, if a master existed. 3549 * - node calls RPC routine clnt_ownset to get latest 3550 * information on which nodes are owners of diskset. 3551 * clnt_ownset checks on each node to see if its 3552 * kernel has that diskset snarfed. 3553 */ 3554 3555 /* 3556 * Is my node in the set description? 3557 * If not, delete the set from this node. 3558 * sr2setdesc sets sd_mn_mynode pointer to the node 3559 * descriptor for this node if there was a node 3560 * record for this node. 3561 * 3562 */ 3563 if (sd->sd_mn_mynode == NULL) { 3564 goto delete_set; 3565 } 3566 3567 nd = sd->sd_nodelist; 3568 while (nd) { 3569 /* Don't consider node that isn't in member list */ 3570 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 3571 nd = nd->nd_next; 3572 continue; 3573 } 3574 3575 if (clnt_ownset(nd->nd_nodename, sp, 3576 &is_owner, ep) == -1) { 3577 /* If RPC failure to another node return 205 */ 3578 if ((mdanyrpcerror(ep)) && 3579 (sd->sd_mn_mynode->nd_nodeid != 3580 nd->nd_nodeid)) { 3581 return (205); 3582 } else { 3583 /* Any other failure */ 3584 return (-1); 3585 } 3586 } 3587 3588 /* 3589 * Set owner flag for each node based on whether 3590 * that node really has a diskset mddb snarfed in 3591 * or not. 3592 */ 3593 if (is_owner == TRUE) 3594 nd->nd_flags |= MD_MN_NODE_OWN; 3595 else 3596 nd->nd_flags &= ~MD_MN_NODE_OWN; 3597 3598 nd = nd->nd_next; 3599 } 3600 3601 /* 3602 * - node walks through nodelist looking for nodes that 3603 * are owners of the diskset that are in 3604 * the membership list. 3605 * - for each owner, node calls RPC routine clnt_getset to 3606 * see if that node has a master set and to get the 3607 * diskset description. 3608 * - If the owner node has a set description that doesn't 3609 * include the non-joined node in the nodelist, this node 3610 * removes its set description of that diskset 3611 * (i.e. removes the set from its local mddbs). 
This is 3612 * handling the case of when a node was removed from a 3613 * diskset while it was not in the cluster membership 3614 * list. 3615 * - If that node has a master set and the master is in the 3616 * membership list and is an owner, then either this was 3617 * the master from before the reconfig cycle or this 3618 * node has already chosen a new master - either way, 3619 * the master value is valid as long as it is in the 3620 * membership list and is an owner 3621 * - master is chosen to be owner node's master 3622 */ 3623 nd = sd->sd_nodelist; 3624 while (nd) { 3625 /* Don't consider node that isn't in member list */ 3626 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 3627 nd = nd->nd_next; 3628 continue; 3629 } 3630 3631 /* Don't consider a node that isn't an owner */ 3632 if (!(nd->nd_flags & MD_MN_NODE_OWN)) { 3633 nd = nd->nd_next; 3634 continue; 3635 } 3636 3637 /* Get owner node's set record */ 3638 if (clnt_mngetset(nd->nd_nodename, sp->setname, 3639 MD_SET_BAD, &mnsr, ep) == -1) { 3640 /* If RPC failure to another node return 205 */ 3641 if ((mdanyrpcerror(ep)) && 3642 (sd->sd_mn_mynode->nd_nodeid != 3643 nd->nd_nodeid)) { 3644 return (205); 3645 } else { 3646 /* Any other failure */ 3647 return (-1); 3648 } 3649 } 3650 3651 /* Is this node in the owner node's set record */ 3652 nr = mnsr->sr_nodechain; 3653 while (nr) { 3654 if (sd->sd_mn_mynode->nd_nodeid == 3655 nr->nr_nodeid) { 3656 break; 3657 } 3658 nr = nr->nr_next; 3659 } 3660 if (nr == NULL) { 3661 /* my node not found - delete set */ 3662 free_sr((md_set_record *)mnsr); 3663 goto delete_set; 3664 } 3665 3666 /* Is owner's node's master valid? 
*/ 3667 master_nodeid = mnsr->sr_master_nodeid; 3668 free_sr((md_set_record *)mnsr); 3669 if (master_nodeid == MD_MN_INVALID_NID) { 3670 nd = nd->nd_next; 3671 continue; 3672 } 3673 3674 nd2 = sd->sd_nodelist; 3675 while (nd2) { 3676 if ((nd2->nd_nodeid == master_nodeid) && 3677 (nd2->nd_flags & MD_MN_NODE_ALIVE) && 3678 (nd2->nd_flags & MD_MN_NODE_OWN)) { 3679 nd = nd2; 3680 goto found_master; 3681 } 3682 nd2 = nd2->nd_next; 3683 } 3684 nd = nd->nd_next; 3685 } 3686 3687 /* 3688 * - If no owner node has a valid master, then follow 3689 * algorithm of when a node is joined to the diskset. 3690 * - node walks through nodelist looking for nodes that are 3691 * owners of the diskset that are in the membership list. 3692 * - for each owner, node calls RPC routine clnt_getset to 3693 * see if that node has its node record set to OK. 3694 * - If so, master is chosen to be this owner node. 3695 */ 3696 nd = sd->sd_nodelist; 3697 while (nd) { 3698 /* Don't consider node that isn't in member list */ 3699 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 3700 nd = nd->nd_next; 3701 continue; 3702 } 3703 3704 /* Don't consider a node that isn't an owner */ 3705 if (!(nd->nd_flags & MD_MN_NODE_OWN)) { 3706 nd = nd->nd_next; 3707 continue; 3708 } 3709 3710 /* Does node has its own node record set to OK? 
*/ 3711 if (clnt_mngetset(nd->nd_nodename, sp->setname, 3712 MD_SET_BAD, &mnsr, ep) == -1) { 3713 /* If RPC failure to another node return 205 */ 3714 if ((mdanyrpcerror(ep)) && 3715 (sd->sd_mn_mynode->nd_nodeid != 3716 nd->nd_nodeid)) { 3717 return (205); 3718 } else { 3719 /* Any other failure */ 3720 return (-1); 3721 } 3722 } 3723 nr = mnsr->sr_nodechain; 3724 while (nr) { 3725 if (nd->nd_nodeid == nr->nr_nodeid) { 3726 if (nr->nr_flags & MD_MN_NODE_OK) { 3727 /* Found a master */ 3728 free_sr( 3729 (md_set_record *)mnsr); 3730 goto found_master; 3731 } 3732 } 3733 nr = nr->nr_next; 3734 } 3735 free_sr((md_set_record *)mnsr); 3736 nd = nd->nd_next; 3737 } 3738 3739 /* 3740 * - If no owner node has its own node record on its own node 3741 * set to OK, then this node checks all of the non-owner 3742 * nodes that are in the membership list. 3743 * - for each non-owner, node calls RPC routine clnt_getset to 3744 * see if that node has its node record set to OK. 3745 * - If set doesn't exist, don't choose node for master. 3746 * - If this node doesn't exist in the nodelist on any of the 3747 * non-owner nodes, this node removes its set description 3748 * of that diskset (i.e. removes the set from its local 3749 * mddbs). This is handling the case of when a node was 3750 * removed from a diskset while it was not in the 3751 * cluster membership list. 3752 * - If non-owner node has its node record set to OK and if 3753 * this node hasn't removed this diskset (step directly 3754 * before this one), then the master is chosen to be this 3755 * non-owner node. 
3756 */ 3757 nd = sd->sd_nodelist; 3758 while (nd) { 3759 /* Don't consider node that isn't in member list */ 3760 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 3761 nd->nd_flags |= MD_MN_NODE_DEL; 3762 nd = nd->nd_next; 3763 continue; 3764 } 3765 3766 /* Don't consider owner nodes since none are OK */ 3767 if (nd->nd_flags & MD_MN_NODE_OWN) { 3768 nd->nd_flags |= MD_MN_NODE_DEL; 3769 nd = nd->nd_next; 3770 continue; 3771 } 3772 3773 /* 3774 * Don't need to get nodelist from my node since 3775 * this is where sd_nodelist was obtained. 3776 */ 3777 if (sd->sd_mn_mynode->nd_nodeid == nd->nd_nodeid) { 3778 nd = nd->nd_next; 3779 continue; 3780 } 3781 3782 /* 3783 * If node has already been decided against for 3784 * master, then skip it. 3785 */ 3786 if (nd->nd_flags & MD_MN_NODE_DEL) { 3787 nd = nd->nd_next; 3788 continue; 3789 } 3790 3791 /* 3792 * Does node in my nodelist have its own node 3793 * record marked OK on its node? And does node 3794 * in my nodelist exist on all other nodes? 3795 * Don't want to choose a node for master unless 3796 * that node is marked OK on its own node and that 3797 * node exists on all other alive nodes. 3798 * 3799 * This is guarding against the case when several 3800 * nodes are down and one of the downed nodes is 3801 * deleted from the diskset. When the down nodes 3802 * are rebooted into the cluster, you don't want 3803 * any node to pick the deleted node as the master. 3804 */ 3805 if (clnt_mngetset(nd->nd_nodename, sp->setname, 3806 MD_SET_BAD, &mnsr, ep) == -1) { 3807 /* 3808 * If set doesn't exist on non-owner node, 3809 * don't consider this node for master. 3810 */ 3811 if (mdiserror(ep, MDE_NO_SET)) { 3812 nd->nd_flags |= MD_MN_NODE_DEL; 3813 nd = nd->nd_next; 3814 continue; 3815 } else if (mdanyrpcerror(ep)) { 3816 /* RPC failure to another node */ 3817 return (205); 3818 } else { 3819 /* Any other failure */ 3820 return (-1); 3821 } 3822 } 3823 /* 3824 * Is my node in the nodelist gotten from the other 3825 * node? 
If not, then remove the set from my node 3826 * since set was deleted from my node while my node 3827 * was out of the cluster. 3828 */ 3829 nr = mnsr->sr_nodechain; 3830 while (nr) { 3831 if (sd->sd_mn_mynode->nd_nodeid == 3832 nr->nr_nodeid) { 3833 break; 3834 } 3835 nr = nr->nr_next; 3836 } 3837 if (nr == NULL) { 3838 /* my node not found - delete set */ 3839 free_sr((md_set_record *)mnsr); 3840 goto delete_set; 3841 } 3842 3843 /* Is node being checked marked OK on its own node? */ 3844 nr = mnsr->sr_nodechain; 3845 while (nr) { 3846 if (nd->nd_nodeid == nr->nr_nodeid) { 3847 if (!(nr->nr_flags & MD_MN_NODE_OK)) { 3848 nd->nd_flags |= MD_MN_NODE_DEL; 3849 } 3850 break; 3851 } 3852 nr = nr->nr_next; 3853 } 3854 /* 3855 * If node being checked doesn't exist on its 3856 * own node - don't choose it as master. 3857 */ 3858 if (nr == NULL) { 3859 nd->nd_flags |= MD_MN_NODE_DEL; 3860 } 3861 3862 /* 3863 * Check every node in my node's nodelist against 3864 * the nodelist gotten from the other node. 3865 * If a node in my node's nodelist is not found in the 3866 * other node's nodelist, then set the DEL flag. 3867 */ 3868 nd2 = sd->sd_nodelist; 3869 while (nd2) { 3870 nr = mnsr->sr_nodechain; 3871 while (nr) { 3872 if (nd2->nd_nodeid == nr->nr_nodeid) { 3873 break; 3874 } 3875 nr = nr->nr_next; 3876 } 3877 /* nd2 not found in other node's nodelist */ 3878 if (nr == NULL) { 3879 nd2->nd_flags |= MD_MN_NODE_DEL; 3880 } 3881 nd2 = nd2->nd_next; 3882 } 3883 3884 free_sr((md_set_record *)mnsr); 3885 nd = nd->nd_next; 3886 } 3887 3888 /* 3889 * Rescan list look for node that has not been marked DEL. 3890 * First node found is the master. 
3891 */ 3892 nd = sd->sd_nodelist; 3893 while (nd) { 3894 if (!(nd->nd_flags & MD_MN_NODE_DEL)) { 3895 break; 3896 } 3897 nd = nd->nd_next; 3898 continue; 3899 } 3900 if (nd) { 3901 /* Found a master */ 3902 goto found_master; 3903 } 3904 3905 /* 3906 * - If no node can be found that has its own node record on 3907 * its node to be set to OK, then all alive nodes 3908 * were in the process of being added to or deleted 3909 * from set. Each alive node will remove all 3910 * information pertaining to this set from its node. 3911 * 3912 * If all nodes in set are ALIVE, then call sdssc end routines 3913 * since set was truly being initially created or destroyed. 3914 */ 3915 goto delete_set; 3916 } 3917 3918 found_master: 3919 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 3920 "Set %s master chosen %s (%d): %s"), 3921 sp->setname, nd->nd_nodename, nd->nd_nodeid, 3922 meta_print_hrtime(gethrtime() - start_time)); 3923 3924 if (clnt_lock_set(mynode(), sp, ep) == -1) { 3925 return (-1); 3926 } 3927 3928 cl_sk = cl_get_setkey(sp->setno, sp->setname); 3929 3930 if (clnt_mnsetmaster(mynode(), sp, 3931 nd->nd_nodename, nd->nd_nodeid, ep)) { 3932 rval = -1; 3933 } else if (sd->sd_mn_mynode->nd_nodeid == nd->nd_nodeid) { 3934 /* If this node is new master, set flag in this node's kernel */ 3935 (void) memset(&sf, 0, sizeof (sf)); 3936 sf.sf_setno = sp->setno; 3937 sf.sf_setflags = MD_SET_MN_NEWMAS_RC; 3938 /* Use magic to help protect ioctl against attack. */ 3939 sf.sf_magic = MDDB_SETFLAGS_MAGIC; 3940 sf.sf_flags = MDDB_NM_SET; 3941 3942 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 3943 "Setting new master flag for set %s: %s"), 3944 sp->setname, meta_print_hrtime(gethrtime() - start_time)); 3945 3946 /* 3947 * Fail reconfig cycle if ioctl fails since it is critical 3948 * to set new master flag. 
3949 */ 3950 if (metaioctl(MD_MN_SET_SETFLAGS, &sf, &sf.sf_mde, 3951 NULL) != NULL) { 3952 (void) mdstealerror(ep, &sf.sf_mde); 3953 rval = -1; 3954 } 3955 } 3956 3957 if (clnt_unlock_set(mynode(), cl_sk, &xep) == -1) { 3958 if (rval == 0) { 3959 (void) mdstealerror(ep, &xep); 3960 rval = -1; 3961 } 3962 } 3963 3964 cl_set_setkey(NULL); 3965 3966 metaflushsetname(sp); 3967 3968 return (rval); 3969 3970 delete_set: 3971 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 3972 "Master not chosen, deleting set %s: %s"), 3973 sp->setname, meta_print_hrtime(gethrtime() - start_time)); 3974 3975 /* 3976 * Remove all set information from this node: 3977 * - node records for this set 3978 * - drive records for this set 3979 * - set record for this set 3980 * (Only do this on this node since each node 3981 * will do it for its own local mddb.) 3982 * 3983 * If all nodes in set are ALIVE, then 3984 * the lowest numbered ALIVE nodeid in set 3985 * (irregardless of whether an owner node or not) will 3986 * call the DCS service to cleanup for create/delete of set. 3987 * sdssc_create_end(cleanup) if set was being created or 3988 * sdssc_delete_end(cleanup) if set was being deleted. 3989 * A node record with flag ADD denotes a set being 3990 * created. A node record with flag DEL denotes a 3991 * set being deleted. 3992 */ 3993 nd = sd->sd_nodelist; 3994 while (nd) { 3995 /* Found a node that isn't alive */ 3996 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) 3997 break; 3998 3999 /* Is my node the lowest numbered ALIVE node? */ 4000 if (nd->nd_nodeid < sd->sd_mn_mynode->nd_nodeid) { 4001 break; 4002 } 4003 nd = nd->nd_next; 4004 } 4005 if (nd == NULL) { 4006 /* All nodes ALIVE and this is the lowest nodeid */ 4007 lowest_alive_nodeid = 1; 4008 } 4009 4010 if (clnt_lock_set(mynode(), sp, ep) == -1) { 4011 return (-1); 4012 } 4013 4014 4015 /* 4016 * If this node had been joined, withdraw and reset master. 
4017 * 4018 * This could happen if a node was being added to or removed 4019 * from a diskset and the node doing the add/delete operation and 4020 * all other nodes in the diskset have left the cluster. 4021 */ 4022 if (sd->sd_mn_mynode) { 4023 nd = sd->sd_mn_mynode; 4024 if (nd->nd_flags & MD_MN_NODE_OWN) { 4025 if (clnt_withdrawset(mynode(), sp, ep)) { 4026 rval = -1; 4027 goto out; 4028 } 4029 if (clnt_mnsetmaster(mynode(), sp, "", 4030 MD_MN_INVALID_NID, ep)) { 4031 rval = -1; 4032 goto out; 4033 } 4034 } 4035 } 4036 4037 /* 4038 * Remove side records for this node (side) from local mddb 4039 * (clnt_deldrvs does this) if there are drives in the set. 4040 * 4041 * Don't need to mark this node as DEL since already marked as 4042 * ADD or DEL (or this node would have been chosen as master). 4043 * Don't need to mark other node records, drive records or 4044 * set records as DEL. If a panic occurs during clnt_delset, 4045 * these records will be deleted the next time this node 4046 * becomes a member and goes through the reconfig cycle. 4047 */ 4048 /* Get the drive descriptors for this set */ 4049 if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST), 4050 ep)) == NULL) { 4051 if (! mdisok(ep)) { 4052 /* 4053 * Ignore and clear out any failures from 4054 * metaget_drivedesc since a panic could have 4055 * occurred when a node was partially added to a set. 4056 */ 4057 mdclrerror(ep); 4058 } 4059 } else { 4060 if (clnt_deldrvs(mynode(), sp, dd, ep)) { 4061 rval = -1; 4062 goto out; 4063 } 4064 } 4065 4066 /* 4067 * Now, delete the set - this removes the node, drive 4068 * and set records from the local mddb. 4069 */ 4070 if (clnt_delset(mynode(), sp, ep)) { 4071 rval = -1; 4072 goto out; 4073 } 4074 4075 out: 4076 cl_sk = cl_get_setkey(sp->setno, sp->setname); 4077 4078 /* 4079 * Ignore errors from unlock of set since set is no longer 4080 * known (if clnt_delset worked). 
4081 */ 4082 if (clnt_unlock_set(mynode(), cl_sk, &xep) == -1) { 4083 mdclrerror(&xep); 4084 } 4085 4086 cl_set_setkey(NULL); 4087 4088 metaflushsetname(sp); 4089 4090 /* 4091 * If this node is the lowest numbered nodeid then 4092 * call sdssc_create/delete_end depending on whether 4093 * this node is marked as ADD or DEL in the node record. 4094 */ 4095 if (lowest_alive_nodeid) { 4096 if (nd->nd_flags & MD_MN_NODE_ADD) 4097 sdssc_create_end(sp->setname, SDSSC_CLEANUP); 4098 else if (nd->nd_flags & MD_MN_NODE_DEL) 4099 sdssc_delete_end(sp->setname, SDSSC_CLEANUP); 4100 } 4101 4102 /* Finished with this set -- return */ 4103 return (rval); 4104 } 4105 4106 /* 4107 * Reconfig step to choose a new master for all MN disksets. 4108 * Return values: 4109 * 0 - Everything is great. 4110 * 1 - This node failed to reconfig. 4111 * 205 - Cause another reconfig due to a nodelist problem 4112 * or RPC failure to another node 4113 */ 4114 int 4115 meta_reconfig_choose_master( 4116 md_error_t *ep 4117 ) 4118 { 4119 set_t max_sets, setno; 4120 int nodecnt; 4121 mndiskset_membershiplist_t *nl; 4122 md_set_desc *sd; 4123 mdsetname_t *sp; 4124 int rval = 0; 4125 mddb_setflags_config_t sf; 4126 int start_node_delayed = 0; 4127 4128 if ((max_sets = get_max_sets(ep)) == 0) { 4129 mde_perror(ep, dgettext(TEXT_DOMAIN, 4130 "Unable to get number of sets")); 4131 return (1); 4132 } 4133 4134 /* 4135 * Get membershiplist from API routine. If there's 4136 * an error, return a 205 to cause another reconfig. 4137 */ 4138 if (meta_read_nodelist(&nodecnt, &nl, ep) == -1) { 4139 mde_perror(ep, ""); 4140 return (205); 4141 } 4142 4143 for (setno = 1; setno < max_sets; setno++) { 4144 if ((sp = metasetnosetname(setno, ep)) == NULL) { 4145 if (mdiserror(ep, MDE_NO_SET)) { 4146 /* No set for this setno - continue */ 4147 mdclrerror(ep); 4148 continue; 4149 } else { 4150 /* 4151 * If encountered an RPC error from my node, 4152 * then immediately fail. 
4153 */ 4154 if (mdanyrpcerror(ep)) { 4155 mde_perror(ep, ""); 4156 return (1); 4157 } 4158 /* Can't get set information */ 4159 mde_perror(ep, dgettext(TEXT_DOMAIN, 4160 "Unable to get information for " 4161 "set number %d"), setno); 4162 mdclrerror(ep); 4163 continue; 4164 } 4165 } 4166 4167 /* If setname is there, set desc should exist. */ 4168 if ((sd = metaget_setdesc(sp, ep)) == NULL) { 4169 /* 4170 * If encountered an RPC error from my node, 4171 * then immediately fail. 4172 */ 4173 if (mdanyrpcerror(ep)) { 4174 mde_perror(ep, ""); 4175 return (1); 4176 } 4177 mde_perror(ep, dgettext(TEXT_DOMAIN, 4178 "Unable to get set %s desc information"), 4179 sp->setname); 4180 mdclrerror(ep); 4181 continue; 4182 } 4183 4184 /* Only reconfig MN disksets */ 4185 if (!MD_MNSET_DESC(sd)) { 4186 continue; 4187 } 4188 4189 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 4190 "Begin choose master for set %s: %s"), 4191 sp->setname, meta_print_hrtime(gethrtime() - start_time)); 4192 4193 /* Update nodelist with member information. */ 4194 if (meta_reconfig_update_nodelist(sp, nl, sd, ep)) { 4195 /* 4196 * If encountered an RPC error from my node, 4197 * then immediately fail. 4198 */ 4199 if (mdanyrpcerror(ep)) { 4200 mde_perror(ep, ""); 4201 return (1); 4202 } 4203 mde_perror(ep, ""); 4204 mdclrerror(ep); 4205 continue; 4206 } 4207 4208 /* 4209 * If all nodes in a cluster are starting, then 4210 * all nodes will attempt to contact all other nodes 4211 * to determine a master node. This can lead to a 4212 * problem where node 1 is trying to contact the rpc.metad 4213 * node 2 and node 2 is trying to contact the rpc.metad 4214 * on node 1 -- and this causes the rpc call to fail 4215 * on both nodes and causes a new reconfig cycle. 4216 * 4217 * In order to break this problem, a newly starting node 4218 * will delay a small amount of time (nodeid mod 4 seconds) 4219 * and will then run the code to choose a master for the 4220 * first set. 
Delay will only be done once regardless of the 4221 * number of sets. 4222 */ 4223 if (start_node_delayed == 0) { 4224 (void) memset(&sf, 0, sizeof (sf)); 4225 sf.sf_setno = sp->setno; 4226 sf.sf_flags = MDDB_NM_GET; 4227 /* Use magic to help protect ioctl against attack. */ 4228 sf.sf_magic = MDDB_SETFLAGS_MAGIC; 4229 if ((metaioctl(MD_MN_GET_SETFLAGS, &sf, 4230 &sf.sf_mde, NULL) == 0) && 4231 ((sf.sf_setflags & MD_SET_MN_START_RC) == 4232 MD_SET_MN_START_RC)) { 4233 (void) sleep(sd->sd_mn_mynode->nd_nodeid % 4); 4234 } 4235 start_node_delayed = 1; 4236 } 4237 4238 /* Choose master for this set */ 4239 rval = meta_reconfig_choose_master_for_set(sp, sd, ep); 4240 if (rval == -1) { 4241 mde_perror(ep, ""); 4242 return (1); 4243 } else if (rval == 205) { 4244 mde_perror(ep, ""); 4245 return (205); 4246 } 4247 4248 /* Send new nodelist to rpc.mdcommd */ 4249 (void) mdmn_reinit_set(sp->setno); 4250 4251 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 4252 "Choose master for set %s completed: %s"), 4253 sp->setname, meta_print_hrtime(gethrtime() - start_time)); 4254 } 4255 4256 /* 4257 * Each node turns on I/Os for all MN disksets. 4258 * This is to recover from the situation where the master died 4259 * during a previous reconfig cycle when I/Os were suspended 4260 * for a MN diskset. 4261 * If a failure occurs return a 1 which will force this node to 4262 * panic. Cannot leave node in the situation where I/Os are 4263 * not resumed. 4264 */ 4265 setno = 0; /* 0 means all MN sets */ 4266 if (metaioctl(MD_MN_RESUME_SET, &setno, ep, NULL)) { 4267 mde_perror(ep, ""); 4268 return (1); 4269 } 4270 4271 /* Free the nodelist */ 4272 if (nodecnt) 4273 meta_free_nodelist(nl); 4274 4275 return (0); 4276 } 4277 4278 /* 4279 * meta_mnsync_user_records will synchronize the diskset user records across 4280 * all nodes in the diskset. The diskset user records are stored in 4281 * each node's local set mddb. 
4282 * 4283 * This needs to be done even if there is no master change during the 4284 * reconfig cycle since this routine should clean up any mess left by 4285 * the untimely termination of a metaset or metadb command (due to a 4286 * node panic or to user intervention). 4287 * 4288 * Caller is the Master node. 4289 * 4290 * Returns 0 - Success 4291 * 205 - Failure during RPC to another node 4292 * -1 - Any other failure and ep is filled in. 4293 */ 4294 int 4295 meta_mnsync_user_records( 4296 mdsetname_t *sp, 4297 md_error_t *ep 4298 ) 4299 { 4300 md_set_desc *sd; 4301 md_mnnode_desc *master_nodelist, *nd, *nd2, *ndtail; 4302 md_mnset_record *mnsr; 4303 md_mnsr_node_t *master_mnsr_node = NULL, *mnsr_node = NULL; 4304 md_mnnode_record *nr; 4305 md_drive_record *dr; 4306 int dr_cnt, dd_cnt; 4307 int found_my_nr; 4308 md_drive_desc *dd, *dd_prev, *master_dd, *other_dd; 4309 int all_drives_ok; 4310 int rval = 0; 4311 int max_genid = 0; 4312 int num_alive_nodes, num_alive_nodes_del = 0; 4313 int set_locked = 0; 4314 md_setkey_t *cl_sk; 4315 md_error_t xep = mdnullerror; 4316 char *anode[1]; 4317 mddb_setflags_config_t sf; 4318 4319 /* 4320 * Sync up node records first. 4321 * Construct a master nodelist using the nodelist from this 4322 * node's rpc.metad node records and then setting the state of each 4323 * node following these rules: 4324 * - If a node record is marked OK on its node, mark it OK 4325 * in the master nodelist (and later OK on all nodes) 4326 * If a node record is also marked OWN on its node, 4327 * mark it OWN in the master nodelist. 
4328 * - If a node record is not marked OK on its node, then mark 4329 * it as DEL in the master list (later deleting it) 4330 * - If node record doesn't exist on that node, then mark it DEL 4331 * (later deleting it) 4332 * - If set record doesn't exist on that node, mark node as DEL 4333 * - If a node record doesn't exist on all nodes, then mark it DEL 4334 * - If a node is not ALIVE, then 4335 * - If that node marked DEL on any node - mark it DEL 4336 * in master list but leave in nodelist 4337 * - If that node is marked as ADD on any node, mark it 4338 * ADD in the master list but leave in nodelist 4339 * - When that node returns to the living, the DEL 4340 * node record will be removed and the ADD node 4341 * record may be removed if marked ADD on that 4342 * node. 4343 * The key rule is to not remove a node from the nodelist until 4344 * that node record is removed from its own node. Do not want to 4345 * remove a node's record from all other nodes and then have 4346 * that node have its own record marked OK so that a node will pick 4347 * a different master than the other nodes. 4348 * 4349 * Next, 4350 * If node is ALIVE and node record is marked DEL in master nodelist, 4351 * remove node from set. 4352 * If node is ALIVE and node record is marked OK in master nodelist, 4353 * mark it OK on all other nodes. 4354 * If node is not ALIVE and node record is marked DEL in master 4355 * nodelist, mark it DEL on all other nodes. 4356 * If node is not ALIVE and node record is marked ADD in master, 4357 * nodelist, mark it ADD on all other nodes. 4358 */ 4359 if ((sd = metaget_setdesc(sp, ep)) == NULL) { 4360 return (-1); 4361 } 4362 master_nodelist = sd->sd_nodelist; 4363 4364 /* 4365 * Walk through nodelist creating a master nodelist. 
4366 */ 4367 num_alive_nodes = 0; 4368 nd = master_nodelist; 4369 while (nd) { 4370 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 4371 nd = nd->nd_next; 4372 continue; 4373 } 4374 num_alive_nodes++; 4375 if (clnt_mngetset(nd->nd_nodename, sp->setname, 4376 MD_SET_BAD, &mnsr, ep) == -1) { 4377 if (mdiserror(ep, MDE_NO_SET)) { 4378 /* set doesn't exist, mark node as DEL */ 4379 nd->nd_flags &= ~MD_MN_NODE_OK; 4380 nd->nd_flags &= ~MD_MN_NODE_ADD; 4381 nd->nd_flags |= MD_MN_NODE_DEL; 4382 nd->nd_flags |= MD_MN_NODE_NOSET; 4383 nd = nd->nd_next; 4384 continue; 4385 } else { 4386 /* If RPC failure to another node return 205 */ 4387 if ((mdanyrpcerror(ep)) && 4388 (sd->sd_mn_mynode->nd_nodeid != 4389 nd->nd_nodeid)) { 4390 rval = 205; 4391 } else { 4392 /* Any other failure */ 4393 rval = -1; 4394 } 4395 goto out; 4396 } 4397 } 4398 /* Find biggest genid in records for this diskset */ 4399 if (mnsr->sr_genid > max_genid) 4400 max_genid = mnsr->sr_genid; 4401 4402 dr = mnsr->sr_drivechain; 4403 while (dr) { 4404 /* Find biggest genid in records for this diskset */ 4405 if (dr->dr_genid > max_genid) { 4406 max_genid = dr->dr_genid; 4407 } 4408 dr = dr->dr_next; 4409 } 4410 4411 found_my_nr = 0; 4412 nr = mnsr->sr_nodechain; 4413 /* nr is the list of node recs from nd_nodename node */ 4414 while (nr) { 4415 /* Find biggest genid in records for this diskset */ 4416 if (nr->nr_genid > max_genid) 4417 max_genid = nr->nr_genid; 4418 nd2 = master_nodelist; 4419 ndtail = NULL; 4420 /* For each node record, is it in master list? */ 4421 while (nd2) { 4422 if (nd2->nd_nodeid == nr->nr_nodeid) 4423 break; 4424 if (nd2->nd_next == NULL) 4425 ndtail = nd2; 4426 nd2 = nd2->nd_next; 4427 } 4428 /* 4429 * Found node record not in master list -- add it 4430 * to list marking it as DEL since node record 4431 * should exist on all nodes unless a panic occurred 4432 * during addition or deletion of host to diskset. 
4433 */ 4434 if (nd2 == NULL) { 4435 nd2 = Zalloc(sizeof (*nd2)); 4436 (void) strcpy(nd2->nd_nodename, 4437 nr->nr_nodename); 4438 nd2->nd_flags = nr->nr_flags; 4439 nd2->nd_flags |= MD_MN_NODE_DEL; 4440 nd2->nd_nodeid = nr->nr_nodeid; 4441 nd2->nd_next = NULL; 4442 ndtail->nd_next = nd2; 4443 nd2 = NULL; 4444 nr = nr->nr_next; 4445 continue; 4446 } 4447 /* 4448 * Is this the node record for the node that 4449 * we requested the set desc from? 4450 * If so, check if node has its own node record 4451 * marked OK. If marked OK, check for the OWN bit. 4452 */ 4453 if (nr->nr_nodeid == nd->nd_nodeid) { 4454 found_my_nr = 1; 4455 if (nr->nr_flags & MD_MN_NODE_OK) { 4456 /* 4457 * If node record is marked OK 4458 * on its own node, then mark it OK 4459 * in the master list. Node record 4460 * would have to exist on all nodes 4461 * in the ADD state before it could 4462 * be put into the OK state. 4463 */ 4464 nd->nd_flags |= MD_MN_NODE_OK; 4465 nd->nd_flags &= 4466 ~(MD_MN_NODE_ADD | MD_MN_NODE_DEL); 4467 /* 4468 * Mark own in master list as marked 4469 * on own node. 4470 */ 4471 if (nr->nr_flags & MD_MN_NODE_OWN) 4472 nd->nd_flags |= MD_MN_NODE_OWN; 4473 else 4474 nd->nd_flags &= ~MD_MN_NODE_OWN; 4475 } else { 4476 /* Otherwise, mark node as DEL */ 4477 nd->nd_flags &= ~MD_MN_NODE_OK; 4478 nd->nd_flags &= ~MD_MN_NODE_ADD; 4479 nd->nd_flags |= MD_MN_NODE_DEL; 4480 } 4481 } 4482 /* 4483 * If node is not ALIVE and marked DEL 4484 * on any node, make it DEL in master list. 4485 * If node is not ALIVE and marked ADD 4486 * on any node, make it ADD in master list 4487 * unless node record has already been marked DEL. 
4488 */ 4489 if (!(nr->nr_flags & MD_MN_NODE_ALIVE)) { 4490 if (nr->nr_flags & MD_MN_NODE_ADD) { 4491 if (!(nd->nd_flags & MD_MN_NODE_DEL)) { 4492 /* If not DEL - mark it ADD */ 4493 nd->nd_flags |= MD_MN_NODE_ADD; 4494 nd->nd_flags &= ~MD_MN_NODE_OK; 4495 } 4496 } 4497 if (nr->nr_flags & MD_MN_NODE_DEL) { 4498 nd->nd_flags |= MD_MN_NODE_DEL; 4499 nd->nd_flags &= ~MD_MN_NODE_OK; 4500 /* Could already be ADD - make it DEL */ 4501 nd->nd_flags &= ~MD_MN_NODE_ADD; 4502 } 4503 } 4504 nr = nr->nr_next; 4505 } 4506 /* 4507 * If a node record doesn't exist on its own node, 4508 * then mark node as DEL. 4509 */ 4510 if (found_my_nr == 0) { 4511 nd->nd_flags &= ~MD_MN_NODE_OK; 4512 nd->nd_flags |= MD_MN_NODE_DEL; 4513 } 4514 4515 /* 4516 * If node is OK - put mnsr onto master_mnsr_node list for 4517 * later use when syncing up the drive records in the set. 4518 */ 4519 if (nd->nd_flags & MD_MN_NODE_OK) { 4520 mnsr_node = Zalloc(sizeof (*mnsr_node)); 4521 mnsr_node->mmn_mnsr = mnsr; 4522 (void) strncpy(mnsr_node->mmn_nodename, 4523 nd->nd_nodename, MD_MAX_MNNODENAME_PLUS_1); 4524 mnsr_node->mmn_next = master_mnsr_node; 4525 master_mnsr_node = mnsr_node; 4526 } else { 4527 free_sr((struct md_set_record *)mnsr); 4528 } 4529 4530 nd = nd->nd_next; 4531 } 4532 4533 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 4534 "Master nodelist created for set %s: %s"), 4535 sp->setname, meta_print_hrtime(gethrtime() - start_time)); 4536 4537 /* 4538 * Send master nodelist to the rpc.metad on all nodes (including 4539 * myself) and each node will update itself. This will set the 4540 * ADD and DEL flags on each node as setup in the master nodelist. 4541 * Don't send nodelist to node where set doesn't exist. 
4542 */ 4543 nd = master_nodelist; 4544 while (nd) { 4545 if (!(nd->nd_flags & MD_MN_NODE_ALIVE) || 4546 (nd->nd_flags & MD_MN_NODE_NOSET)) { 4547 nd = nd->nd_next; 4548 continue; 4549 } 4550 if (clnt_upd_nr_flags(nd->nd_nodename, sp, 4551 master_nodelist, MD_NR_SET, MNSET_IN_RECONFIG, ep)) { 4552 /* If RPC failure to another node return 205 */ 4553 if ((mdanyrpcerror(ep)) && 4554 (sd->sd_mn_mynode->nd_nodeid != 4555 nd->nd_nodeid)) { 4556 rval = 205; 4557 } else { 4558 /* Any other failure */ 4559 rval = -1; 4560 } 4561 goto out; 4562 } 4563 nd = nd->nd_next; 4564 } 4565 4566 /* 4567 * Now, delete nodes that need to be deleted. 4568 */ 4569 if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST), 4570 ep)) == NULL) { 4571 if (! mdisok(ep)) { 4572 rval = -1; 4573 goto out; 4574 } 4575 } 4576 4577 /* 4578 * May be doing lots of RPC commands to the nodes, so lock the 4579 * ALIVE members of the set since most of the rpc.metad routines 4580 * require this for security reasons. 4581 */ 4582 nd = master_nodelist; 4583 while (nd) { 4584 /* Skip non-alive nodes and node without set */ 4585 if (!(nd->nd_flags & MD_MN_NODE_ALIVE) || 4586 (nd->nd_flags & MD_MN_NODE_NOSET)) { 4587 nd = nd->nd_next; 4588 continue; 4589 } 4590 if (clnt_lock_set(nd->nd_nodename, sp, ep)) { 4591 /* If RPC failure to another node return 205 */ 4592 if ((mdanyrpcerror(ep)) && 4593 (sd->sd_mn_mynode->nd_nodeid != 4594 nd->nd_nodeid)) { 4595 rval = 205; 4596 } else { 4597 /* Any other failure */ 4598 rval = -1; 4599 } 4600 goto out; 4601 } 4602 set_locked = 1; 4603 nd = nd->nd_next; 4604 } 4605 4606 nd = master_nodelist; 4607 while (nd) { 4608 /* Skip non-alive nodes */ 4609 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 4610 nd = nd->nd_next; 4611 continue; 4612 } 4613 if (nd->nd_flags & MD_MN_NODE_DEL) { 4614 num_alive_nodes_del++; 4615 /* 4616 * Delete this node rec from all ALIVE nodes in diskset. 
4617 */ 4618 nd2 = master_nodelist; 4619 while (nd2) { 4620 /* Skip non-alive nodes and node without set */ 4621 if (!(nd2->nd_flags & MD_MN_NODE_ALIVE) || 4622 (nd2->nd_flags & MD_MN_NODE_NOSET)) { 4623 nd2 = nd2->nd_next; 4624 continue; 4625 } 4626 4627 /* This is a node being deleted from set */ 4628 if (nd2->nd_nodeid == nd->nd_nodeid) { 4629 /* Mark set record as DEL */ 4630 if (clnt_upd_sr_flags(nd->nd_nodename, 4631 sp, MD_SR_DEL, ep)) { 4632 /* RPC failure to !my node */ 4633 if ((mdanyrpcerror(ep)) && 4634 (sd->sd_mn_mynode-> 4635 nd_nodeid 4636 != nd->nd_nodeid)) { 4637 rval = 205; 4638 } else { 4639 /* Any other failure */ 4640 rval = -1; 4641 } 4642 goto out; 4643 } 4644 if (clnt_deldrvs(nd->nd_nodename, sp, 4645 dd, ep)) { 4646 /* RPC failure to !my node */ 4647 if ((mdanyrpcerror(ep)) && 4648 (sd->sd_mn_mynode-> 4649 nd_nodeid 4650 != nd->nd_nodeid)) { 4651 rval = 205; 4652 } else { 4653 /* Any other failure */ 4654 rval = -1; 4655 } 4656 goto out; 4657 } 4658 if (clnt_delset(nd->nd_nodename, sp, 4659 ep) == -1) { 4660 /* RPC failure to !my node */ 4661 if ((mdanyrpcerror(ep)) && 4662 (sd->sd_mn_mynode-> 4663 nd_nodeid 4664 != nd->nd_nodeid)) { 4665 rval = 205; 4666 } else { 4667 /* Any other failure */ 4668 rval = -1; 4669 } 4670 goto out; 4671 } 4672 } else { 4673 /* 4674 * Delete host from sets on hosts 4675 * not being deleted. 
4676 */ 4677 anode[0] = Strdup(nd->nd_nodename); 4678 if (clnt_delhosts(nd2->nd_nodename, sp, 4679 1, anode, ep) == -1) { 4680 Free(anode[0]); 4681 /* RPC failure to !my node */ 4682 if ((mdanyrpcerror(ep)) && 4683 (sd->sd_mn_mynode-> 4684 nd_nodeid 4685 != nd2->nd_nodeid)) { 4686 rval = 205; 4687 } else { 4688 /* Any other failure */ 4689 rval = -1; 4690 } 4691 goto out; 4692 } 4693 4694 meta_mc_log(MC_LOG5, 4695 dgettext(TEXT_DOMAIN, 4696 "Deleted node %s (%d) on node %s " 4697 "from set %s: %s"), 4698 nd->nd_nodename, nd->nd_nodeid, 4699 nd2->nd_nodename, 4700 sp->setname, 4701 meta_print_hrtime( 4702 gethrtime() - start_time)); 4703 4704 Free(anode[0]); 4705 } 4706 nd2 = nd2->nd_next; 4707 } 4708 } 4709 nd = nd->nd_next; 4710 } 4711 4712 nd = master_nodelist; 4713 cl_sk = cl_get_setkey(sp->setno, sp->setname); 4714 while (nd) { 4715 /* Skip non-alive nodes and node without set */ 4716 if (!(nd->nd_flags & MD_MN_NODE_ALIVE) || 4717 (nd->nd_flags & MD_MN_NODE_NOSET)) { 4718 nd = nd->nd_next; 4719 continue; 4720 } 4721 if (clnt_unlock_set(nd->nd_nodename, cl_sk, ep)) { 4722 /* If RPC failure to another node return 205 */ 4723 if ((mdanyrpcerror(ep)) && 4724 (sd->sd_mn_mynode->nd_nodeid != 4725 nd->nd_nodeid)) { 4726 rval = 205; 4727 } else { 4728 /* Any other failure */ 4729 rval = -1; 4730 } 4731 goto out; 4732 } 4733 nd = nd->nd_next; 4734 } 4735 cl_set_setkey(NULL); 4736 set_locked = 0; 4737 4738 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 4739 "Nodelist syncronization complete for set %s: %s"), 4740 sp->setname, meta_print_hrtime(gethrtime() - start_time)); 4741 4742 metaflushsetname(sp); 4743 4744 /* 4745 * If all alive nodes have been deleted from set, just 4746 * return since nothing else can be done until non-alive 4747 * nodes (if there are any) rejoin the cluster. 4748 */ 4749 if (num_alive_nodes == num_alive_nodes_del) { 4750 rval = 0; 4751 goto out; 4752 } 4753 4754 /* 4755 * Sync up drive records. 
4756 * 4757 * If a node panic'd (or metaset command was killed) during the 4758 * addition or deletion of a drive to the diskset, the nodes 4759 * may have a different view of the drive list. During cleanup 4760 * of the drive list during reconfig, a drive will be deleted 4761 * from the list if the master node sees that the drive has been 4762 * marked in the ADD state on any node or is marked in the DEL state 4763 * on all nodes. 4764 * This cleanup must occur even if all nodes in the cluster are 4765 * not part of the cluster so that all nodes have the same view 4766 * of the drivelist. 4767 * Then if the entire cluster goes down and comes back up, the 4768 * new master node could be a node that wasn't in the cluster when 4769 * the node was deleted. This could lead to a situation where the 4770 * master node thinks that a drive is OK, but this drive isn't 4771 * known to the other nodes. 4772 * This situation can also occur during the addition of a drive 4773 * where a node has the drive marked OK, but the node executing the 4774 * metaset command encountered a failure before marking that drive OK 4775 * on the rest of the nodes. If the node with the OK drive then 4776 * panics, then the rest of the nodes will remove that drive marked ADD 4777 * and when the node with the OK drive rejoins the cluster, it will 4778 * have a drive marked OK that is unknown by the other nodes. 4779 * 4780 * There are 2 situations to consider: 4781 * A) Master knows about a drive that other nodes don't know about. 4782 * B) At least one slave node knows about a drive that the master 4783 * node doesn't know about. 4784 * 4785 * To handle these situations the following steps are followed: 4786 * 1) Count number of drives known by this master node and the 4787 * other slave nodes. 4788 * If all nodes have the same number of drives and the master has 4789 * all drives marked OK, then skip to step4.
4790 * 4791 * 2) If a node has fewer drives listed than the master, the master 4792 * must get the drive descriptor list from that node so that 4793 * master can determine which drive it needs to delete from that 4794 * node. Master must get the drive descriptor list since the 4795 * drive record list does not contain the name of the drive, but 4796 * only a key and the key can only be interpreted on that other 4797 * node. 4798 * 4799 * 3) The master will then create the master drive list by doing: 4800 * - Master starts with drive list known by master. 4801 * - Any drive marked ADD will be removed from the list. 4802 * - Any drive not known by another node (from step2) will be 4803 * removed from the drive list. 4804 * - If a drive is marked DEL on the master, the master must 4805 * verify that the drive record is marked DEL on all nodes. 4806 * If any node has the drive record marked OK, mark it OK 4807 * on the master. (The reason why is described below). 4808 * 4809 * 4) The master sends out the master drive list and the slave 4810 * nodes will force their drive lists to match the master 4811 * drive list by deleting drives, if necessary and by changing 4812 * the drive record states from ADD->OK if master has drive 4813 * marked OK and slave has drive marked ADD. 4814 * 4815 * Interesting scenarios: 4816 * 4817 * 1) System has 4 nodes with node 1 as the master. Node 3 starts 4818 * to delete a drive record (drive record on node 1 is marked DEL), 4819 * but is stopped when node 3 panics. Node 1 also panics. 4820 * During reconfig cycle, node 2 is picked as master and the drive 4821 * record is left alone since all nodes in the cluster have it 4822 * marked OK. User now sees drive as part of diskset. 4823 * Now, entire cluster is rebooted and node 1 rejoins the cluster. 4824 * Node 1 is picked as the master and node 1 has drive record 4825 * marked DEL. 
Node 1 contacts all other nodes in the cluster 4826 * and since at least one node has the drive record marked OK, 4827 * the master marks the drive record OK. 4828 * User continues to see the drive as part of the diskset. 4829 */ 4830 4831 /* Reget set descriptor since flushed above */ 4832 if ((sd = metaget_setdesc(sp, ep)) == NULL) { 4833 rval = -1; 4834 goto out; 4835 } 4836 4837 /* Has side effect of setting sd->sd_drvs to same as master_dd */ 4838 if ((master_dd = metaget_drivedesc_sideno(sp, 4839 sd->sd_mn_mynode->nd_nodeid, 4840 (MD_BASICNAME_OK | PRINT_FAST), ep)) == NULL) { 4841 /* No drives in list */ 4842 if (!mdisok(ep)) { 4843 /* 4844 * Can't get drive list for this node, so 4845 * return -1 causing this node to be removed 4846 * cluster config and fixed. 4847 */ 4848 rval = -1; 4849 goto out; 4850 } 4851 } 4852 4853 /* Count the number of drives for all nodes */ 4854 mnsr_node = master_mnsr_node; 4855 while (mnsr_node) { 4856 dr_cnt = 0; 4857 dr = mnsr_node->mmn_mnsr->sr_drivechain; 4858 while (dr) { 4859 dr_cnt++; 4860 dr = dr->dr_next; 4861 } 4862 mnsr_node->mmn_numdrives = dr_cnt; 4863 mnsr_node = mnsr_node->mmn_next; 4864 } 4865 4866 /* Count the number of drives for the master; also check flags */ 4867 all_drives_ok = 1; 4868 dd_cnt = 0; 4869 dd = master_dd; 4870 while (dd) { 4871 dd_cnt++; 4872 if (!(dd->dd_flags & MD_DR_OK)) 4873 all_drives_ok = 0; 4874 dd = dd->dd_next; 4875 } 4876 4877 /* If all drives are ok, do quick check against number of drives */ 4878 if (all_drives_ok) { 4879 /* If all nodes have same number of drives, almost done */ 4880 mnsr_node = master_mnsr_node; 4881 while (mnsr_node) { 4882 if (mnsr_node->mmn_numdrives != dd_cnt) 4883 break; 4884 mnsr_node = mnsr_node->mmn_next; 4885 } 4886 /* All nodes have same number of drives, just send flags */ 4887 if (mnsr_node == NULL) { 4888 goto send_drive_list; 4889 } 4890 } 4891 4892 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 4893 "Begin detailed drive synchronization for set %s: 
%s"), 4894 sp->setname, meta_print_hrtime(gethrtime() - start_time)); 4895 4896 /* Detailed check required */ 4897 mnsr_node = master_mnsr_node; 4898 while (mnsr_node) { 4899 /* Does slave node have less drives than master? */ 4900 if (mnsr_node->mmn_numdrives < dd_cnt) { 4901 /* Yes - must determine which drive is missing */ 4902 if (clnt_getdrivedesc(mnsr_node->mmn_nodename, sp, 4903 &other_dd, ep)) { 4904 /* RPC failure to !my node */ 4905 if ((mdanyrpcerror(ep)) && 4906 (strcmp(mynode(), mnsr_node->mmn_nodename) 4907 != 0)) { 4908 rval = 205; 4909 } else { 4910 /* Any other failure */ 4911 rval = -1; 4912 } 4913 mde_perror(ep, dgettext(TEXT_DOMAIN, 4914 "Master node %s unable to " 4915 "retrieve drive list from node %s"), 4916 mynode(), mnsr_node->mmn_nodename); 4917 goto out; 4918 } 4919 mnsr_node->mmn_dd = other_dd; 4920 dd = master_dd; 4921 while (dd) { 4922 if (!(dd->dd_flags & MD_DR_OK)) { 4923 dd = dd->dd_next; 4924 continue; 4925 } 4926 other_dd = mnsr_node->mmn_dd; 4927 while (other_dd) { 4928 /* Convert to devids, when available */ 4929 if (strcmp(other_dd->dd_dnp->cname, 4930 dd->dd_dnp->cname) == 0) { 4931 break; 4932 } 4933 other_dd = other_dd->dd_next; 4934 } 4935 /* 4936 * dd not found on slave so mark it 4937 * ADD for later deletion (drives in ADD 4938 * state are deleted later in this routine). 
4939 */ 4940 if (other_dd == NULL) { 4941 dd->dd_flags = MD_DR_ADD; 4942 } 4943 dd = dd->dd_next; 4944 } 4945 4946 } 4947 mnsr_node = mnsr_node->mmn_next; 4948 } 4949 4950 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 4951 "Drive check completed for set %s: %s"), 4952 sp->setname, meta_print_hrtime(gethrtime() - start_time)); 4953 4954 dd = master_dd; 4955 dd_prev = 0; 4956 while (dd) { 4957 /* Remove any ADD drives from list */ 4958 if (dd->dd_flags & MD_DR_ADD) { 4959 if (dd_prev) { 4960 dd_prev->dd_next = dd->dd_next; 4961 dd->dd_next = NULL; 4962 metafreedrivedesc(&dd); 4963 dd = dd_prev->dd_next; 4964 } else { 4965 /* 4966 * If removing drive descriptor from head 4967 * of linked list, also change sd->sd_drvs. 4968 */ 4969 master_dd = sd->sd_drvs = dd->dd_next; 4970 dd->dd_next = NULL; 4971 metafreedrivedesc(&dd); 4972 dd = master_dd; 4973 } 4974 /* dd setup in if/else above */ 4975 continue; 4976 } 4977 /* 4978 * If drive is marked DEL, check all other nodes. 4979 * If drive on another node is marked OK, mark drive OK 4980 * in master list. If drive is marked DEL or doesn't exist 4981 * on all nodes, remove drive from list. 
4982 */ 4983 if (dd->dd_flags & MD_DR_DEL) { 4984 mnsr_node = master_mnsr_node; 4985 while (mnsr_node) { 4986 if (mnsr_node->mmn_dd == NULL) { 4987 if (clnt_getdrivedesc( 4988 mnsr_node->mmn_nodename, sp, 4989 &other_dd, ep)) { 4990 /* RPC failure to !my node */ 4991 if ((mdanyrpcerror(ep)) && 4992 (strcmp(mynode(), 4993 mnsr_node->mmn_nodename) 4994 != 0)) { 4995 rval = 205; 4996 } else { 4997 /* Any other failure */ 4998 rval = -1; 4999 } 5000 mde_perror(ep, dgettext(TEXT_DOMAIN, 5001 "Master node %s unable " 5002 "to retrieve drive list from " 5003 "node %s"), mynode(), 5004 mnsr_node->mmn_nodename); 5005 goto out; 5006 } 5007 mnsr_node->mmn_dd = other_dd; 5008 } 5009 other_dd = mnsr_node->mmn_dd; 5010 while (other_dd) { 5011 /* Found drive (OK) from other node */ 5012 if (strcmp(dd->dd_dnp->cname, 5013 other_dd->dd_dnp->cname) 5014 == 0) { 5015 /* Drive marked OK */ 5016 if (other_dd->dd_flags & 5017 MD_DR_OK) { 5018 dd->dd_flags = MD_DR_OK; 5019 } 5020 break; 5021 } 5022 other_dd = other_dd->dd_next; 5023 } 5024 if (dd->dd_flags == MD_DR_OK) 5025 break; 5026 5027 mnsr_node = mnsr_node->mmn_next; 5028 } 5029 /* 5030 * If no node had this drive marked OK, delete it. 5031 */ 5032 if (dd->dd_flags & MD_DR_DEL) { 5033 if (dd_prev) { 5034 dd_prev->dd_next = dd->dd_next; 5035 dd->dd_next = NULL; 5036 metafreedrivedesc(&dd); 5037 dd = dd_prev->dd_next; 5038 } else { 5039 /* 5040 * If removing drive descriptor from 5041 * head of linked list, also change 5042 * sd->sd_drvs. 
5043 */ 5044 master_dd = sd->sd_drvs = dd->dd_next; 5045 dd->dd_next = NULL; 5046 metafreedrivedesc(&dd); 5047 dd = master_dd; 5048 } 5049 /* dd setup in if/else above */ 5050 continue; 5051 } 5052 } 5053 dd_prev = dd; 5054 dd = dd->dd_next; 5055 } 5056 5057 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 5058 "Setting drive states completed for set %s: %s"), 5059 sp->setname, meta_print_hrtime(gethrtime() - start_time)); 5060 5061 send_drive_list: 5062 /* 5063 * Set genid on all drives to be the highest value seen. 5064 */ 5065 dd = master_dd; 5066 while (dd) { 5067 dd->dd_genid = max_genid; 5068 dd = dd->dd_next; 5069 } 5070 /* 5071 * Send updated drive list to all alive nodes. 5072 * Will also set genid on set and node records to have same 5073 * as the drive records. 5074 */ 5075 nd = sd->sd_nodelist; 5076 while (nd) { 5077 /* Skip non-alive nodes */ 5078 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 5079 nd = nd->nd_next; 5080 continue; 5081 } 5082 if (clnt_upd_dr_reconfig(nd->nd_nodename, sp, master_dd, ep)) { 5083 /* RPC failure to another node */ 5084 if ((mdanyrpcerror(ep)) && 5085 (sd->sd_mn_mynode->nd_nodeid != nd->nd_nodeid)) { 5086 rval = 205; 5087 } else { 5088 /* Any other failure */ 5089 rval = -1; 5090 } 5091 goto out; 5092 } 5093 nd = nd->nd_next; 5094 } 5095 5096 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 5097 "Sent drive list to all nodes for set %s: %s"), 5098 sp->setname, meta_print_hrtime(gethrtime() - start_time)); 5099 5100 /* 5101 * If no drive records left in set and nodes had been joined, 5102 * withdraw the nodes. Always reset the master and mark 5103 * all nodes as withdrawn on all nodes. 5104 */ 5105 if (master_dd == NULL) { 5106 /* Reset new master flag since no longer master */ 5107 (void) memset(&sf, 0, sizeof (sf)); 5108 sf.sf_setno = sp->setno; 5109 sf.sf_setflags = MD_SET_MN_NEWMAS_RC; 5110 sf.sf_flags = MDDB_NM_RESET; 5111 /* Use magic to help protect ioctl against attack. 
*/ 5112 sf.sf_magic = MDDB_SETFLAGS_MAGIC; 5113 /* Ignore failure, failure to reset flag isn't catastrophic */ 5114 (void) metaioctl(MD_MN_SET_SETFLAGS, &sf, 5115 &sf.sf_mde, NULL); 5116 5117 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 5118 "Reset new master flag for " "set %s: %s"), 5119 sp->setname, meta_print_hrtime(gethrtime() - start_time)); 5120 5121 nd = sd->sd_nodelist; 5122 while (nd) { 5123 /* Skip non-alive nodes */ 5124 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 5125 nd = nd->nd_next; 5126 continue; 5127 } 5128 5129 if (clnt_lock_set(nd->nd_nodename, sp, ep)) { 5130 /* RPC failure to another node */ 5131 if ((mdanyrpcerror(ep)) && 5132 (sd->sd_mn_mynode->nd_nodeid != 5133 nd->nd_nodeid)) { 5134 rval = 205; 5135 } else { 5136 /* Any other failure */ 5137 rval = -1; 5138 } 5139 goto out; 5140 } 5141 set_locked = 1; 5142 5143 /* Withdraw node from set if owner */ 5144 if ((nd->nd_flags & MD_MN_NODE_OWN) && 5145 (clnt_withdrawset(nd->nd_nodename, sp, ep))) { 5146 /* RPC failure to another node */ 5147 if ((mdanyrpcerror(ep)) && 5148 (sd->sd_mn_mynode->nd_nodeid != 5149 nd->nd_nodeid)) { 5150 rval = 205; 5151 } else { 5152 /* Any other failure */ 5153 rval = -1; 5154 } 5155 goto out; 5156 } 5157 5158 /* Mark all nodes as withdrawn on this node */ 5159 if (clnt_upd_nr_flags(nd->nd_nodename, sp, 5160 sd->sd_nodelist, MD_NR_WITHDRAW, NULL, ep)) { 5161 /* RPC failure to another node */ 5162 if ((mdanyrpcerror(ep)) && 5163 (sd->sd_mn_mynode->nd_nodeid != 5164 nd->nd_nodeid)) { 5165 rval = 205; 5166 } else { 5167 /* Any other failure */ 5168 rval = -1; 5169 } 5170 goto out; 5171 } 5172 5173 /* Resets master to no-master on this node */ 5174 if (clnt_mnsetmaster(nd->nd_nodename, sp, 5175 "", MD_MN_INVALID_NID, ep)) { 5176 /* RPC failure to another node */ 5177 if ((mdanyrpcerror(ep)) && 5178 (sd->sd_mn_mynode->nd_nodeid != 5179 nd->nd_nodeid)) { 5180 rval = 205; 5181 } else { 5182 /* Any other failure */ 5183 rval = -1; 5184 } 5185 goto out; 5186 } 5187 5188 cl_sk = 
cl_get_setkey(sp->setno, sp->setname); 5189 if (clnt_unlock_set(nd->nd_nodename, cl_sk, ep)) { 5190 /* RPC failure to another node */ 5191 if ((mdanyrpcerror(ep)) && 5192 (sd->sd_mn_mynode->nd_nodeid != 5193 nd->nd_nodeid)) { 5194 rval = 205; 5195 } else { 5196 /* Any other failure */ 5197 rval = -1; 5198 } 5199 goto out; 5200 } 5201 set_locked = 0; 5202 nd = nd->nd_next; 5203 } 5204 } 5205 5206 out: 5207 /* 5208 * If got here and set is still locked, then an error has 5209 * occurred and master_nodelist is still valid. 5210 * If error is not an RPC error, then unlock. 5211 * If error is an RPC error, skip unlocks since this could cause 5212 * yet another RPC timeout if a node has failed. 5213 * Ignore failures in unlock since unlock is just trying to 5214 * clean things up. 5215 */ 5216 if ((set_locked) && !(mdanyrpcerror(ep))) { 5217 nd = master_nodelist; 5218 cl_sk = cl_get_setkey(sp->setno, sp->setname); 5219 while (nd) { 5220 /* Skip non-alive nodes */ 5221 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 5222 nd = nd->nd_next; 5223 continue; 5224 } 5225 /* 5226 * If clnt_unlock fails, just break out since next 5227 * reconfig cycle will reset the locks anyway. 5228 */ 5229 if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) { 5230 break; 5231 } 5232 nd = nd->nd_next; 5233 } 5234 cl_set_setkey(NULL); 5235 } 5236 /* Free master_mnsr and drive descs */ 5237 mnsr_node = master_mnsr_node; 5238 while (mnsr_node) { 5239 master_mnsr_node = mnsr_node->mmn_next; 5240 free_sr((md_set_record *)mnsr_node->mmn_mnsr); 5241 free_rem_dd(mnsr_node->mmn_dd); 5242 Free(mnsr_node); 5243 mnsr_node = master_mnsr_node; 5244 } 5245 5246 /* Frees sd->sd_drvs (which is also master_dd) */ 5247 metaflushsetname(sp); 5248 return (rval); 5249 } 5250 5251 /* 5252 * meta_mnsync_diskset_mddbs 5253 * Calling node is guaranteed to be an owner node. 5254 * Calling node is the master node. 5255 * 5256 * Master node verifies that ondisk mddb format matches its incore format. 
5257 * If no nodes are joined to set, remove the change log entries. 5258 * If a node is joined to set, play the change log. 5259 * 5260 * Returns 0 - Success 5261 * 1 - Master unable to join to set. 5262 * 205 - Failure during RPC to another node 5263 * -1 - Any other failure and ep is filled in. 5264 * -1 return will eventually cause node to panic 5265 * in a SunCluster environment. 5266 */ 5267 int 5268 meta_mnsync_diskset_mddbs( 5269 mdsetname_t *sp, 5270 md_error_t *ep 5271 ) 5272 { 5273 md_set_desc *sd; 5274 mddb_config_t c; 5275 md_mn_msgclass_t class; 5276 mddb_setflags_config_t sf; 5277 md_mnnode_desc *nd, *nd2; 5278 md_error_t xep = mdnullerror; 5279 int stale_set = 0; 5280 5281 /* If setname is there, set desc should exist. */ 5282 if ((sd = metaget_setdesc(sp, ep)) == NULL) { 5283 mde_perror(ep, dgettext(TEXT_DOMAIN, 5284 "Unable to get set %s desc information"), sp->setname); 5285 return (-1); 5286 } 5287 5288 /* Are there drives in the set? */ 5289 if (metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST), 5290 ep) == NULL) { 5291 if (! mdisok(ep)) { 5292 return (-1); 5293 } 5294 /* No drives in set -- nothing to sync up */ 5295 return (0); 5296 } 5297 5298 /* 5299 * Is master node (which is this node) joined to set? 5300 * If master node isn't joined (which means that no nodes 5301 * are joined to diskset), remove the change log entries 5302 * since no need to replay them - all nodes will have same 5303 * view of mddbs since all nodes are reading in the mddbs 5304 * from disk. 5305 * There is also no need to sync up the master and ondisk mddbs 5306 * since master has no incore knowledge. 5307 * Need to join master to set in order to flush the change 5308 * log entries. Don't need to block I/O during join of master 5309 * to set since no other nodes are joined to set and so no I/O 5310 * can be occurring. 
5311 */ 5312 if (!(sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN)) { 5313 /* Join master to set */ 5314 if (clnt_joinset(mynode(), sp, 5315 MNSET_IN_RECONFIG, ep)) { 5316 if (mdismddberror(ep, MDE_DB_STALE)) { 5317 /* 5318 * If STALE, print message and continue on. 5319 * Don't do any writes or reads to mddbs 5320 * so don't clear change log. 5321 */ 5322 mde_perror(ep, dgettext(TEXT_DOMAIN, 5323 "Join of master node to STALE set %s"), 5324 sp->setname); 5325 stale_set = 1; 5326 mdclrerror(ep); 5327 } else if (mdismddberror(ep, MDE_DB_ACCOK)) { 5328 /* ACCOK means mediator provided extra vote */ 5329 mdclrerror(ep); 5330 } else { 5331 /* 5332 * If master is unable to join set, print an 5333 * error message. Don't return failure or node 5334 * will panic during cluster reconfig cycle. 5335 * Also, withdraw node from set in order to 5336 * cleanup from failed join attempt. 5337 */ 5338 mde_perror(ep, dgettext(TEXT_DOMAIN, 5339 "Join of master node in set %s failed"), 5340 sp->setname); 5341 if (clnt_withdrawset(mynode(), sp, &xep)) 5342 mdclrerror(&xep); 5343 return (1); 5344 } 5345 } 5346 /* 5347 * Master node successfully joined. 5348 * Set local copy of flags to OWN and 5349 * send owner flag to rpc.metad. If not stale, 5350 * flush the change log. 
5351 */ 5352 sd->sd_mn_mynode->nd_flags |= MD_MN_NODE_OWN; 5353 if (clnt_upd_nr_flags(mynode(), sp, sd->sd_nodelist, MD_NR_SET, 5354 MNSET_IN_RECONFIG, ep)) { 5355 mde_perror(ep, dgettext(TEXT_DOMAIN, 5356 "Flag update of master node join in set %s failed"), 5357 sp->setname); 5358 return (-1); 5359 } 5360 5361 if (!stale_set) { 5362 if (mdmn_reset_changelog(sp, ep, 5363 MDMN_CLF_RESETLOG) != 0) { 5364 mde_perror(ep, dgettext(TEXT_DOMAIN, 5365 "Unable to reset changelog.")); 5366 return (-1); 5367 } 5368 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 5369 "Removed changelog entries for set %s: %s"), 5370 sp->setname, 5371 meta_print_hrtime(gethrtime() - start_time)); 5372 } 5373 /* Reset new master flag before return */ 5374 (void) memset(&sf, 0, sizeof (sf)); 5375 sf.sf_setno = sp->setno; 5376 sf.sf_setflags = MD_SET_MN_NEWMAS_RC; 5377 sf.sf_flags = MDDB_NM_RESET; 5378 /* Use magic to help protect ioctl against attack. */ 5379 sf.sf_magic = MDDB_SETFLAGS_MAGIC; 5380 /* Ignore failure, failure to reset flag isn't catastrophic */ 5381 (void) metaioctl(MD_MN_SET_SETFLAGS, &sf, 5382 &sf.sf_mde, NULL); 5383 5384 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 5385 "Reset new master flag for set %s: %s"), 5386 sp->setname, meta_print_hrtime(gethrtime() - start_time)); 5387 5388 return (0); 5389 } 5390 5391 /* 5392 * Is master already joined to STALE set (< 50% mddbs avail)? 5393 * If so, can make no config changes to mddbs so don't check or play 5394 * changelog and don't sync master node to ondisk mddbs. 5395 * To get out of the stale state all nodes must be withdrawn 5396 * from set. Then as nodes are re-joined, all nodes will 5397 * have same view of mddbs since all nodes are reading the 5398 * mddbs from disk. 
5399 */ 5400 (void) memset(&c, 0, sizeof (c)); 5401 c.c_id = 0; 5402 c.c_setno = sp->setno; 5403 if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) { 5404 (void) mdstealerror(ep, &c.c_mde); 5405 return (-1); 5406 } 5407 if (c.c_flags & MDDB_C_STALE) { 5408 return (0); 5409 } 5410 5411 /* 5412 * If this node is NOT a newly chosen master, then there's 5413 * nothing else to do since the change log should be empty and 5414 * the ondisk and incore mddbs are already consistent. 5415 * 5416 * A newly chosen master is a node that was not the master 5417 * at the beginning of the reconfig cycle. If a node is a new 5418 * master, then the new master state is reset after the ondisk 5419 * and incore mddbs are consistent and the change log has 5420 * been replayed. 5421 */ 5422 (void) memset(&sf, 0, sizeof (sf)); 5423 sf.sf_setno = sp->setno; 5424 sf.sf_flags = MDDB_NM_GET; 5425 /* Use magic to help protect ioctl against attack. */ 5426 sf.sf_magic = MDDB_SETFLAGS_MAGIC; 5427 if ((metaioctl(MD_MN_GET_SETFLAGS, &sf, &sf.sf_mde, NULL) == 0) && 5428 ((sf.sf_setflags & MD_SET_MN_NEWMAS_RC) == 0)) { 5429 return (0); 5430 } 5431 5432 /* 5433 * Now, sync up incore master view to ondisk mddbs. 5434 * This is needed in the case where a master node 5435 * had made a change to the mddb, but this change 5436 * may not have been relayed to the slaves yet. 5437 * So, the new master needs to verify that the ondisk 5438 * mddbs match what the new master has incore - 5439 * if different, new master rewrites all of the mddbs. 5440 * Then the new master will replay the changelog and the 5441 * new master will then execute what the old master had 5442 * done. 5443 * 5444 * Block all I/Os to disks in this diskset on all nodes in 5445 * the diskset. This will allow the rewriting of the mddbs 5446 * (if needed), to proceed in a timely manner. 5447 * 5448 * If block of I/Os fail, return a -1. 
5449 */ 5450 5451 nd = sd->sd_nodelist; 5452 while (nd) { 5453 /* Skip non-alive and non-owner nodes */ 5454 if ((!(nd->nd_flags & MD_MN_NODE_ALIVE)) || 5455 (!(nd->nd_flags & MD_MN_NODE_OWN))) { 5456 nd = nd->nd_next; 5457 continue; 5458 } 5459 if (clnt_mn_susp_res_io(nd->nd_nodename, sp->setno, 5460 MN_SUSP_IO, ep)) { 5461 mde_perror(ep, dgettext(TEXT_DOMAIN, 5462 "Unable to suspend I/O on node %s in set %s"), 5463 nd->nd_nodename, sp->setname); 5464 5465 /* 5466 * Resume all other nodes that had been suspended. 5467 * (Reconfig return step also resumes I/Os 5468 * for all sets.) 5469 */ 5470 nd2 = sd->sd_nodelist; 5471 while (nd2) { 5472 /* Stop when reaching failed node */ 5473 if (nd2->nd_nodeid == nd->nd_nodeid) 5474 break; 5475 /* Skip non-alive and non-owner nodes */ 5476 if ((!(nd2->nd_flags & MD_MN_NODE_ALIVE)) || 5477 (!(nd2->nd_flags & MD_MN_NODE_OWN))) { 5478 nd2 = nd2->nd_next; 5479 continue; 5480 } 5481 (void) (clnt_mn_susp_res_io(nd2->nd_nodename, 5482 sp->setno, MN_RES_IO, &xep)); 5483 nd2 = nd2->nd_next; 5484 } 5485 5486 /* 5487 * If an RPC failure on another node, return a 205. 5488 * Otherwise, exit with failure. 5489 */ 5490 if ((mdanyrpcerror(ep)) && 5491 (sd->sd_mn_mynode->nd_nodeid != 5492 nd->nd_nodeid)) { 5493 return (205); 5494 } else { 5495 return (-1); 5496 } 5497 5498 } 5499 nd = nd->nd_next; 5500 } 5501 5502 (void) memset(&c, 0, sizeof (c)); 5503 c.c_id = 0; 5504 c.c_setno = sp->setno; 5505 /* Master can't sync up to ondisk mddbs? Kick it out of cluster */ 5506 if (metaioctl(MD_MN_CHK_WRT_MDDB, &c, &c.c_mde, NULL) != 0) 5507 return (-1); 5508 5509 /* 5510 * Resume I/Os that were suspended above. 
5511 */ 5512 nd = sd->sd_nodelist; 5513 while (nd) { 5514 /* Skip non-alive and non-owner nodes */ 5515 if ((!(nd->nd_flags & MD_MN_NODE_ALIVE)) || 5516 (!(nd->nd_flags & MD_MN_NODE_OWN))) { 5517 nd = nd->nd_next; 5518 continue; 5519 } 5520 if (clnt_mn_susp_res_io(nd->nd_nodename, sp->setno, 5521 MN_RES_IO, ep)) { 5522 mde_perror(ep, dgettext(TEXT_DOMAIN, 5523 "Unable to resume I/O on node %s in set %s"), 5524 nd->nd_nodename, sp->setname); 5525 5526 /* 5527 * If an RPC failure then don't do any 5528 * more RPC calls, since one timeout is enough 5529 * to endure. If RPC failure to another node, return 5530 * 205. If RPC failure to my node, return -1. 5531 * If not an RPC failure, continue resuming the 5532 * rest of the nodes and then return -1. 5533 */ 5534 if (mdanyrpcerror(ep)) { 5535 if (sd->sd_mn_mynode->nd_nodeid == 5536 nd->nd_nodeid) { 5537 return (-1); 5538 } else { 5539 return (205); 5540 } 5541 } 5542 5543 /* 5544 * If not an RPC error, continue resuming rest of 5545 * nodes, ignoring any failures except for an 5546 * RPC failure which constitutes an immediate exit. 5547 * Start in middle of list with failing node. 5548 */ 5549 nd2 = nd->nd_next; 5550 while (nd2) { 5551 /* Skip non-alive and non-owner nodes */ 5552 if ((!(nd2->nd_flags & MD_MN_NODE_ALIVE)) || 5553 (!(nd2->nd_flags & MD_MN_NODE_OWN))) { 5554 nd2 = nd2->nd_next; 5555 continue; 5556 } 5557 (void) (clnt_mn_susp_res_io(nd2->nd_nodename, 5558 sp->setno, MN_RES_IO, &xep)); 5559 if (mdanyrpcerror(&xep)) { 5560 return (-1); 5561 } 5562 nd2 = nd2->nd_next; 5563 } 5564 } 5565 nd = nd->nd_next; 5566 } 5567 5568 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, "Master node has completed " 5569 "checking/writing the mddb for set %s: %s"), sp->setname, 5570 meta_print_hrtime(gethrtime() - start_time)); 5571 5572 /* 5573 * Send (aka replay) all messages we find in the changelog. 
5574 * Flag the messages with 5575 * MD_MSGF_REPLAY_MSG, so no new message ID is generated for them 5576 * MD_MSGF_OVERRIDE_SUSPEND so they can pass the suspended commd. 5577 */ 5578 for (class = MD_MN_NCLASSES - 1; class > 0; class--) { 5579 mdmn_changelog_record_t *lr; 5580 md_error_t xep = mdnullerror; 5581 md_mn_result_t *resultp = NULL; 5582 int ret; 5583 5584 lr = mdmn_get_changelogrec(sp->setno, class); 5585 if ((lr->lr_flags & MD_MN_LR_INUSE) == 0) { 5586 /* no entry for this class */ 5587 continue; 5588 } 5589 5590 meta_mc_log(MC_LOG1, dgettext(TEXT_DOMAIN, 5591 "replaying message ID=(%d, 0x%llx-%d)\n"), 5592 MSGID_ELEMS(lr->lr_msg.msg_msgid)); 5593 5594 ret = mdmn_send_message_with_msgid( 5595 lr->lr_msg.msg_setno, 5596 lr->lr_msg.msg_type, 5597 lr->lr_msg.msg_flags | MD_MSGF_REPLAY_MSG | 5598 MD_MSGF_OVERRIDE_SUSPEND, 5599 lr->lr_msg.msg_event_data, 5600 lr->lr_msg.msg_event_size, 5601 &resultp, 5602 &lr->lr_msg.msg_msgid, 5603 &xep); 5604 5605 meta_mc_log(MC_LOG1, dgettext(TEXT_DOMAIN, 5606 "mdmn_send_message returned %d\n"), ret); 5607 5608 if (resultp) 5609 free_result(resultp); 5610 } 5611 5612 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 5613 "Playing changelog completed for set %s: %s"), 5614 sp->setname, meta_print_hrtime(gethrtime() - start_time)); 5615 5616 /* 5617 * Now that new master has ondisk and incore mddbs in sync, reset 5618 * this node's new master kernel flag (for this set). If this node 5619 * re-enters another reconfig cycle before the completion of this 5620 * reconfig cycle, this master node won't need to check if the ondisk 5621 * and incore mddbs are in sync since this node won't be considered 5622 * a new master (since this flag is being reset here in the middle of 5623 * step2). This will save time during any subsequent reconfig 5624 * cycles as long as this node continues to be master. 
5625 */ 5626 (void) memset(&sf, 0, sizeof (sf)); 5627 sf.sf_setno = sp->setno; 5628 sf.sf_setflags = MD_SET_MN_NEWMAS_RC; 5629 sf.sf_flags = MDDB_NM_RESET; 5630 /* Use magic to help protect ioctl against attack. */ 5631 sf.sf_magic = MDDB_SETFLAGS_MAGIC; 5632 /* Ignore failure, since failure to reset flag isn't catastrophic */ 5633 (void) metaioctl(MD_MN_SET_SETFLAGS, &sf, &sf.sf_mde, NULL); 5634 5635 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 5636 "Reset new master flag for set %s: %s"), 5637 sp->setname, meta_print_hrtime(gethrtime() - start_time)); 5638 5639 return (0); 5640 } 5641 5642 /* 5643 * meta_mnjoin_all will join all starting nodes in the diskset. 5644 * A starting node is considered to be any node that is not 5645 * an owner of the set but is a member of the cluster. 5646 * Master node is already joined to set (done in meta_mnsync_diskset_mddbs). 5647 * 5648 * Caller is the Master node. 5649 * 5650 * Returns 0 - Success 5651 * 205 - Failure during RPC to another node 5652 * -1 - Any other failure and ep is filled in. 5653 */ 5654 int 5655 meta_mnjoin_all( 5656 mdsetname_t *sp, 5657 md_error_t *ep 5658 ) 5659 { 5660 md_set_desc *sd; 5661 md_mnnode_desc *nd, *nd2; 5662 int rval = 0; 5663 int stale_flag = 0; 5664 mddb_config_t c; 5665 int susp_res_flag = 0; 5666 md_error_t xep = mdnullerror; 5667 5668 /* If setname is there, set desc should exist. */ 5669 if ((sd = metaget_setdesc(sp, ep)) == NULL) { 5670 mde_perror(ep, dgettext(TEXT_DOMAIN, 5671 "Unable to get set %s desc information"), sp->setname); 5672 return (-1); 5673 } 5674 5675 /* Are there drives in the set? */ 5676 if (metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST), 5677 ep) == NULL) { 5678 if (! mdisok(ep)) { 5679 return (-1); 5680 } 5681 /* No drives in set -- nothing to join */ 5682 return (0); 5683 } 5684 5685 /* 5686 * Is set currently stale? 
5687 */ 5688 (void) memset(&c, 0, sizeof (c)); 5689 c.c_id = 0; 5690 c.c_setno = sp->setno; 5691 /* Ignore failure since master node may not be joined yet */ 5692 (void) metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL); 5693 if (c.c_flags & MDDB_C_STALE) { 5694 stale_flag = MNSET_IS_STALE; 5695 } 5696 5697 /* 5698 * If any nodes are going to be joined to diskset, then 5699 * suspend I/O to all disks in diskset so that nodes can join 5700 * (read in mddbs) in a reasonable amount of time even under 5701 * high I/O load. Don't need to do this if set is STALE since 5702 * no I/O can be occurring to a STALE set. 5703 */ 5704 if (stale_flag != MNSET_IS_STALE) { 5705 nd = sd->sd_nodelist; 5706 while (nd) { 5707 /* Found a node that will be joined to diskset */ 5708 if ((nd->nd_flags & MD_MN_NODE_ALIVE) && 5709 (!(nd->nd_flags & MD_MN_NODE_OWN))) { 5710 /* Set flag that diskset should be suspended */ 5711 susp_res_flag = 1; 5712 break; 5713 } 5714 nd = nd->nd_next; 5715 } 5716 } 5717 5718 if (susp_res_flag) { 5719 /* 5720 * Block all I/Os to disks in this diskset on all joined 5721 * nodes in the diskset. 5722 * If block of I/Os fails due to an RPC failure on another 5723 * node, return 205; otherwise, return -1. 5724 */ 5725 nd = sd->sd_nodelist; 5726 while (nd) { 5727 /* Skip non-alive and non-owner nodes */ 5728 if ((!(nd->nd_flags & MD_MN_NODE_ALIVE)) || 5729 (!(nd->nd_flags & MD_MN_NODE_OWN))) { 5730 nd = nd->nd_next; 5731 continue; 5732 } 5733 if (clnt_mn_susp_res_io(nd->nd_nodename, sp->setno, 5734 MN_SUSP_IO, ep)) { 5735 mde_perror(ep, dgettext(TEXT_DOMAIN, 5736 "Unable to suspend I/O on node %s" 5737 " in set %s"), nd->nd_nodename, 5738 sp->setname); 5739 /* 5740 * Resume other nodes that had been suspended. 5741 * (Reconfig return step also resumes I/Os 5742 * for all sets.) 
5743 */ 5744 nd2 = sd->sd_nodelist; 5745 while (nd2) { 5746 /* Stop when reaching failed node */ 5747 if (nd2->nd_nodeid == nd->nd_nodeid) 5748 break; 5749 /* Skip non-alive/non-owner nodes */ 5750 if ((!(nd2->nd_flags & 5751 MD_MN_NODE_ALIVE)) || 5752 (!(nd2->nd_flags & 5753 MD_MN_NODE_OWN))) { 5754 nd2 = nd2->nd_next; 5755 continue; 5756 } 5757 (void) (clnt_mn_susp_res_io( 5758 nd2->nd_nodename, sp->setno, 5759 MN_RES_IO, &xep)); 5760 nd2 = nd2->nd_next; 5761 } 5762 5763 /* 5764 * If the suspend failed due to an 5765 * RPC failure on another node, return 5766 * a 205. 5767 * Otherwise, exit with failure. 5768 * The return reconfig step will resume 5769 * I/Os for all disksets. 5770 */ 5771 if ((mdanyrpcerror(ep)) && 5772 (sd->sd_mn_mynode->nd_nodeid != 5773 nd->nd_nodeid)) { 5774 return (205); 5775 } else { 5776 return (-1); 5777 } 5778 } 5779 nd = nd->nd_next; 5780 } 5781 } 5782 5783 nd = sd->sd_nodelist; 5784 while (nd) { 5785 /* 5786 * If a node is in the membership list but isn't joined 5787 * to the set, try to join the node. 5788 */ 5789 if ((nd->nd_flags & MD_MN_NODE_ALIVE) && 5790 (!(nd->nd_flags & MD_MN_NODE_OWN))) { 5791 if (clnt_joinset(nd->nd_nodename, sp, 5792 (MNSET_IN_RECONFIG | stale_flag), ep)) { 5793 /* 5794 * If RPC failure to another node 5795 * then exit without attempting anything else. 5796 * (Reconfig return step will resume I/Os 5797 * for all sets.) 5798 */ 5799 if (mdanyrpcerror(ep)) { 5800 mde_perror(ep, ""); 5801 return (205); 5802 } 5803 /* 5804 * STALE and ACCOK failures aren't true 5805 * failures. STALE means that <50% mddbs 5806 * are available. ACCOK means that the 5807 * mediator provided the extra vote. 5808 * If a true failure, then print messasge 5809 * and withdraw node from set in order to 5810 * cleanup from failed join attempt. 
5811 */ 5812 if ((!mdismddberror(ep, MDE_DB_STALE)) && 5813 (!mdismddberror(ep, MDE_DB_ACCOK))) { 5814 mde_perror(ep, 5815 "WARNING: Unable to join node %s " 5816 "to set %s", nd->nd_nodename, 5817 sp->setname); 5818 mdclrerror(ep); 5819 if (clnt_withdrawset(nd->nd_nodename, 5820 sp, &xep)) 5821 mdclrerror(&xep); 5822 nd = nd->nd_next; 5823 continue; 5824 } 5825 } 5826 /* Set owner flag even if STALE or ACCOK */ 5827 nd->nd_flags |= MD_MN_NODE_OWN; 5828 } 5829 nd = nd->nd_next; 5830 } 5831 /* 5832 * Resume I/Os if suspended above. 5833 */ 5834 if (susp_res_flag) { 5835 nd = sd->sd_nodelist; 5836 while (nd) { 5837 /* 5838 * Skip non-alive and non-owner nodes 5839 * (this list doesn't include any of 5840 * the nodes that were joined). 5841 */ 5842 if ((!(nd->nd_flags & MD_MN_NODE_ALIVE)) || 5843 (!(nd->nd_flags & MD_MN_NODE_OWN))) { 5844 nd = nd->nd_next; 5845 continue; 5846 } 5847 if (clnt_mn_susp_res_io(nd->nd_nodename, sp->setno, 5848 MN_RES_IO, ep)) { 5849 mde_perror(ep, dgettext(TEXT_DOMAIN, 5850 "Unable to resume I/O on node %s" 5851 " in set %s"), nd->nd_nodename, 5852 sp->setname); 5853 5854 /* 5855 * If an RPC failure then don't do any 5856 * more RPC calls, since one timeout is enough 5857 * to endure. If RPC failure to another node, 5858 * return 205. If RPC failure to my node, 5859 * return -1. 5860 * (Reconfig return step will resume I/Os 5861 * for all sets.) 5862 * If not an RPC failure, continue resuming the 5863 * rest of the nodes and then return -1. 5864 */ 5865 if (mdanyrpcerror(ep)) { 5866 if (sd->sd_mn_mynode->nd_nodeid == 5867 nd->nd_nodeid) { 5868 return (-1); 5869 } else { 5870 return (205); 5871 } 5872 } 5873 5874 /* 5875 * If not an RPC error, continue resuming rest 5876 * of nodes, ignoring any failures except for 5877 * an RPC failure which constitutes an 5878 * immediate exit. 5879 * Start in middle of list with failing node. 
5880 */ 5881 nd2 = nd->nd_next; 5882 while (nd2) { 5883 /* Skip non-owner nodes */ 5884 if ((!(nd2->nd_flags & 5885 MD_MN_NODE_ALIVE)) || 5886 (!(nd2->nd_flags & 5887 MD_MN_NODE_OWN))) { 5888 nd2 = nd2->nd_next; 5889 continue; 5890 } 5891 (void) (clnt_mn_susp_res_io( 5892 nd2->nd_nodename, sp->setno, 5893 MN_RES_IO, &xep)); 5894 if (mdanyrpcerror(&xep)) { 5895 return (-1); 5896 } 5897 nd2 = nd2->nd_next; 5898 } 5899 } 5900 nd = nd->nd_next; 5901 } 5902 } 5903 5904 nd = sd->sd_nodelist; 5905 while (nd) { 5906 if (!(nd->nd_flags & MD_MN_NODE_OWN)) { 5907 nd = nd->nd_next; 5908 continue; 5909 } 5910 /* 5911 * If 1 node fails - go ahead and update the rest except 5912 * in the case of an RPC failure, fail immediately. 5913 */ 5914 if (clnt_upd_nr_flags(nd->nd_nodename, sp, 5915 sd->sd_nodelist, MD_NR_SET, MNSET_IN_RECONFIG, ep)) { 5916 /* RPC failure to another node */ 5917 if (mdanyrpcerror(ep)) { 5918 return (205); 5919 } 5920 nd = nd->nd_next; 5921 rval = -1; 5922 continue; 5923 } 5924 nd = nd->nd_next; 5925 } 5926 5927 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 5928 "Join of all nodes completed for set %s: %s"), 5929 sp->setname, meta_print_hrtime(gethrtime() - start_time)); 5930 5931 return (rval); 5932 } 5933