1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License, Version 1.0 only 6 * (the "License"). You may not use this file except in compliance 7 * with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or http://www.opensolaris.org/os/licensing. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * Copyright 2005 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 /* 30 * Just in case we're not in a build environment, make sure that 31 * TEXT_DOMAIN gets set to something. 
32 */ 33 #if !defined(TEXT_DOMAIN) 34 #define TEXT_DOMAIN "SYS_TEST" 35 #endif 36 37 /* 38 * Metadevice diskset interfaces 39 */ 40 41 #include "meta_set_prv.h" 42 #include <meta.h> 43 #include <metad.h> 44 #include <mdmn_changelog.h> 45 #include <sys/lvm/md_crc.h> 46 #include <sys/utsname.h> 47 #include <sdssc.h> 48 49 #include <sys/sysevent/eventdefs.h> 50 #include <sys/sysevent/svm.h> 51 extern char *blkname(char *); 52 53 static md_drive_desc * 54 dr2drivedesc( 55 mdsetname_t *sp, 56 side_t sideno, 57 int flags, 58 md_error_t *ep 59 ) 60 { 61 md_set_record *sr; 62 md_drive_record *dr; 63 mddrivename_t *dnp; 64 md_drive_desc *dd_head = NULL; 65 md_set_desc *sd; 66 67 if (flags & MD_BYPASS_DAEMON) { 68 if ((sr = metad_getsetbynum(sp->setno, ep)) == NULL) 69 return (NULL); 70 sd = metaget_setdesc(sp, ep); 71 sideno = getnodeside(mynode(), sd); 72 sp = metafakesetname(sp->setno, sr->sr_setname); 73 } else { 74 if ((sr = getsetbyname(sp->setname, ep)) == NULL) 75 return (NULL); 76 } 77 78 assert(sideno != MD_SIDEWILD); 79 80 /* 81 * WARNING: 82 * The act of getting the dnp from the namespace means that we 83 * will get the devid of the disk as recorded in the namespace. 84 * This devid has the potential to be stale if the disk is being 85 * replaced via a rebind, this means that any code that relies 86 * on any of the dnp information should take the appropriate action 87 * to preserve that information. For example in the rebind code the 88 * devid of the new disk is saved off and then copied back in once 89 * the code that has called this function has completed. 
90 */ 91 for (dr = sr->sr_drivechain; dr != NULL; dr = dr->dr_next) { 92 if ((dnp = metadrivename_withdrkey(sp, sideno, dr->dr_key, 93 flags, ep)) == NULL) { 94 if (!(flags & MD_BYPASS_DAEMON)) 95 free_sr(sr); 96 metafreedrivedesc(&dd_head); 97 return (NULL); 98 } 99 100 (void) metadrivedesc_append(&dd_head, dnp, dr->dr_dbcnt, 101 dr->dr_dbsize, dr->dr_ctime, dr->dr_genid, dr->dr_flags); 102 } 103 104 if (!(flags & MD_BYPASS_DAEMON)) { 105 free_sr(sr); 106 } 107 return (dd_head); 108 } 109 110 static int 111 get_sidenmlist( 112 mdsetname_t *sp, 113 mddrivename_t *dnp, 114 md_error_t *ep 115 ) 116 { 117 md_set_desc *sd; 118 mdsidenames_t *sn, **sn_next; 119 int i; 120 121 if ((sd = metaget_setdesc(sp, ep)) == NULL) 122 return (-1); 123 124 metaflushsidenames(dnp); 125 sn_next = &dnp->side_names; 126 if (MD_MNSET_DESC(sd)) { 127 /* 128 * Only get sidenames for this node since 129 * that is the only side information stored in 130 * the local mddb for a multi-node diskset. 131 */ 132 if (sd->sd_mn_mynode) { 133 sn = Zalloc(sizeof (*sn)); 134 sn->sideno = sd->sd_mn_mynode->nd_nodeid; 135 if ((sn->cname = meta_getnmentbykey(MD_LOCAL_SET, 136 sn->sideno, dnp->side_names_key, &sn->dname, 137 &sn->mnum, NULL, ep)) == NULL) { 138 if (sn->dname != NULL) 139 Free(sn->dname); 140 Free(sn); 141 return (-1); 142 } 143 144 /* Add to the end of the linked list */ 145 assert(*sn_next == NULL); 146 *sn_next = sn; 147 sn_next = &sn->next; 148 } 149 } else { 150 for (i = 0; i < MD_MAXSIDES; i++) { 151 /* Skip empty slots */ 152 if (sd->sd_nodes[i][0] == '\0') 153 continue; 154 155 sn = Zalloc(sizeof (*sn)); 156 sn->sideno = i; 157 if ((sn->cname = meta_getnmentbykey(MD_LOCAL_SET, 158 i+SKEW, dnp->side_names_key, &sn->dname, 159 &sn->mnum, NULL, ep)) == NULL) { 160 /* 161 * It is possible that during the add of a 162 * host to have a 'missing' side as the side 163 * for this disk will be added later. So ignore 164 * the error. 
The 'missing' side will be added 165 * once the addhosts process has completed. 166 */ 167 if (mdissyserror(ep, ENOENT)) { 168 mdclrerror(ep); 169 Free(sn); 170 continue; 171 } 172 173 if (sn->dname != NULL) 174 Free(sn->dname); 175 Free(sn); 176 return (-1); 177 } 178 179 /* Add to the end of the linked list */ 180 assert(*sn_next == NULL); 181 *sn_next = sn; 182 sn_next = &sn->next; 183 } 184 } 185 186 return (0); 187 } 188 189 static md_drive_desc * 190 rl_to_dd( 191 mdsetname_t *sp, 192 md_replicalist_t *rlp, 193 md_error_t *ep 194 ) 195 { 196 md_replicalist_t *rl; 197 md_replica_t *r; 198 md_drive_desc *dd = NULL; 199 md_drive_desc *d; 200 int found; 201 md_set_desc *sd; 202 daddr_t nblks = 0; 203 204 if ((sd = metaget_setdesc(sp, ep)) == NULL) 205 return (NULL); 206 207 /* find the smallest existing replica */ 208 for (rl = rlp; rl != NULL; rl = rl->rl_next) { 209 r = rl->rl_repp; 210 nblks = ((nblks == 0) ? r->r_nblk : min(r->r_nblk, nblks)); 211 } 212 213 if (nblks <= 0) 214 nblks = (MD_MNSET_DESC(sd)) ? MD_MN_DBSIZE : MD_DBSIZE; 215 216 for (rl = rlp; rl != NULL; rl = rl->rl_next) { 217 r = rl->rl_repp; 218 219 found = 0; 220 for (d = dd; d != NULL; d = d->dd_next) { 221 if (strcmp(r->r_namep->drivenamep->cname, 222 d->dd_dnp->cname) == 0) { 223 found = 1; 224 dd->dd_dbcnt++; 225 break; 226 } 227 } 228 229 if (! 
found) 230 (void) metadrivedesc_append(&dd, r->r_namep->drivenamep, 231 1, nblks, sd->sd_ctime, sd->sd_genid, MD_DR_OK); 232 } 233 234 return (dd); 235 } 236 237 /* 238 * Exported Entry Points 239 */ 240 241 set_t 242 get_max_sets(md_error_t *ep) 243 { 244 245 static set_t max_sets = 0; 246 247 if (max_sets == 0) 248 if (metaioctl(MD_IOCGETNSET, &max_sets, ep, NULL) != 0) 249 return (0); 250 251 return (max_sets); 252 } 253 254 int 255 get_max_meds(md_error_t *ep) 256 { 257 static int max_meds = 0; 258 259 if (max_meds == 0) 260 if (metaioctl(MD_MED_GET_NMED, &max_meds, ep, NULL) != 0) 261 return (0); 262 263 return (max_meds); 264 } 265 266 side_t 267 getmyside(mdsetname_t *sp, md_error_t *ep) 268 { 269 md_set_desc *sd; 270 char *node = NULL; 271 side_t sideno; 272 273 if (sp->setno == 0) 274 return (0); 275 276 if ((sd = metaget_setdesc(sp, ep)) == NULL) 277 return (MD_SIDEWILD); 278 279 node = mynode(); 280 281 assert(node != NULL); 282 283 sideno = getnodeside(node, sd); 284 285 if (sideno != MD_SIDEWILD) 286 return (sideno); 287 288 return (mddserror(ep, MDE_DS_HOSTNOSIDE, sp->setno, node, NULL, node)); 289 } 290 291 /* 292 * get set info from name 293 */ 294 md_set_record * 295 getsetbyname(char *setname, md_error_t *ep) 296 { 297 md_set_record *sr = NULL; 298 md_mnset_record *mnsr = NULL; 299 char *p; 300 size_t len; 301 302 /* get set info from daemon */ 303 if (clnt_getset(mynode(), setname, MD_SET_BAD, &sr, ep) == -1) 304 return (NULL); 305 if (sr != NULL) { 306 /* 307 * Returned record could be for a multi-node set or a 308 * non-multi-node set. 309 */ 310 if (MD_MNSET_REC(sr)) { 311 /* 312 * Record is for a multi-node set. Reissue call 313 * to get mnset information. Need to free 314 * record as if a non-multi-node set record since 315 * that is what clnt_getset gave us. If in 316 * the daemon, don't free since this is a pointer 317 * into the setrecords array. 318 */ 319 if (! 
md_in_daemon) { 320 sr->sr_flags &= ~MD_SR_MN; 321 free_sr(sr); 322 } 323 if (clnt_mngetset(mynode(), setname, MD_SET_BAD, &mnsr, 324 ep) == -1) 325 return (NULL); 326 if (mnsr != NULL) 327 return ((struct md_set_record *)mnsr); 328 } else { 329 return (sr); 330 } 331 } 332 333 /* no such set */ 334 len = strlen(setname) + 30; 335 p = Malloc(len); 336 (void) snprintf(p, len, "setname \"%s\"", setname); 337 (void) mderror(ep, MDE_NO_SET, p); 338 Free(p); 339 return (NULL); 340 } 341 342 /* 343 * get set info from number 344 */ 345 md_set_record * 346 getsetbynum(set_t setno, md_error_t *ep) 347 { 348 md_set_record *sr; 349 md_mnset_record *mnsr = NULL; 350 char buf[100]; 351 352 if (clnt_getset(mynode(), NULL, setno, &sr, ep) == -1) 353 return (NULL); 354 355 if (sr != NULL) { 356 /* 357 * Record is for a multi-node set. Reissue call 358 * to get mnset information. Need to free 359 * record as if a non-multi-node set record since 360 * that is what clnt_getset gave us. If in 361 * the daemon, don't free since this is a pointer 362 * into the setrecords array. 363 */ 364 if (MD_MNSET_REC(sr)) { 365 /* 366 * Record is for a multi-node set. Reissue call 367 * to get mnset information. 368 */ 369 if (! 
md_in_daemon) { 370 sr->sr_flags &= ~MD_SR_MN; 371 free_sr(sr); 372 } 373 if (clnt_mngetset(mynode(), NULL, setno, &mnsr, 374 ep) == -1) 375 return (NULL); 376 if (mnsr != NULL) 377 return ((struct md_set_record *)mnsr); 378 } else { 379 return (sr); 380 } 381 } 382 383 (void) sprintf(buf, "setno %u", setno); 384 (void) mderror(ep, MDE_NO_SET, buf); 385 return (NULL); 386 } 387 388 int 389 meta_check_drive_inuse( 390 mdsetname_t *sp, 391 mddrivename_t *dnp, 392 int check_db, 393 md_error_t *ep 394 ) 395 { 396 mdnamelist_t *nlp = NULL; 397 mdnamelist_t *p; 398 int rval = 0; 399 400 /* get all underlying partitions */ 401 if (meta_getalldevs(sp, &nlp, check_db, ep) != 0) 402 return (-1); 403 404 /* search for drive */ 405 for (p = nlp; (p != NULL); p = p->next) { 406 mdname_t *np = p->namep; 407 408 if (strcmp(dnp->cname, np->drivenamep->cname) == 0) { 409 rval = (mddserror(ep, MDE_DS_DRIVEINUSE, sp->setno, 410 NULL, dnp->cname, sp->setname)); 411 break; 412 } 413 } 414 415 /* cleanup, return success */ 416 metafreenamelist(nlp); 417 return (rval); 418 } 419 420 /* 421 * simple check for ownership 422 */ 423 int 424 meta_check_ownership(mdsetname_t *sp, md_error_t *ep) 425 { 426 int ownset; 427 md_set_desc *sd; 428 md_drive_desc *dd; 429 md_replicalist_t *rlp = NULL; 430 md_error_t xep = mdnullerror; 431 432 if (metaislocalset(sp)) 433 return (0); 434 435 ownset = own_set(sp, NULL, TRUE, ep); 436 if (! mdisok(ep)) 437 return (-1); 438 439 if ((sd = metaget_setdesc(sp, ep)) == NULL) 440 return (-1); 441 442 dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST), ep); 443 if (! 
mdisok(ep)) 444 return (-1); 445 446 /* If we have no drive descriptors, check for no ownership */ 447 if (dd == NULL) { 448 if (ownset == MD_SETOWNER_NONE) 449 return (0); 450 451 /* If ownership somehow has come to exist, we must clean up */ 452 453 if (metareplicalist(sp, (MD_BASICNAME_OK | PRINT_FAST), &rlp, 454 &xep) < 0) 455 mdclrerror(&xep); 456 457 if ((dd = rl_to_dd(sp, rlp, &xep)) == NULL) 458 if (! mdisok(&xep)) 459 mdclrerror(&xep); 460 461 if (!(MD_MNSET_DESC(sd)) && !MD_ATSET_DESC(sd)) { 462 if (rel_own_bydd(sp, dd, TRUE, &xep)) 463 mdclrerror(&xep); 464 } 465 466 if (halt_set(sp, &xep)) 467 mdclrerror(&xep); 468 469 metafreereplicalist(rlp); 470 471 metafreedrivedesc(&dd); 472 473 return (0); 474 } 475 476 metafreedrivedesc(&sd->sd_drvs); 477 478 if (ownset == MD_SETOWNER_YES) 479 return (0); 480 481 return (mddserror(ep, MDE_DS_NOOWNER, sp->setno, NULL, NULL, 482 sp->setname)); 483 } 484 485 /* 486 * simple check for ownership 487 */ 488 int 489 meta_check_ownership_on_host(mdsetname_t *sp, char *hostname, md_error_t *ep) 490 { 491 md_set_desc *sd; 492 md_drive_desc *dd; 493 int bool; 494 495 if (metaislocalset(sp)) 496 return (0); 497 498 if ((sd = metaget_setdesc(sp, ep)) == NULL) 499 return (-1); 500 501 if (getnodeside(hostname, sd) == MD_SIDEWILD) 502 return (mddserror(ep, MDE_DS_NODENOTINSET, sp->setno, 503 hostname, NULL, sp->setname)); 504 505 dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST), ep); 506 if (! mdisok(ep)) 507 return (-1); 508 509 if (clnt_ownset(hostname, sp, &bool, ep) == -1) 510 return (-1); 511 512 if (dd == NULL) 513 return (0); 514 515 metafreedrivedesc(&sd->sd_drvs); 516 517 if (bool == TRUE) 518 return (0); 519 520 return (mddserror(ep, MDE_DS_NODEISNOTOWNER, sp->setno, hostname, NULL, 521 sp->setname)); 522 } 523 524 /* 525 * Function that determines if a node is in the multinode diskset 526 * membership list. Calling node passes in node to be checked and 527 * the nodelist as returned from meta_read_nodelist. 
This routine 528 * anticipates being called many times using the same diskset membership 529 * list which is why the alloc and free of the diskset membership list 530 * is left to the calling routine. 531 * Returns: 532 * 1 - if a member 533 * 0 - not a member 534 */ 535 int 536 meta_is_member( 537 char *node_name, 538 md_mn_nodeid_t node_id, 539 mndiskset_membershiplist_t *nl 540 ) 541 { 542 mndiskset_membershiplist_t *nl2; 543 int flag_check_name; 544 545 if (node_id != 0) 546 flag_check_name = 0; 547 else if (node_name != NULL) 548 flag_check_name = 1; 549 else 550 return (0); 551 552 nl2 = nl; 553 while (nl2) { 554 if (flag_check_name) { 555 /* Compare given name against name in member list */ 556 if (strcmp(nl2->msl_node_name, node_name) == 0) 557 break; 558 } else { 559 /* Compare given nodeid against nodeid in member list */ 560 if (nl2->msl_node_id == node_id) 561 break; 562 } 563 nl2 = nl2->next; 564 } 565 /* No match found in member list */ 566 if (nl2 == NULL) { 567 return (0); 568 } 569 /* Return 1 if node is in member list */ 570 return (1); 571 } 572 573 /* 574 * meta_getnext_devinfo should go to the host that 575 * has the device, to return the device name, driver name, minor num. 576 * We can take the big cheat for now, since it is a requirement 577 * that the device names and device numbers are the same, and 578 * just get the info locally. 579 * 580 * This routine is very similar to meta_getnextside_devinfo except 581 * that the specific side to be used is being passed in. 
582 * 583 * Exit status: 584 * 0 - No more side info to return 585 * 1 - More side info's to return 586 * -1 - An error has been detected 587 */ 588 /*ARGSUSED*/ 589 int 590 meta_getside_devinfo( 591 mdsetname_t *sp, /* for this set */ 592 char *bname, /* local block name (myside) */ 593 side_t sideno, /* sideno */ 594 char **ret_bname, /* block device name of returned side */ 595 char **ret_dname, /* driver name of returned side */ 596 minor_t *ret_mnum, /* minor number of returned side */ 597 md_error_t *ep 598 ) 599 { 600 mdname_t *np; 601 602 if (ret_bname != NULL) 603 *ret_bname = NULL; 604 if (ret_dname != NULL) 605 *ret_dname = NULL; 606 if (ret_mnum != NULL) 607 *ret_mnum = NODEV32; 608 609 610 if ((np = metaname(&sp, bname, ep)) == NULL) 611 return (-1); 612 613 /* 614 * NOTE (future) - There will be more work here once devids are integrated 615 * into disksets. Then the side should be used to find the correct 616 * host and the b/d names should be gotten from that host. 617 */ 618 619 /* 620 * Return the side info. 621 */ 622 if (ret_bname != NULL) 623 *ret_bname = Strdup(np->bname); 624 625 if (ret_dname != NULL) { 626 mdcinfo_t *cinfo; 627 628 if ((cinfo = metagetcinfo(np, ep)) == NULL) 629 return (-1); 630 631 *ret_dname = Strdup(cinfo->dname); 632 } 633 634 if (ret_mnum != NULL) 635 *ret_mnum = meta_getminor(np->dev); 636 637 return (1); 638 } 639 640 /* 641 * Get the information on the device from the remote node using the devid 642 * of the disk. 
643 * 644 * Exit status: 645 * 0 - No more side info to return 646 * 1 - More side info's to return 647 * -1 - An error has been detected 648 */ 649 int 650 meta_getnextside_devinfo( 651 mdsetname_t *sp, /* for this set */ 652 char *bname, /* local block name (myside) */ 653 side_t *sideno, /* previous sideno & returned sideno */ 654 char **ret_bname, /* block device name of returned side */ 655 char **ret_dname, /* driver name of returned side */ 656 minor_t *ret_mnum, /* minor number of returned side */ 657 md_error_t *ep 658 ) 659 { 660 md_set_desc *sd; 661 int i; 662 mdname_t *np; 663 mddrivename_t *dnp; 664 char *devidstr = NULL; 665 int devidstrlen; 666 md_dev64_t retdev = NODEV64; 667 char *ret_devname = NULL; 668 char *ret_blkdevname = NULL; 669 char *ret_driver = NULL; 670 char *nodename; 671 int fd; 672 int ret = -1; 673 char *minor_name = NULL; 674 md_mnnode_desc *nd; 675 676 677 if (ret_bname != NULL) 678 *ret_bname = NULL; 679 if (ret_dname != NULL) 680 *ret_dname = NULL; 681 if (ret_mnum != NULL) 682 *ret_mnum = NODEV32; 683 684 if (metaislocalset(sp)) { 685 /* no more sides - we are done */ 686 if (*sideno != MD_SIDEWILD) 687 return (0); 688 689 /* First time through - set up return sideno */ 690 *sideno = 0; 691 } else { 692 693 /* 694 * Find the next sideno, starting after the one given. 695 */ 696 if ((sd = metaget_setdesc(sp, ep)) == NULL) 697 return (-1); 698 699 if (MD_MNSET_DESC(sd)) { 700 nd = sd->sd_nodelist; 701 if ((*sideno == MD_SIDEWILD) && 702 (nd != (struct md_mnnode_desc *)NULL)) { 703 *sideno = nd->nd_nodeid; 704 } else { 705 while (nd) { 706 /* 707 * Found given sideno, now find 708 * next sideno, if there is one. 
709 */ 710 if ((*sideno == nd->nd_nodeid) && 711 (nd->nd_next != 712 (struct md_mnnode_desc *)NULL)) { 713 *sideno = 714 nd->nd_next->nd_nodeid; 715 break; 716 } 717 nd = nd->nd_next; 718 } 719 if (nd == NULL) { 720 return (0); 721 } 722 } 723 if (*sideno == MD_SIDEWILD) 724 return (0); 725 } else { 726 for (i = (*sideno)+1; i < MD_MAXSIDES; i++) 727 /* Find next full slot */ 728 if (sd->sd_nodes[i][0] != '\0') 729 break; 730 731 /* No more sides - we are done */ 732 if (i == MD_MAXSIDES) 733 return (0); 734 735 /* Set up the return sideno */ 736 *sideno = i; 737 nodename = (char *)sd->sd_nodes[i]; 738 } 739 } 740 741 /* 742 * Need to pass the node the devid of the disk and get it to 743 * send back the details of the disk from that side. 744 */ 745 if ((np = metaname(&sp, bname, ep)) == NULL) 746 return (-1); 747 748 dnp = np->drivenamep; 749 750 /* 751 * By default, set up the parameters so that they are copied out. 752 */ 753 if (ret_bname != NULL) 754 *ret_bname = Strdup(np->bname); 755 756 if (ret_dname != NULL) { 757 mdcinfo_t *cinfo; 758 759 if ((cinfo = metagetcinfo(np, ep)) == NULL) 760 return (-1); 761 762 *ret_dname = Strdup(cinfo->dname); 763 } 764 765 if (ret_mnum != NULL) 766 *ret_mnum = meta_getminor(np->dev); 767 768 /* 769 * Try some optimization. If this is the local set or the device 770 * is a metadevice then just copy the information. If the device 771 * does not have a devid (due to not having a minor name) then 772 * fall back to the pre-devid behaviour of copying the information 773 * on the device: this is okay because the sanity checks before this 774 * call would have found any issues with the device. If it's a 775 * multi-node diskset also just return ie. copy. 776 */ 777 if (metaislocalset(sp) || metaismeta(np) || (dnp->devid == NULL) || 778 (MD_MNSET_DESC(sd))) 779 return (1); 780 781 if (np->minor_name == (char *)NULL) { 782 /* 783 * Have to get the minor name then. 
The slice should exist 784 * on the disk because it will have already been repartitioned 785 * up prior to getting to this point. 786 */ 787 if ((fd = open(np->bname, (O_RDONLY|O_NDELAY), 0)) < 0) { 788 (void) mdsyserror(ep, errno, np->bname); 789 return (-1); 790 } 791 (void) devid_get_minor_name(fd, &minor_name); 792 np->minor_name = Strdup(minor_name); 793 devid_str_free(minor_name); 794 (void) close(fd); 795 } 796 797 /* allocate extra space for "/" and NULL hence +2 */ 798 devidstrlen = strlen(dnp->devid) + strlen(np->minor_name) + 2; 799 devidstr = (char *)Malloc(devidstrlen); 800 801 /* 802 * As a minor name is supplied then the ret_devname will be 803 * appropriate to that minor_name and in this case it will be 804 * a block device ie /dev/dsk. 805 */ 806 (void) snprintf(devidstr, devidstrlen, 807 "%s/%s", dnp->devid, np->minor_name); 808 809 ret = clnt_devinfo_by_devid(nodename, sp, devidstr, &retdev, 810 np->bname, &ret_devname, &ret_driver, ep); 811 812 Free(devidstr); 813 814 /* 815 * If the other side is not running device id in disksets, 816 * 'ret' is set to ENOTSUP in which case we fallback to 817 * the existing behaviour 818 */ 819 if (ret == ENOTSUP) 820 return (1); 821 else if (ret == -1) 822 return (-1); 823 824 /* 825 * ret_devname comes from the rpc call and is a 826 * raw device name. We need to make this into a 827 * block device via blkname for further processing. 828 * Unfortunately, when our device id isn't found in 829 * the system, the rpc call will return a " " in 830 * ret_devname in which case we need to fill that in 831 * as ret_blkname because blkname of " " returns NULL. 
832 */ 833 if (ret_bname != NULL && ret_devname != NULL) { 834 ret_blkdevname = blkname(ret_devname); 835 if (ret_blkdevname == NULL) 836 *ret_bname = Strdup(ret_devname); 837 else 838 *ret_bname = Strdup(ret_blkdevname); 839 } 840 841 if (ret_dname != NULL && ret_driver != NULL) 842 *ret_dname = Strdup(ret_driver); 843 844 if (ret_mnum != NULL) 845 *ret_mnum = meta_getminor(retdev); 846 847 return (1); 848 } 849 850 int 851 meta_is_drive_in_anyset( 852 mddrivename_t *dnp, 853 mdsetname_t **spp, 854 int bypass_daemon, 855 md_error_t *ep 856 ) 857 { 858 set_t setno; 859 mdsetname_t *this_sp; 860 int is_it; 861 set_t max_sets; 862 863 if ((max_sets = get_max_sets(ep)) == 0) 864 return (-1); 865 866 assert(spp != NULL); 867 *spp = NULL; 868 869 for (setno = 1; setno < max_sets; setno++) { 870 if (!bypass_daemon) { 871 if ((this_sp = metasetnosetname(setno, ep)) == NULL) { 872 if (mdismddberror(ep, MDE_DB_NODB)) { 873 mdclrerror(ep); 874 return (0); 875 } 876 if (mdiserror(ep, MDE_NO_SET)) { 877 mdclrerror(ep); 878 continue; 879 } 880 return (-1); 881 } 882 } else 883 this_sp = metafakesetname(setno, NULL); 884 885 if ((is_it = meta_is_drive_in_thisset(this_sp, dnp, 886 bypass_daemon, ep)) == -1) { 887 if (mdiserror(ep, MDE_NO_SET)) { 888 mdclrerror(ep); 889 continue; 890 } 891 return (-1); 892 } 893 if (is_it) { 894 *spp = this_sp; 895 return (0); 896 } 897 } 898 return (0); 899 } 900 901 int 902 meta_is_drive_in_thisset( 903 mdsetname_t *sp, 904 mddrivename_t *dnp, 905 int bypass_daemon, 906 md_error_t *ep 907 ) 908 { 909 md_drive_desc *dd, *p; 910 911 if (bypass_daemon) 912 dd = dr2drivedesc(sp, MD_SIDEWILD, 913 (MD_BASICNAME_OK | MD_BYPASS_DAEMON), ep); 914 else 915 dd = metaget_drivedesc(sp, MD_BASICNAME_OK, ep); 916 917 if (dd == NULL) { 918 if (! 
mdisok(ep)) 919 return (-1); 920 return (0); 921 } 922 923 924 for (p = dd; p != NULL; p = p->dd_next) 925 if (strcmp(p->dd_dnp->cname, dnp->cname) == 0) 926 return (1); 927 return (0); 928 } 929 930 int 931 meta_set_balance( 932 mdsetname_t *sp, 933 md_error_t *ep 934 ) 935 { 936 md_set_desc *sd; 937 md_drive_desc *dd, *curdd; 938 daddr_t dbsize; 939 daddr_t nblks; 940 int i; 941 int rval = 0; 942 sigset_t oldsigs; 943 md_setkey_t *cl_sk; 944 md_error_t xep = mdnullerror; 945 md_mnnode_desc *nd; 946 int suspend1_flag = 0; 947 948 if ((sd = metaget_setdesc(sp, ep)) == NULL) 949 return (-1); 950 951 dbsize = (MD_MNSET_DESC(sd)) ? MD_MN_DBSIZE : MD_DBSIZE; 952 953 /* Make sure we own the set */ 954 if (meta_check_ownership(sp, ep) != 0) 955 return (-1); 956 957 /* END CHECK CODE */ 958 959 /* 960 * Get drive descriptors for the drives that are currently in the set. 961 */ 962 curdd = metaget_drivedesc(sp, MD_FULLNAME_ONLY, ep); 963 964 if (! mdisok(ep)) 965 return (-1); 966 967 /* Find the minimum replica size in use is or use the default */ 968 if ((nblks = meta_db_minreplica(sp, ep)) < 0) 969 mdclrerror(ep); 970 else 971 dbsize = nblks; /* adjust replica size */ 972 973 /* Make sure we are blocking all signals */ 974 if (procsigs(TRUE, &oldsigs, &xep) < 0) 975 mdclrerror(&xep); 976 977 /* 978 * Lock the set on current set members. 979 * For MN diskset lock_set and SUSPEND are used to protect against 980 * other meta* commands running on the other nodes. 981 */ 982 if (MD_MNSET_DESC(sd)) { 983 nd = sd->sd_nodelist; 984 while (nd) { 985 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 986 nd = nd->nd_next; 987 continue; 988 } 989 if (clnt_lock_set(nd->nd_nodename, sp, ep)) { 990 rval = -1; 991 goto out; 992 } 993 nd = nd->nd_next; 994 } 995 /* 996 * Lock out other meta* commands by suspending 997 * class 1 messages across the diskset. 
998 */ 999 nd = sd->sd_nodelist; 1000 while (nd) { 1001 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 1002 nd = nd->nd_next; 1003 continue; 1004 } 1005 if (clnt_mdcommdctl(nd->nd_nodename, 1006 COMMDCTL_SUSPEND, sp, MD_MSG_CLASS1, 1007 MD_MSCF_NO_FLAGS, ep)) { 1008 rval = -1; 1009 goto out; 1010 } 1011 suspend1_flag = 1; 1012 nd = nd->nd_next; 1013 } 1014 } else { 1015 for (i = 0; i < MD_MAXSIDES; i++) { 1016 /* Skip empty slots */ 1017 if (sd->sd_nodes[i][0] == '\0') continue; 1018 1019 if (clnt_lock_set(sd->sd_nodes[i], sp, ep)) { 1020 rval = -1; 1021 goto out; 1022 } 1023 } 1024 } 1025 1026 /* We are not adding or deleting any drives, just balancing */ 1027 dd = NULL; 1028 1029 /* 1030 * Balance the DB's according to the list of existing drives and the 1031 * list of added drives. 1032 */ 1033 if ((rval = meta_db_balance(sp, dd, curdd, dbsize, ep)) == -1) 1034 goto out; 1035 1036 out: 1037 /* 1038 * Unlock diskset by resuming class 1 messages across the diskset. 1039 * Just resume all classes so that resume is the same whether 1040 * just one class was locked or all classes were locked. 1041 */ 1042 if (suspend1_flag) { 1043 nd = sd->sd_nodelist; 1044 while (nd) { 1045 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 1046 nd = nd->nd_next; 1047 continue; 1048 } 1049 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME, 1050 sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) { 1051 /* 1052 * We are here because we failed to resume 1053 * rpc.mdcommd. However we potentially have 1054 * an error from the previous call 1055 * (meta_db_balance). If the previous call 1056 * did fail, we capture that error and 1057 * generate a perror withthe string, 1058 * "Unable to resume...". 1059 * Setting rval to -1 ensures that in the 1060 * next iteration of the loop, ep is not 1061 * clobbered. 
1062 */ 1063 if (rval == 0) 1064 (void) mdstealerror(ep, &xep); 1065 else 1066 mdclrerror(&xep); 1067 rval = -1; 1068 mde_perror(ep, dgettext(TEXT_DOMAIN, 1069 "Unable to resume rpc.mdcommd.")); 1070 } 1071 nd = nd->nd_next; 1072 } 1073 } 1074 1075 /* Unlock the set */ 1076 cl_sk = cl_get_setkey(sp->setno, sp->setname); 1077 if (MD_MNSET_DESC(sd)) { 1078 nd = sd->sd_nodelist; 1079 while (nd) { 1080 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 1081 nd = nd->nd_next; 1082 continue; 1083 } 1084 if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) { 1085 if (rval == 0) 1086 (void) mdstealerror(ep, &xep); 1087 else 1088 mdclrerror(&xep); 1089 rval = -1; 1090 } 1091 nd = nd->nd_next; 1092 } 1093 } else { 1094 for (i = 0; i < MD_MAXSIDES; i++) { 1095 /* Skip empty slots */ 1096 if (sd->sd_nodes[i][0] == '\0') 1097 continue; 1098 1099 if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, &xep)) { 1100 if (rval == 0) 1101 (void) mdstealerror(ep, &xep); 1102 rval = -1; 1103 } 1104 } 1105 } 1106 1107 /* release signals back to what they were on entry */ 1108 if (procsigs(FALSE, &oldsigs, &xep) < 0) 1109 mdclrerror(&xep); 1110 1111 cl_set_setkey(NULL); 1112 1113 metaflushsetname(sp); 1114 1115 return (rval); 1116 } 1117 1118 int 1119 meta_set_destroy( 1120 mdsetname_t *sp, 1121 int lock_set, 1122 md_error_t *ep 1123 ) 1124 { 1125 int i; 1126 med_rec_t medr; 1127 md_set_desc *sd; 1128 md_drive_desc *dd, *p, *p1; 1129 mddrivename_t *dnp; 1130 mdname_t *np; 1131 mdnamelist_t *nlp = NULL; 1132 int num_users = 0; 1133 int has_set; 1134 side_t mysideno; 1135 sigset_t oldsigs; 1136 md_error_t xep = mdnullerror; 1137 md_setkey_t *cl_sk; 1138 int rval = 0; 1139 int delete_end = 1; 1140 1141 /* Make sure we are blocking all signals */ 1142 if (procsigs(TRUE, &oldsigs, ep) < 0) 1143 return (-1); 1144 1145 if ((sd = metaget_setdesc(sp, ep)) == NULL) { 1146 if (! mdisok(ep)) 1147 rval = -1; 1148 goto out; 1149 } 1150 1151 /* 1152 * meta_set_destroy should not be called for a MN diskset. 
1153 * This routine destroys a set without communicating this information 1154 * to the other nodes which would lead to an inconsistency in 1155 * the MN diskset. 1156 */ 1157 if (MD_MNSET_DESC(sd)) { 1158 rval = -1; 1159 goto out; 1160 } 1161 1162 /* Continue if a traditional diskset */ 1163 1164 /* 1165 * Check to see who has the set. If we are not the last user of the 1166 * set, we will not touch the replicas. 1167 */ 1168 for (i = 0; i < MD_MAXSIDES; i++) { 1169 /* Skip empty slots */ 1170 if (sd->sd_nodes[i][0] == '\0') 1171 continue; 1172 1173 has_set = nodehasset(sp, sd->sd_nodes[i], NHS_NST_EQ, 1174 ep); 1175 1176 if (has_set < 0) { 1177 mdclrerror(ep); 1178 } else 1179 num_users++; 1180 } 1181 1182 if ((dd = metaget_drivedesc(sp, MD_BASICNAME_OK, ep)) == NULL) { 1183 if (! mdisok(ep)) { 1184 rval = -1; 1185 goto out; 1186 } 1187 } 1188 1189 if (setup_db_bydd(sp, dd, TRUE, ep) == -1) { 1190 rval = -1; 1191 goto out; 1192 } 1193 1194 if (lock_set == TRUE) { 1195 /* Lock the set on our side */ 1196 if (clnt_lock_set(mynode(), sp, ep)) { 1197 rval = -1; 1198 goto out; 1199 } 1200 } 1201 1202 /* 1203 * A traditional diskset has no diskset stale information to send 1204 * since there can only be one owner node at a time. 1205 */ 1206 if (snarf_set(sp, FALSE, ep)) 1207 mdclrerror(ep); 1208 1209 if (dd != NULL) { 1210 /* 1211 * Make sure that no drives are in use as parts of metadrives 1212 * or hot spare pools, this is one of the few error conditions 1213 * that will stop this routine, unless the environment has 1214 * META_DESTROY_SET_OK set, in which case, the operation will 1215 * proceed. 
1216 */ 1217 if (getenv("META_DESTROY_SET_OK") == NULL) { 1218 for (p = dd; p != NULL; p = p->dd_next) { 1219 dnp = p->dd_dnp; 1220 1221 i = meta_check_drive_inuse(sp, dnp, FALSE, ep); 1222 if (i == -1) { 1223 /* need xep - wire calls clear error */ 1224 i = metaget_setownership(sp, &xep); 1225 if (i == -1) { 1226 rval = -1; 1227 goto out; 1228 } 1229 1230 mysideno = getmyside(sp, &xep); 1231 1232 if (mysideno == MD_SIDEWILD) { 1233 rval = -1; 1234 goto out; 1235 } 1236 1237 if (sd->sd_isown[mysideno] == FALSE) 1238 if (halt_set(sp, &xep)) { 1239 rval = -1; 1240 goto out; 1241 } 1242 1243 rval = -1; 1244 goto out; 1245 } 1246 } 1247 } 1248 1249 for (i = 0; i < MD_MAXSIDES; i++) { 1250 /* Skip empty slots */ 1251 if (sd->sd_nodes[i][0] == '\0') 1252 continue; 1253 1254 /* Skip non local nodes */ 1255 if (strcmp(mynode(), sd->sd_nodes[i]) != 0) 1256 continue; 1257 1258 if (clnt_deldrvs(sd->sd_nodes[i], sp, dd, ep)) 1259 mdclrerror(ep); 1260 } 1261 1262 /* 1263 * Go thru each drive and individually delete the replicas. 1264 * This way we can ignore individual errors. 1265 */ 1266 for (p = dd; p != NULL; p = p->dd_next) { 1267 uint_t rep_slice; 1268 1269 dnp = p->dd_dnp; 1270 if ((meta_replicaslice(dnp, &rep_slice, ep) != 0) || 1271 (((np = metaslicename(dnp, rep_slice, ep)) 1272 == NULL) && 1273 ((np = metaslicename(dnp, MD_SLICE0, ep)) 1274 == NULL))) { 1275 rval = -1; 1276 goto out; 1277 } 1278 1279 if ((np = metaslicename(dnp, 1280 rep_slice, ep)) == NULL) { 1281 if ((np = metaslicename(dnp, 1282 MD_SLICE0, ep)) == NULL) { 1283 rval = -1; 1284 goto out; 1285 } 1286 mdclrerror(ep); 1287 } 1288 1289 /* Yes this is UGLY!!! 
*/ 1290 p1 = p->dd_next; 1291 p->dd_next = NULL; 1292 if (rel_own_bydd(sp, p, FALSE, ep)) 1293 mdclrerror(ep); 1294 p->dd_next = p1; 1295 1296 if (p->dd_dbcnt == 0) 1297 continue; 1298 1299 /* 1300 * Skip the replica removal if we are not the last user 1301 */ 1302 if (num_users != 1) 1303 continue; 1304 1305 nlp = NULL; 1306 (void) metanamelist_append(&nlp, np); 1307 if (meta_db_detach(sp, nlp, 1308 (MDFORCE_DS | MDFORCE_SET_LOCKED), NULL, ep)) 1309 mdclrerror(ep); 1310 metafreenamelist(nlp); 1311 } 1312 } 1313 1314 if (halt_set(sp, ep)) { 1315 rval = -1; 1316 goto out; 1317 } 1318 1319 /* Setup the mediator record */ 1320 (void) memset(&medr, '\0', sizeof (med_rec_t)); 1321 medr.med_rec_mag = MED_REC_MAGIC; 1322 medr.med_rec_rev = MED_REC_REV; 1323 medr.med_rec_fl = 0; 1324 medr.med_rec_sn = sp->setno; 1325 (void) strcpy(medr.med_rec_snm, sp->setname); 1326 medr.med_rec_meds = sd->sd_med; /* structure assigment */ 1327 (void) memset(&medr.med_rec_data, '\0', sizeof (med_data_t)); 1328 medr.med_rec_foff = 0; 1329 1330 /* 1331 * If we are the last remaining user, then remove the mediator hosts 1332 */ 1333 if (num_users == 1) { 1334 for (i = 0; i < MED_MAX_HOSTS; i++) { 1335 if (medr.med_rec_meds.n_lst[i].a_cnt != 0) 1336 SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_REMOVE, 1337 SVM_TAG_MEDIATOR, sp->setno, i); 1338 (void) memset(&medr.med_rec_meds.n_lst[i], '\0', 1339 sizeof (md_h_t)); 1340 } 1341 medr.med_rec_meds.n_cnt = 0; 1342 } else { /* Remove this host from the mediator node list. 
*/ 1343 for (i = 0; i < MD_MAXSIDES; i++) { 1344 /* Skip empty slots */ 1345 if (sd->sd_nodes[i][0] == '\0') 1346 continue; 1347 1348 /* Copy non local node */ 1349 if (strcmp(mynode(), sd->sd_nodes[i]) != 0) { 1350 (void) strcpy(medr.med_rec_nodes[i], 1351 sd->sd_nodes[i]); 1352 continue; 1353 } 1354 1355 /* Clear local node */ 1356 (void) memset(&medr.med_rec_nodes[i], '\0', 1357 sizeof (md_node_nm_t)); 1358 } 1359 } 1360 1361 crcgen(&medr, &medr.med_rec_cks, sizeof (med_rec_t), NULL); 1362 1363 /* 1364 * If the client is part of a cluster put the DCS service 1365 * into a deleteing state. 1366 */ 1367 if (sdssc_delete_begin(sp->setname) == SDSSC_ERROR) { 1368 if (metad_isautotakebyname(sp->setname)) { 1369 delete_end = 0; 1370 } else { 1371 mdclrerror(ep); 1372 goto out; 1373 } 1374 } 1375 1376 /* Inform the mediator hosts of the new information */ 1377 for (i = 0; i < MED_MAX_HOSTS; i++) { 1378 if (sd->sd_med.n_lst[i].a_cnt == 0) 1379 continue; 1380 1381 if (clnt_med_upd_rec(&sd->sd_med.n_lst[i], sp, &medr, ep)) 1382 mdclrerror(ep); 1383 } 1384 1385 /* Delete the set locally */ 1386 for (i = 0; i < MD_MAXSIDES; i++) { 1387 /* Skip empty slots */ 1388 if (sd->sd_nodes[i][0] == '\0') 1389 continue; 1390 1391 /* Skip non local nodes */ 1392 if (strcmp(mynode(), sd->sd_nodes[i]) != 0) 1393 continue; 1394 1395 if (clnt_delset(sd->sd_nodes[i], sp, ep) == -1) 1396 mdclrerror(ep); 1397 } 1398 if (delete_end && 1399 sdssc_delete_end(sp->setname, SDSSC_COMMIT) == SDSSC_ERROR) 1400 rval = -1; 1401 1402 out: 1403 /* release signals back to what they were on entry */ 1404 if (procsigs(FALSE, &oldsigs, &xep) < 0) { 1405 if (rval == 0) 1406 (void) mdstealerror(ep, &xep); 1407 rval = -1; 1408 } 1409 1410 if (lock_set == TRUE) { 1411 cl_sk = cl_get_setkey(sp->setno, sp->setname); 1412 if (clnt_unlock_set(mynode(), cl_sk, &xep)) { 1413 if (rval == 0) 1414 (void) mdstealerror(ep, &xep); 1415 rval = -1; 1416 } 1417 cl_set_setkey(NULL); 1418 } 1419 1420 metaflushsetname(sp); 1421 
return (rval); 1422 } 1423 1424 int 1425 meta_set_purge( 1426 mdsetname_t *sp, 1427 int bypass_cluster, 1428 int forceflg, 1429 md_error_t *ep 1430 ) 1431 { 1432 char *thishost = mynode(); 1433 md_set_desc *sd; 1434 md_setkey_t *cl_sk; 1435 md_error_t xep = mdnullerror; 1436 int rval = 0; 1437 int i, num_hosts = 0; 1438 int has_set = 0; 1439 int max_node = 0; 1440 int delete_end = 1; 1441 md_mnnode_desc *nd; 1442 1443 if ((sd = metaget_setdesc(sp, ep)) == NULL) { 1444 /* unable to find set description */ 1445 rval = 1; 1446 return (rval); 1447 } 1448 1449 if (MD_MNSET_DESC(sd)) { 1450 /* 1451 * Get a count of the hosts in the set and also lock the set 1452 * on those hosts that know about it. 1453 */ 1454 nd = sd->sd_nodelist; 1455 while (nd) { 1456 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 1457 nd = nd->nd_next; 1458 continue; 1459 } 1460 has_set = nodehasset(sp, nd->nd_nodename, 1461 NHS_NST_EQ, ep); 1462 1463 /* 1464 * The host is not aware of this set (has_set < 0) or 1465 * the set does not match (has_set == 0). This check 1466 * prevents the code getting confused by an apparent 1467 * inconsistancy in the set's state, this is in the 1468 * purge code so something is broken in any case and 1469 * this is just trying to fix the brokeness. 1470 */ 1471 if (has_set <= 0) { 1472 mdclrerror(ep); 1473 nd->nd_flags |= MD_MN_NODE_NOSET; 1474 } else { 1475 num_hosts++; 1476 if (clnt_lock_set(nd->nd_nodename, sp, ep)) { 1477 /* 1478 * If the force flag is set then 1479 * ignore any RPC failures because we 1480 * are only really interested with 1481 * the set on local node. 1482 */ 1483 if (forceflg && mdanyrpcerror(ep)) { 1484 mdclrerror(ep); 1485 } else { 1486 /* 1487 * set max_node so that in the 1488 * unlock code nodes in the 1489 * set that have not been 1490 * locked are not unlocked. 
1491 */ 1492 max_node = nd->nd_nodeid; 1493 rval = 2; 1494 goto out1; 1495 } 1496 } 1497 1498 } 1499 nd = nd->nd_next; 1500 } 1501 max_node = 0; 1502 } else { 1503 /* 1504 * Get a count of the hosts in the set and also lock the set 1505 * on those hosts that know about it. 1506 */ 1507 for (i = 0; i < MD_MAXSIDES; i++) { 1508 /* Skip empty slots */ 1509 if (sd->sd_nodes[i][0] == '\0') 1510 continue; 1511 1512 has_set = nodehasset(sp, sd->sd_nodes[i], 1513 NHS_NST_EQ, ep); 1514 1515 /* 1516 * The host is not aware of this set (has_set < 0) or 1517 * the set does not match (has_set == 0). This check 1518 * prevents the code getting confused by an apparent 1519 * inconsistancy in the set's state, this is in the 1520 * purge code so something is broken in any case and 1521 * this is just trying to fix the brokeness. 1522 */ 1523 if (has_set <= 0) { 1524 mdclrerror(ep); 1525 /* 1526 * set the node to NULL to prevent further 1527 * requests to this unresponsive node. 1528 */ 1529 sd->sd_nodes[i][0] = '\0'; 1530 } else { 1531 num_hosts++; 1532 if (clnt_lock_set(sd->sd_nodes[i], sp, ep)) { 1533 /* 1534 * If the force flag is set then 1535 * ignore any RPC failures because we 1536 * are only really interested with 1537 * the set on local node. 1538 */ 1539 if (forceflg && mdanyrpcerror(ep)) { 1540 mdclrerror(ep); 1541 } else { 1542 rval = 2; 1543 /* 1544 * set max_node so that in the 1545 * unlock code nodes in the 1546 * set that have not been 1547 * locked are not unlocked. 1548 */ 1549 max_node = i; 1550 goto out1; 1551 } 1552 } 1553 } 1554 } 1555 max_node = i; /* now MD_MAXSIDES */ 1556 } 1557 if (!bypass_cluster) { 1558 /* 1559 * If there is only one host associated with the 1560 * set then remove the set from the cluster. 
1561 */ 1562 if (num_hosts == 1) { 1563 if (sdssc_delete_begin(sp->setname) == SDSSC_ERROR) { 1564 if (metad_isautotakebyname(sp->setname)) { 1565 delete_end = 0; 1566 } else { 1567 mdclrerror(ep); 1568 rval = 3; 1569 goto out1; 1570 } 1571 } 1572 } 1573 } 1574 1575 if (MD_MNSET_DESC(sd)) { 1576 /* 1577 * Get a count of the hosts in the set and also lock the set 1578 * on those hosts that know about it. 1579 */ 1580 nd = sd->sd_nodelist; 1581 while (nd) { 1582 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 1583 nd = nd->nd_next; 1584 continue; 1585 } 1586 if (nd->nd_nodeid != sd->sd_mn_mynode->nd_nodeid) { 1587 /* 1588 * Tell the remote node to remove this node 1589 */ 1590 if (clnt_delhosts(nd->nd_nodename, sp, 1, 1591 &thishost, ep) == -1) { 1592 /* 1593 * If we fail to delete ourselves 1594 * from the remote host it does not 1595 * really matter because the set is 1596 * being "purged" from this node. The 1597 * set can be purged from the other 1598 * node at a later time. 1599 */ 1600 mdclrerror(ep); 1601 } 1602 nd = nd->nd_next; 1603 continue; 1604 } 1605 /* remove the set from this host */ 1606 if (clnt_delset(nd->nd_nodename, sp, ep) == -1) { 1607 md_perror(dgettext(TEXT_DOMAIN, "delset")); 1608 if (!bypass_cluster && num_hosts == 1) 1609 (void) sdssc_delete_end(sp->setname, 1610 SDSSC_CLEANUP); 1611 mdclrerror(ep); 1612 goto out1; 1613 } 1614 nd = nd->nd_next; 1615 } 1616 } else { 1617 for (i = 0; i < MD_MAXSIDES; i++) { 1618 /* Skip empty slots */ 1619 if (sd->sd_nodes[i][0] == '\0') 1620 continue; 1621 if (strcmp(thishost, sd->sd_nodes[i]) != 0) { 1622 /* 1623 * Tell the remote node to remove this node 1624 */ 1625 if (clnt_delhosts(sd->sd_nodes[i], sp, 1, 1626 &thishost, ep) == -1) { 1627 /* 1628 * If we fail to delete ourselves 1629 * from the remote host it does not 1630 * really matter because the set is 1631 * being "purged" from this node. The 1632 * set can be purged from the other 1633 * node at a later time. 
1634 */ 1635 mdclrerror(ep); 1636 } 1637 continue; 1638 } 1639 1640 /* remove the set from this host */ 1641 if (clnt_delset(sd->sd_nodes[i], sp, ep) == -1) { 1642 md_perror(dgettext(TEXT_DOMAIN, "delset")); 1643 if (!bypass_cluster && num_hosts == 1) 1644 (void) sdssc_delete_end(sp->setname, 1645 SDSSC_CLEANUP); 1646 mdclrerror(ep); 1647 goto out1; 1648 } 1649 } 1650 } 1651 1652 if (!bypass_cluster && num_hosts == 1) { 1653 if (delete_end && sdssc_delete_end(sp->setname, SDSSC_COMMIT) == 1654 SDSSC_ERROR) { 1655 rval = 4; 1656 } 1657 } 1658 1659 out1: 1660 1661 cl_sk = cl_get_setkey(sp->setno, sp->setname); 1662 1663 /* 1664 * Remove the set lock on those nodes that had the set locked 1665 * max_node will either be MD_MAXSIDES or array index of the last 1666 * node contacted (or rather failed to contact) for traditional 1667 * diskset. For a MN diskset, max_node is the node_id of the node 1668 * that failed the lock. 1669 */ 1670 if (MD_MNSET_DESC(sd)) { 1671 nd = sd->sd_nodelist; 1672 while (nd) { 1673 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 1674 nd = nd->nd_next; 1675 continue; 1676 } 1677 if (nd->nd_nodeid == max_node) 1678 break; 1679 if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) { 1680 if (forceflg && mdanyrpcerror(&xep)) { 1681 mdclrerror(&xep); 1682 nd = nd->nd_next; 1683 continue; 1684 } 1685 if (rval == 0) 1686 (void) mdstealerror(ep, &xep); 1687 rval = 5; 1688 } 1689 nd = nd->nd_next; 1690 } 1691 } else { 1692 for (i = 0; i < max_node; i++) { 1693 /* Skip empty slots */ 1694 if (sd->sd_nodes[i][0] == '\0') 1695 continue; 1696 1697 if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, &xep)) { 1698 if (forceflg && mdanyrpcerror(&xep)) { 1699 mdclrerror(&xep); 1700 continue; 1701 } 1702 if (rval == 0) 1703 (void) mdstealerror(ep, &xep); 1704 rval = 5; 1705 } 1706 } 1707 } 1708 1709 cl_set_setkey(NULL); 1710 1711 return (rval); 1712 } 1713 1714 int 1715 meta_set_query( 1716 mdsetname_t *sp, 1717 mddb_dtag_lst_t **dtlpp, 1718 md_error_t *ep 1719 ) 1720 { 1721 
mddb_dtag_get_parm_t dtgp; 1722 1723 (void) memset(&dtgp, '\0', sizeof (mddb_dtag_get_parm_t)); 1724 dtgp.dtgp_setno = sp->setno; 1725 1726 /*CONSTCOND*/ 1727 while (1) { 1728 if (metaioctl(MD_MED_GET_TAG, &dtgp, &dtgp.dtgp_mde, NULL) != 0) 1729 if (! mdismddberror(&dtgp.dtgp_mde, MDE_DB_NOTAG) || 1730 *dtlpp == NULL) 1731 return (mdstealerror(ep, &dtgp.dtgp_mde)); 1732 else 1733 break; 1734 1735 /* 1736 * Run to the end of the list 1737 */ 1738 for (/* void */; (*dtlpp != NULL); dtlpp = &(*dtlpp)->dtl_nx) 1739 /* void */; 1740 1741 *dtlpp = Zalloc(sizeof (mddb_dtag_lst_t)); 1742 1743 (void) memmove(&(*dtlpp)->dtl_dt, &dtgp.dtgp_dt, 1744 sizeof (mddb_dtag_t)); 1745 1746 dtgp.dtgp_dt.dt_id++; 1747 } 1748 return (0); 1749 } 1750 1751 /* 1752 * return drivename get by key 1753 */ 1754 mddrivename_t * 1755 metadrivename_withdrkey( 1756 mdsetname_t *sp, 1757 side_t sideno, 1758 mdkey_t key, 1759 int flags, 1760 md_error_t *ep 1761 ) 1762 { 1763 char *nm; 1764 mdname_t *np; 1765 mddrivename_t *dnp; 1766 ddi_devid_t devidp; 1767 md_set_desc *sd; 1768 1769 if ((sd = metaget_setdesc(sp, ep)) == NULL) { 1770 return (NULL); 1771 } 1772 1773 /* get namespace info */ 1774 if (MD_MNSET_DESC(sd)) { 1775 if ((nm = meta_getnmbykey(MD_LOCAL_SET, sideno, 1776 key, ep)) == NULL) 1777 return (NULL); 1778 } else { 1779 if ((nm = meta_getnmbykey(MD_LOCAL_SET, sideno+SKEW, 1780 key, ep)) == NULL) 1781 return (NULL); 1782 } 1783 1784 /* get device name */ 1785 if (flags & PRINT_FAST) { 1786 if ((np = metaname_fast(&sp, nm, ep)) == NULL) { 1787 Free(nm); 1788 return (NULL); 1789 } 1790 } else { 1791 if ((np = metaname(&sp, nm, ep)) == NULL) { 1792 Free(nm); 1793 return (NULL); 1794 } 1795 } 1796 Free(nm); 1797 1798 /* make sure it's OK */ 1799 if ((! 
(flags & MD_BASICNAME_OK)) && (metachkcomp(np, ep) != 0)) 1800 return (NULL); 1801 1802 /* get drivename */ 1803 dnp = np->drivenamep; 1804 dnp->side_names_key = key; 1805 1806 /* 1807 * Skip the following devid check if dnp is did device 1808 * The device id is disabled for did device due to the 1809 * lack of minor name support in the did driver. The following 1810 * devid code path can set and propagate the error and 1811 * eventually prevent did disks from being added to the 1812 * diskset under SunCluster systems 1813 */ 1814 if (strncmp(dnp->rname, "/dev/did/", strlen("/dev/did/")) == 0) { 1815 goto out; 1816 } 1817 1818 /* Also, Skip the check if MN diskset, no devid's */ 1819 if (MD_MNSET_DESC(sd)) { 1820 goto out; 1821 } 1822 1823 /* 1824 * Get the devid associated with the key. 1825 * 1826 * If a devid was returned, it MUST be valid even in 1827 * the case where a device id has been "updated". The 1828 * "update" of the device id may have occured due to 1829 * a firmware upgrade. 1830 */ 1831 if ((devidp = meta_getdidbykey(MD_LOCAL_SET, sideno+SKEW, key, ep)) 1832 != NULL) { 1833 dnp->devid = devid_str_encode(devidp, NULL); 1834 free(devidp); 1835 } else { 1836 /* 1837 * It is okay if replica is not in devid mode 1838 */ 1839 if (mdissyserror(ep, MDDB_F_NODEVID)) { 1840 mdclrerror(ep); 1841 goto out; 1842 } 1843 1844 /* 1845 * devid is missing so this means that we have 1846 * just upgraded from a configuration where 1847 * devid's were not used so try to add in 1848 * the devid and requery. 
1849 */ 1850 if (meta_setdid(MD_LOCAL_SET, sideno + SKEW, key, 1851 ep) < 0) 1852 return (NULL); 1853 if ((devidp = (ddi_devid_t)meta_getdidbykey(MD_LOCAL_SET, 1854 sideno+SKEW, key, ep)) == NULL) 1855 return (NULL); 1856 dnp->devid = devid_str_encode(devidp, NULL); 1857 devid_free(devidp); 1858 } 1859 1860 out: 1861 if (flags & MD_BYPASS_DAEMON) 1862 return (dnp); 1863 1864 if (get_sidenmlist(sp, dnp, ep)) 1865 return (NULL); 1866 1867 /* return success */ 1868 return (dnp); 1869 } 1870 1871 void 1872 metafreedrivedesc(md_drive_desc **dd) 1873 { 1874 md_drive_desc *p, *next = NULL; 1875 1876 for (p = *dd; p != NULL; p = next) { 1877 next = p->dd_next; 1878 Free(p); 1879 } 1880 *dd = NULL; 1881 } 1882 1883 md_drive_desc * 1884 metaget_drivedesc( 1885 mdsetname_t *sp, 1886 int flags, 1887 md_error_t *ep 1888 ) 1889 { 1890 side_t sideno = MD_SIDEWILD; 1891 1892 assert(! (flags & MD_BYPASS_DAEMON)); 1893 1894 if ((sideno = getmyside(sp, ep)) == MD_SIDEWILD) 1895 return (NULL); 1896 1897 return (metaget_drivedesc_sideno(sp, sideno, flags, ep)); 1898 } 1899 1900 md_drive_desc * 1901 metaget_drivedesc_fromnamelist( 1902 mdsetname_t *sp, 1903 mdnamelist_t *nlp, 1904 md_error_t *ep 1905 ) 1906 { 1907 md_set_desc *sd; 1908 mdnamelist_t *p; 1909 md_drive_desc *dd = NULL; 1910 1911 if ((sd = metaget_setdesc(sp, ep)) == NULL) 1912 return (NULL); 1913 1914 for (p = nlp; p != NULL; p = p->next) 1915 (void) metadrivedesc_append(&dd, p->namep->drivenamep, 0, 0, 1916 sd->sd_ctime, sd->sd_genid, MD_DR_ADD); 1917 1918 return (dd); 1919 } 1920 1921 md_drive_desc * 1922 metaget_drivedesc_sideno( 1923 mdsetname_t *sp, 1924 side_t sideno, 1925 int flags, 1926 md_error_t *ep 1927 ) 1928 { 1929 md_set_desc *sd = NULL; 1930 1931 assert(! 
(flags & MD_BYPASS_DAEMON)); 1932 1933 if ((sd = metaget_setdesc(sp, ep)) == NULL) 1934 return (NULL); 1935 1936 if (sd->sd_drvs) 1937 return (sd->sd_drvs); 1938 1939 if ((sd->sd_drvs = dr2drivedesc(sp, sideno, flags, ep)) == NULL) 1940 return (NULL); 1941 1942 return (sd->sd_drvs); 1943 } 1944 1945 int 1946 metaget_setownership( 1947 mdsetname_t *sp, 1948 md_error_t *ep 1949 ) 1950 { 1951 md_set_desc *sd; 1952 int bool; 1953 int i; 1954 md_mnnode_desc *nd; 1955 1956 if ((sd = metaget_setdesc(sp, ep)) == NULL) 1957 return (-1); 1958 1959 if (MD_MNSET_DESC(sd)) { 1960 nd = sd->sd_nodelist; 1961 while (nd) { 1962 /* If node isn't alive, can't own diskset */ 1963 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 1964 nd->nd_flags &= ~MD_MN_NODE_OWN; 1965 nd = nd->nd_next; 1966 continue; 1967 } 1968 /* 1969 * If can't communicate with rpc.metad, then mark 1970 * this node as not an owner. That node may 1971 * in fact, be an owner, but without rpc.metad running 1972 * that node can't do much. 1973 */ 1974 if (clnt_ownset(nd->nd_nodename, sp, &bool, ep) == -1) { 1975 nd->nd_flags &= ~MD_MN_NODE_OWN; 1976 } else if (bool == TRUE) { 1977 nd->nd_flags |= MD_MN_NODE_OWN; 1978 } else { 1979 nd->nd_flags &= ~MD_MN_NODE_OWN; 1980 } 1981 nd = nd->nd_next; 1982 } 1983 return (0); 1984 } 1985 1986 /* Rest of code handles traditional disksets */ 1987 1988 for (i = 0; i < MD_MAXSIDES; i++) 1989 sd->sd_isown[i] = 0; 1990 1991 if (clnt_ownset(mynode(), sp, &bool, ep) == -1) 1992 return (-1); 1993 1994 if (bool == TRUE) 1995 sd->sd_isown[getmyside(sp, ep)] = 1; 1996 1997 return (0); 1998 } 1999 2000 char * 2001 mynode(void) 2002 { 2003 static struct utsname myuname; 2004 static int done = 0; 2005 2006 if (! 
done) { 2007 if (uname(&myuname) == -1) { 2008 md_perror(dgettext(TEXT_DOMAIN, "uname")); 2009 assert(0); 2010 } 2011 done = 1; 2012 } 2013 return (myuname.nodename); 2014 } 2015 2016 int 2017 strinlst(char *str, int cnt, char **lst) 2018 { 2019 int i; 2020 2021 for (i = 0; i < cnt; i++) 2022 if (strcmp(lst[i], str) == 0) 2023 return (TRUE); 2024 2025 return (FALSE); 2026 } 2027 2028 /* 2029 * meta_get_reserved_names 2030 * returns an mdnamelist_t of reserved slices 2031 * reserved slices are those that are used but don't necessarily 2032 * show up as metadevices (ex. reserved slice for db in sets, logs) 2033 */ 2034 2035 /*ARGSUSED*/ 2036 int 2037 meta_get_reserved_names( 2038 mdsetname_t *sp, 2039 mdnamelist_t **nlpp, 2040 int options, 2041 md_error_t *ep) 2042 { 2043 int count = 0; 2044 mdname_t *np = NULL; 2045 mdnamelist_t *transnlp = NULL; 2046 mdnamelist_t **tailpp = nlpp; 2047 mdnamelist_t *nlp; 2048 md_drive_desc *dd, *di; 2049 2050 if (metaislocalset(sp)) 2051 goto out; 2052 2053 if (!(dd = metaget_drivedesc(sp, MD_BASICNAME_OK, ep)) && !mdisok(ep)) { 2054 count = -1; 2055 goto out; 2056 } 2057 2058 /* db in for sets on reserved slice */ 2059 for (di = dd; di && count >= 0; di = di->dd_next) { 2060 uint_t rep_slice; 2061 2062 /* 2063 * Add the name struct to the end of the 2064 * namelist but keep a pointer to the last 2065 * element so that we don't incur the overhead 2066 * of traversing the list each time 2067 */ 2068 if (di->dd_dnp && 2069 (meta_replicaslice(di->dd_dnp, &rep_slice, ep) == 0) && 2070 (np = metaslicename(di->dd_dnp, rep_slice, ep)) && 2071 (tailpp = meta_namelist_append_wrapper(tailpp, np))) 2072 count++; 2073 else 2074 count = -1; 2075 } 2076 2077 /* now find logs */ 2078 if (meta_get_trans_names(sp, &transnlp, 0, ep) < 0) { 2079 count = -1; 2080 goto out; 2081 } 2082 2083 for (nlp = transnlp; (nlp != NULL); nlp = nlp->next) { 2084 mdname_t *transnp = nlp->namep; 2085 md_trans_t *transp; 2086 2087 if ((transp = meta_get_trans(sp, 
transnp, ep)) == NULL) { 2088 count = -1; 2089 goto out; 2090 } 2091 if (transp->lognamep) { 2092 /* 2093 * Add the name struct to the end of the 2094 * namelist but keep a pointer to the last 2095 * element so that we don't incur the overhead 2096 * of traversing the list each time 2097 */ 2098 tailpp = meta_namelist_append_wrapper( 2099 tailpp, transp->lognamep); 2100 } 2101 } 2102 out: 2103 metafreenamelist(transnlp); 2104 return (count); 2105 } 2106 2107 /* 2108 * Entry point to join a node to MultiNode diskset. 2109 * 2110 * Validate host in diskset. 2111 * - Should be in membership list from API 2112 * - Should not already be joined into diskset. 2113 * - Set must have drives 2114 * Assume valid configuration is stored in the set/drive/node records 2115 * in the local mddb since no node or drive can be added to the MNset 2116 * unless all drives and nodes are available. Reconfig steps will 2117 * resync all ALIVE nodes in case of panic in critical areas. 2118 * 2119 * Lock down the set. 2120 * Verify host is a member of this diskset. 2121 * If drives exist in the configuration, load the mddbs. 2122 * Set this node to active by notifying master if one exists. 2123 * If this is the first node active in the diskset, this node 2124 * becomes the master. 2125 * Unlock the set. 2126 * 2127 * Mirror Resync: 2128 * If this node is the last node to join the set and clustering 2129 * isn't running, then start the 'metasync -r' type resync 2130 * on all mirrors in this diskset. 2131 * If clustering is running, this resync operation will 2132 * be handled by the reconfig steps and should NOT 2133 * be handled during a join operation. 2134 * 2135 * There are multiple return values in order to assist 2136 * the join operation of all sets in the metaset command. 2137 * 2138 * Return values: 2139 * 0 - Node successfully joined to set. 
2140 * -1 - Join attempted but failed 2141 * - any failure from libmeta calls 2142 * - node not in the member list 2143 * -2 - Join not attempted since 2144 * - this set had no drives in set 2145 * - this node already joined to set 2146 * - set is not a multinode set 2147 * -3 - Node joined to STALE set. 2148 */ 2149 extern int 2150 meta_set_join( 2151 mdsetname_t *sp, 2152 md_error_t *ep 2153 ) 2154 { 2155 md_set_desc *sd; 2156 md_drive_desc *dd; 2157 md_mnnode_desc *nd, *nd2, my_nd; 2158 int rval = 0; 2159 md_setkey_t *cl_sk; 2160 md_error_t xep = mdnullerror; 2161 md_error_t ep_snarf = mdnullerror; 2162 int master_flag = 0; 2163 md_mnset_record *mas_mnsr = NULL; 2164 int clear_nr_flags = 0; 2165 md_mnnode_record *nr; 2166 int stale_set = 0; 2167 int rb_flags = 0; 2168 int stale_bool = FALSE; 2169 int suspendall_flag = 0; 2170 int suspend1_flag = 0; 2171 sigset_t oldsigs; 2172 int send_reinit = 0; 2173 2174 if ((sd = metaget_setdesc(sp, ep)) == NULL) { 2175 return (-1); 2176 } 2177 2178 /* Must be a multinode diskset */ 2179 if (!MD_MNSET_DESC(sd)) { 2180 (void) mderror(ep, MDE_NOT_MN, sp->setname); 2181 return (-2); 2182 } 2183 2184 /* Verify that the node is ALIVE (i.e. is in the API membership list) */ 2185 if (!(sd->sd_mn_mynode->nd_flags & MD_MN_NODE_ALIVE)) { 2186 (void) mddserror(ep, MDE_DS_NOTINMEMBERLIST, sp->setno, 2187 sd->sd_mn_mynode->nd_nodename, NULL, 2188 sp->setname); 2189 return (-1); 2190 } 2191 2192 /* Make sure we are blocking all signals */ 2193 if (procsigs(TRUE, &oldsigs, &xep) < 0) 2194 mdclrerror(&xep); 2195 2196 /* 2197 * Lock the set on current set members. 2198 * For MN diskset lock_set and SUSPEND are used to protect against 2199 * other meta* commands running on the other nodes. 
2200 */ 2201 nd = sd->sd_nodelist; 2202 while (nd) { 2203 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 2204 nd = nd->nd_next; 2205 continue; 2206 } 2207 if (clnt_lock_set(nd->nd_nodename, sp, ep)) { 2208 rval = -1; 2209 goto out; 2210 } 2211 nd = nd->nd_next; 2212 } 2213 2214 /* 2215 * Lock out other meta* commands by suspending 2216 * class 1 messages across the diskset. 2217 */ 2218 nd = sd->sd_nodelist; 2219 while (nd) { 2220 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 2221 nd = nd->nd_next; 2222 continue; 2223 } 2224 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND, 2225 sp, MD_MSG_CLASS1, MD_MSCF_NO_FLAGS, ep)) { 2226 rval = -1; 2227 goto out; 2228 } 2229 suspend1_flag = 1; 2230 nd = nd->nd_next; 2231 } 2232 2233 /* 2234 * Verify that this host is a member (in the host list) of the set. 2235 */ 2236 nd = sd->sd_nodelist; 2237 while (nd) { 2238 if (strcmp(mynode(), nd->nd_nodename) == 0) { 2239 break; 2240 } 2241 nd = nd->nd_next; 2242 } 2243 if (!nd) { 2244 (void) mddserror(ep, MDE_DS_NODENOTINSET, sp->setno, 2245 sd->sd_mn_mynode->nd_nodename, NULL, 2246 sp->setname); 2247 rval = -1; 2248 goto out; 2249 } 2250 2251 /* 2252 * Need to return failure if host is already 'joined' 2253 * into the set. This is done so that if later the user 2254 * issues a command to join all sets and a failure is 2255 * encountered - that the resulting cleanup effort 2256 * (withdrawing from all sets that were joined 2257 * during that command) won't withdraw from this set. 2258 */ 2259 if (nd->nd_flags & MD_MN_NODE_OWN) { 2260 rval = -2; 2261 goto out2; 2262 } 2263 2264 /* 2265 * Call metaget_setownership that calls each node in diskset and 2266 * marks in set descriptor if node is an owner of the set or not. 2267 * metaget_setownership checks to see if a node is an owner by 2268 * checking to see if that node's kernel has the mddb loaded. 
2269 * If a node had panic'd during a reconfig or an 2270 * add/delete/join/withdraw operation, the other nodes' node 2271 * records may not reflect the current state of the diskset, 2272 * so calling metaget_setownership is the safest thing to do. 2273 */ 2274 if (metaget_setownership(sp, ep) == -1) { 2275 rval = -1; 2276 goto out; 2277 } 2278 2279 /* If first active member of diskset, become the master. */ 2280 nd = sd->sd_nodelist; 2281 while (nd) { 2282 if (nd->nd_flags & MD_MN_NODE_OWN) 2283 break; 2284 nd = nd->nd_next; 2285 } 2286 if (nd == NULL) 2287 master_flag = 1; 2288 2289 /* 2290 * If not first active member of diskset, then get the 2291 * master information from a node that is already joined 2292 * and set the master information for this node. Be sure 2293 * that this node (the already joined node) has its own 2294 * join flag set. If not, then this diskset isn't currently 2295 * consistent and shouldn't allow a node to join. This diskset 2296 * inconsistency should only occur when a node has panic'd in 2297 * the set while doing a metaset operation and the sysadmin is 2298 * attempting to join a node into the set. This inconsistency 2299 * will be fixed during a reconfig cycle which should be occurring 2300 * soon since a node panic'd. 2301 * 2302 * If unable to get this information from an owning node, then 2303 * this diskset isn't currently consistent and shouldn't 2304 * allow a node to join. 
2305 */ 2306 if (!master_flag) { 2307 /* get master information from an owner (joined) node */ 2308 if (clnt_mngetset(nd->nd_nodename, sp->setname, 2309 sp->setno, &mas_mnsr, ep) == -1) { 2310 rval = -1; 2311 goto out; 2312 } 2313 2314 /* Verify that owner (joined) node has its own JOIN flag set */ 2315 nr = mas_mnsr->sr_nodechain; 2316 while (nr) { 2317 if ((nd->nd_nodeid == nr->nr_nodeid) && 2318 ((nr->nr_flags & MD_MN_NODE_OWN) == NULL)) { 2319 (void) mddserror(ep, MDE_DS_NODENOSET, 2320 sp->setno, nd->nd_nodename, NULL, 2321 nd->nd_nodename); 2322 free_sr((md_set_record *)mas_mnsr); 2323 rval = -1; 2324 goto out; 2325 } 2326 nr = nr->nr_next; 2327 } 2328 2329 /* 2330 * Does master have set marked as STALE? 2331 * If so, need to pass this down to kernel when 2332 * this node snarfs the set. 2333 */ 2334 if (clnt_mn_is_stale(nd->nd_nodename, sp, 2335 &stale_bool, ep) == -1) { 2336 rval = -1; 2337 goto out; 2338 } 2339 2340 /* set master information in my rpc.metad's set record */ 2341 if (clnt_mnsetmaster(mynode(), sp, mas_mnsr->sr_master_nodenm, 2342 mas_mnsr->sr_master_nodeid, ep)) { 2343 free_sr((md_set_record *)mas_mnsr); 2344 rval = -1; 2345 goto out; 2346 } 2347 2348 /* set master information in my cached set desc */ 2349 (void) strcpy(sd->sd_mn_master_nodenm, 2350 mas_mnsr->sr_master_nodenm); 2351 sd->sd_mn_master_nodeid = mas_mnsr->sr_master_nodeid; 2352 nd2 = sd->sd_nodelist; 2353 while (nd2) { 2354 if (nd2->nd_nodeid == mas_mnsr->sr_master_nodeid) { 2355 sd->sd_mn_masternode = nd2; 2356 break; 2357 } 2358 nd2 = nd2->nd_next; 2359 } 2360 free_sr((md_set_record *)mas_mnsr); 2361 2362 /* 2363 * Set the node flags in mynode's rpc.metad node records for 2364 * the nodes that are in the diskset. Can use my sd 2365 * since earlier call to metaget_setownership set the 2366 * owner flags based on whether that node had snarfed 2367 * the MN diskset mddb. 
Reconfig steps guarantee that 2368 * return of metaget_setownership will match the owning 2369 * node's owner list except in the case where a node 2370 * has just panic'd and in this case, a reconfig will 2371 * be starting immediately and the owner lists will 2372 * be sync'd up by the reconfig. 2373 * 2374 * Flag of SET means to take no action except to 2375 * set the node flags as given in the nodelist linked list. 2376 */ 2377 if (clnt_upd_nr_flags(mynode(), sp, sd->sd_nodelist, 2378 MD_NR_SET, NULL, ep)) { 2379 rval = -1; 2380 goto out; 2381 } 2382 } 2383 2384 /* 2385 * Read in the mddb if there are drives in the set. 2386 */ 2387 if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST), 2388 ep)) == NULL) { 2389 /* No drives in list */ 2390 if (! mdisok(ep)) { 2391 rval = -1; 2392 goto out; 2393 } 2394 rval = -2; 2395 goto out; 2396 } 2397 2398 /* 2399 * Notify rpc.mdcommd on all nodes of a nodelist change. 2400 * Start by suspending rpc.mdcommd (which drains it of all messages), 2401 * then change the nodelist followed by a reinit and resume. 2402 */ 2403 nd = sd->sd_nodelist; 2404 while (nd) { 2405 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 2406 nd = nd->nd_next; 2407 continue; 2408 } 2409 2410 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND, sp, 2411 MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, ep)) { 2412 rval = -1; 2413 goto out; 2414 } 2415 suspendall_flag = 1; 2416 nd = nd->nd_next; 2417 } 2418 2419 /* Set master in my set record in rpc.metad */ 2420 if (master_flag) { 2421 if (clnt_mnsetmaster(mynode(), sp, 2422 sd->sd_mn_mynode->nd_nodename, 2423 sd->sd_mn_mynode->nd_nodeid, ep)) { 2424 rval = -1; 2425 goto out; 2426 } 2427 } 2428 /* 2429 * Causes mddbs to be loaded into the kernel. 2430 * Set the force flag so that replica locations can be 2431 * loaded into the kernel even if a mediator node was 2432 * unavailable. This allows a node to join an MO 2433 * diskset when there are sufficient replicas available, 2434 * but a mediator node in unavailable. 
2435 */ 2436 if (setup_db_bydd(sp, dd, TRUE, ep) == -1) { 2437 mde_perror(ep, dgettext(TEXT_DOMAIN, 2438 "Host not able to start diskset.")); 2439 rval = -1; 2440 goto out; 2441 } 2442 2443 if (! mdisok(ep)) { 2444 rval = -1; 2445 goto out; 2446 } 2447 2448 /* 2449 * Set rollback flags to 1 so that halt_set is called if a failure 2450 * is seen after this point. If snarf_set fails, still need to 2451 * call halt_set to cleanup the diskset. 2452 */ 2453 rb_flags = 1; 2454 2455 /* Starts the set */ 2456 if (snarf_set(sp, stale_bool, ep) != 0) { 2457 if (mdismddberror(ep, MDE_DB_STALE)) { 2458 /* 2459 * Don't fail join, STALE means that set has 2460 * < 50% mddbs. 2461 */ 2462 (void) mdstealerror(&ep_snarf, ep); 2463 stale_set = 1; 2464 } else if (mdisok(ep)) { 2465 /* If snarf failed, but no error was set - set it */ 2466 (void) mdmddberror(ep, MDE_DB_NOTNOW, (minor_t)NODEV64, 2467 sp->setno, 0, NULL); 2468 rval = -1; 2469 goto out; 2470 } else if (!(mdismddberror(ep, MDE_DB_ACCOK))) { 2471 /* 2472 * Don't fail join if ACCOK; ACCOK means that mediator 2473 * provided extra vote. 2474 */ 2475 rval = -1; 2476 goto out; 2477 } 2478 } 2479 2480 /* Did set really get snarfed? 
*/ 2481 if (own_set(sp, NULL, TRUE, ep) == MD_SETOWNER_NO) { 2482 if (mdisok(ep)) { 2483 /* If snarf failed, but no error was set - set it */ 2484 (void) mdmddberror(ep, MDE_DB_NOTNOW, (minor_t)NODEV64, 2485 sp->setno, 0, NULL); 2486 } 2487 mde_perror(ep, dgettext(TEXT_DOMAIN, 2488 "Host not able to start diskset.")); 2489 rval = -1; 2490 goto out; 2491 } 2492 2493 /* Change to nodelist so need to send reinit to rpc.mdcommd */ 2494 send_reinit = 1; 2495 2496 /* If first node to enter set, setup master and clear change log */ 2497 if (master_flag) { 2498 /* Set master in my locally cached set descriptor */ 2499 (void) strcpy(sd->sd_mn_master_nodenm, 2500 sd->sd_mn_mynode->nd_nodename); 2501 sd->sd_mn_master_nodeid = sd->sd_mn_mynode->nd_nodeid; 2502 sd->sd_mn_am_i_master = 1; 2503 2504 /* 2505 * If first node to join set, then clear out change log 2506 * entries. Change log entries are only needed when a 2507 * change of master is occurring in a diskset that has 2508 * multiple owners. Since this node is the first owner 2509 * of the diskset, clear the entries. 2510 * 2511 * Only do this if we are in a single node non-SC3.x 2512 * situation. 2513 */ 2514 if (meta_mn_singlenode() && 2515 mdmn_reset_changelog(sp, ep, MDMN_CLF_RESETLOG) != 0) { 2516 mde_perror(ep, dgettext(TEXT_DOMAIN, 2517 "Unable to reset changelog.")); 2518 rval = -1; 2519 goto out; 2520 } 2521 } 2522 2523 /* Set my locally cached flag */ 2524 sd->sd_mn_mynode->nd_flags |= MD_MN_NODE_OWN; 2525 2526 /* 2527 * Set this node's own flag on all joined nodes in the set 2528 * (including my node). 
2529 */ 2530 clear_nr_flags = 1; 2531 2532 my_nd = *(sd->sd_mn_mynode); 2533 my_nd.nd_next = NULL; 2534 nd = sd->sd_nodelist; 2535 while (nd) { 2536 if (!(nd->nd_flags & MD_MN_NODE_OWN)) { 2537 nd = nd->nd_next; 2538 continue; 2539 } 2540 if (clnt_upd_nr_flags(nd->nd_nodename, sp, &my_nd, 2541 MD_NR_JOIN, NULL, ep)) { 2542 rval = -1; 2543 goto out; 2544 } 2545 nd = nd->nd_next; 2546 } 2547 2548 out: 2549 if (rval != NULL) { 2550 /* 2551 * If rollback flag is 1, then node was joined to set. 2552 * Since an error occurred, withdraw node from set in 2553 * order to rollback to before command was run. 2554 * Need to preserve ep so that calling function can 2555 * get error information. 2556 */ 2557 if (rb_flags == 1) { 2558 if (halt_set(sp, &xep)) { 2559 mdclrerror(&xep); 2560 } 2561 } 2562 2563 /* 2564 * If error, reset master to INVALID. 2565 * Ignore error since (next) first node to successfully join 2566 * will set master on all nodes. 2567 */ 2568 (void) clnt_mnsetmaster(mynode(), sp, "", 2569 MD_MN_INVALID_NID, &xep); 2570 mdclrerror(&xep); 2571 /* Reset master in my locally cached set descriptor */ 2572 sd->sd_mn_master_nodeid = MD_MN_INVALID_NID; 2573 sd->sd_mn_am_i_master = 0; 2574 2575 /* 2576 * If nr flags set on other nodes, reset them. 2577 */ 2578 if (clear_nr_flags) { 2579 nd = sd->sd_nodelist; 2580 while (nd) { 2581 if (!(nd->nd_flags & MD_MN_NODE_OWN)) { 2582 nd = nd->nd_next; 2583 continue; 2584 } 2585 (void) clnt_upd_nr_flags(nd->nd_nodename, sp, 2586 &my_nd, MD_NR_WITHDRAW, NULL, &xep); 2587 mdclrerror(&xep); 2588 nd = nd->nd_next; 2589 } 2590 /* Reset my locally cached flag */ 2591 sd->sd_mn_mynode->nd_flags &= ~MD_MN_NODE_OWN; 2592 } 2593 } 2594 2595 /* 2596 * Notify rpc.mdcommd on all nodes of a nodelist change. 2597 * Send reinit command to mdcommd which forces it to get 2598 * fresh set description. 
2599 */ 2600 if (send_reinit) { 2601 /* Send reinit */ 2602 nd = sd->sd_nodelist; 2603 while (nd) { 2604 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 2605 nd = nd->nd_next; 2606 continue; 2607 } 2608 2609 /* Class is ignored for REINIT */ 2610 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT, 2611 sp, NULL, MD_MSCF_NO_FLAGS, &xep)) { 2612 /* 2613 * We are here because we failed to resume 2614 * rpc.mdcommd. However we potentially have 2615 * an error from the previous call 2616 * If the previous call did fail, we capture 2617 * that error and generate a perror with 2618 * the string, "Unable to resume...". 2619 * Setting rval to -1 ensures that in the 2620 * next iteration of the loop, ep is not 2621 * clobbered. 2622 */ 2623 if (rval == 0) 2624 (void) mdstealerror(ep, &xep); 2625 else 2626 mdclrerror(&xep); 2627 rval = -1; 2628 mde_perror(ep, dgettext(TEXT_DOMAIN, 2629 "Unable to reinit rpc.mdcommd.")); 2630 } 2631 nd = nd->nd_next; 2632 } 2633 2634 } 2635 2636 out2: 2637 /* 2638 * Unlock diskset by resuming messages across the diskset. 2639 * Just resume all classes so that resume is the same whether 2640 * just one class was locked or all classes were locked. 2641 */ 2642 if ((suspend1_flag) || (suspendall_flag)) { 2643 nd = sd->sd_nodelist; 2644 while (nd) { 2645 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 2646 nd = nd->nd_next; 2647 continue; 2648 } 2649 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME, 2650 sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) { 2651 /* 2652 * We are here because we failed to resume 2653 * rpc.mdcommd. However we potentially have 2654 * an error from the previous call 2655 * If the previous call did fail, we capture 2656 * that error and generate a perror with 2657 * the string, "Unable to resume...". 2658 * Setting rval to -1 ensures that in the 2659 * next iteration of the loop, ep is not 2660 * clobbered. 
2661 */ 2662 if (rval == 0) 2663 (void) mdstealerror(ep, &xep); 2664 else 2665 mdclrerror(&xep); 2666 rval = -1; 2667 mde_perror(ep, dgettext(TEXT_DOMAIN, 2668 "Unable to resume rpc.mdcommd.")); 2669 } 2670 nd = nd->nd_next; 2671 } 2672 meta_ping_mnset(sp->setno); 2673 } 2674 2675 /* 2676 * Unlock set. This flushes the caches on the servers. 2677 */ 2678 cl_sk = cl_get_setkey(sp->setno, sp->setname); 2679 nd = sd->sd_nodelist; 2680 while (nd) { 2681 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 2682 nd = nd->nd_next; 2683 continue; 2684 } 2685 if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) { 2686 if (rval == 0) 2687 (void) mdstealerror(ep, &xep); 2688 else 2689 mdclrerror(&xep); 2690 rval = -1; 2691 } 2692 nd = nd->nd_next; 2693 } 2694 2695 /* 2696 * If this node is the last to join the diskset and clustering isn't 2697 * running, then resync the mirrors in the diskset. We have to wait 2698 * until all nodes are joined so that the status gets propagated to 2699 * all of the members of the set. 2700 * Ignore any error from the resync as the join function shouldn't fail 2701 * because the mirror resync had a problem. 2702 * 2703 * Don't start resync if set is stale. 2704 */ 2705 if ((rval == 0) && (sdssc_bind_library() != SDSSC_OKAY) && 2706 (stale_set != 1)) { 2707 nd = sd->sd_nodelist; 2708 while (nd) { 2709 if (!(nd->nd_flags & MD_MN_NODE_OWN)) 2710 break; 2711 nd = nd->nd_next; 2712 } 2713 /* 2714 * nd set to NULL means that we have no nodes in the set that 2715 * haven't joined. In this case we start the resync. 2716 */ 2717 if (nd == NULL) { 2718 (void) meta_mirror_resync_all(sp, 0, &xep); 2719 mdclrerror(&xep); 2720 } 2721 } 2722 2723 /* Update ABR state for all soft partitions */ 2724 (void) meta_sp_update_abr(sp, &xep); 2725 mdclrerror(&xep); 2726 2727 /* 2728 * call metaflushsetnames to reset local cache for master and 2729 * node information. 
2730 */ 2731 metaflushsetname(sp); 2732 2733 /* release signals back to what they were on entry */ 2734 if (procsigs(FALSE, &oldsigs, &xep) < 0) 2735 mdclrerror(&xep); 2736 2737 /* 2738 * If no error and stale_set is set, then set ep back 2739 * to ep from snarf_set call and return -3. If another error 2740 * occurred and rval is not 0, then that error would have 2741 * caused the node to be withdrawn from the set and would 2742 * have set ep to that error information. 2743 */ 2744 if ((rval == 0) && (stale_set)) { 2745 (void) mdstealerror(ep, &ep_snarf); 2746 return (-3); 2747 } 2748 2749 return (rval); 2750 } 2751 2752 /* 2753 * Entry point to withdraw a node from MultiNode diskset. 2754 * 2755 * Validate host in diskset. 2756 * - Should be joined into diskset. 2757 * Assume valid configuration is stored in the set/drive/node records 2758 * in the local mddb since no node or drive can be added to the MNset 2759 * unless all drives and nodes are available. Reconfig steps will 2760 * resync all ALIVE nodes in case of panic in critical areas. 2761 * 2762 * Lock down the set. 2763 * Verify that drives exist in configuration. 2764 * Verify host is a member of this diskset. 2765 * Verify host is an owner of the diskset (host is joined to diskset). 2766 * Only allow withdrawal of master node if master node is the only joined 2767 * in the diskset. 2768 * Halt the diskset on this node. 2769 * Reset Master on this node. 2770 * Updated node flags that this node with withdrawn. 2771 * Unlock the set. 2772 * 2773 * Return values: 2774 * 0 - Node successfully withdrew from set. 
2775 * -1 - Withdrawal attempted but failed 2776 * - any failure from libmeta calls 2777 * - node not in the member list 2778 * -2 - Withdrawal not attempted since 2779 * - this set had no drives in set 2780 * - this node not joined to set 2781 * - set is not a multinode set 2782 */ 2783 extern int 2784 meta_set_withdraw( 2785 mdsetname_t *sp, 2786 md_error_t *ep 2787 ) 2788 { 2789 md_set_desc *sd; 2790 md_drive_desc *dd = 0; 2791 md_mnnode_desc *nd, my_nd; 2792 int rval = 0; 2793 md_setkey_t *cl_sk; 2794 md_error_t xep = mdnullerror; 2795 int set_halted = 0; 2796 int suspendall_flag = 0; 2797 int suspend1_flag = 0; 2798 bool_t stale_bool = FALSE; 2799 mddb_config_t c; 2800 int node_id_list[1]; 2801 sigset_t oldsigs; 2802 int send_reinit = 0; 2803 2804 if ((sd = metaget_setdesc(sp, ep)) == NULL) { 2805 return (-1); 2806 } 2807 2808 /* Must be a multinode diskset */ 2809 if (!MD_MNSET_DESC(sd)) { 2810 (void) mderror(ep, MDE_NOT_MN, sp->setname); 2811 return (-1); 2812 } 2813 2814 /* Make sure we are blocking all signals */ 2815 if (procsigs(TRUE, &oldsigs, &xep) < 0) 2816 mdclrerror(&xep); 2817 2818 /* 2819 * Lock the set on current set members. 2820 * For MN diskset lock_set and SUSPEND are used to protect against 2821 * other meta* commands running on the other nodes. 2822 */ 2823 nd = sd->sd_nodelist; 2824 while (nd) { 2825 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 2826 nd = nd->nd_next; 2827 continue; 2828 } 2829 if (clnt_lock_set(nd->nd_nodename, sp, ep)) { 2830 rval = -1; 2831 goto out; 2832 } 2833 nd = nd->nd_next; 2834 } 2835 /* 2836 * Lock out other meta* commands by suspending 2837 * class 1 messages across the diskset. 
2838 */ 2839 nd = sd->sd_nodelist; 2840 while (nd) { 2841 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 2842 nd = nd->nd_next; 2843 continue; 2844 } 2845 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND, 2846 sp, MD_MSG_CLASS1, MD_MSCF_NO_FLAGS, ep)) { 2847 rval = -1; 2848 goto out; 2849 } 2850 suspend1_flag = 1; 2851 nd = nd->nd_next; 2852 } 2853 2854 /* Get list of drives - needed in case of failure */ 2855 if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST), 2856 ep)) == NULL) { 2857 /* Error getting drives in list */ 2858 if (! mdisok(ep)) { 2859 rval = -1; 2860 goto out2; 2861 } 2862 /* no drives in list */ 2863 rval = -2; 2864 goto out2; 2865 } 2866 2867 /* 2868 * Verify that this host is a member (in the host list) of the set. 2869 */ 2870 nd = sd->sd_nodelist; 2871 while (nd) { 2872 if (strcmp(mynode(), nd->nd_nodename) == 0) { 2873 break; 2874 } 2875 nd = nd->nd_next; 2876 } 2877 if (!nd) { 2878 (void) mddserror(ep, MDE_DS_NODENOTINSET, sp->setno, 2879 sd->sd_mn_mynode->nd_nodename, NULL, 2880 sp->setname); 2881 rval = -1; 2882 goto out2; 2883 } 2884 2885 /* 2886 * Call metaget_setownership that calls each node in diskset and 2887 * marks in set descriptor if node is an owner of the set or not. 2888 * metaget_setownership checks to see if a node is an owner by 2889 * checking to see if that node's kernel has the mddb loaded. 2890 * If a node had panic'd during a reconfig or an 2891 * add/delete/join/withdraw operation, the other nodes' node 2892 * records may not reflect the current state of the diskset, 2893 * so calling metaget_setownership is the safest thing to do. 2894 */ 2895 if (metaget_setownership(sp, ep) == -1) { 2896 rval = -1; 2897 goto out2; 2898 } 2899 2900 /* 2901 * Verify that this node is joined 2902 * to diskset (i.e. is an owner of the diskset). 
2903 */ 2904 if (!(sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN)) { 2905 rval = -2; 2906 goto out2; 2907 } 2908 2909 /* 2910 * For a MN diskset, only withdraw master if it is 2911 * the only joined node. 2912 */ 2913 if (sd->sd_mn_master_nodeid == sd->sd_mn_mynode->nd_nodeid) { 2914 nd = sd->sd_nodelist; 2915 while (nd) { 2916 /* Skip my node since checking for other owners */ 2917 if (nd->nd_nodeid == sd->sd_mn_master_nodeid) { 2918 nd = nd->nd_next; 2919 continue; 2920 } 2921 /* If another owner node if found, error */ 2922 if (nd->nd_flags & MD_MN_NODE_OWN) { 2923 (void) mddserror(ep, MDE_DS_WITHDRAWMASTER, 2924 sp->setno, 2925 sd->sd_mn_mynode->nd_nodename, NULL, 2926 sp->setname); 2927 rval = -1; 2928 goto out2; 2929 } 2930 nd = nd->nd_next; 2931 } 2932 } 2933 2934 /* 2935 * Is current set STALE? 2936 */ 2937 (void) memset(&c, 0, sizeof (c)); 2938 c.c_id = 0; 2939 c.c_setno = sp->setno; 2940 if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) { 2941 (void) mdstealerror(ep, &c.c_mde); 2942 rval = -1; 2943 goto out; 2944 } 2945 if (c.c_flags & MDDB_C_STALE) { 2946 stale_bool = TRUE; 2947 } 2948 2949 /* 2950 * Notify rpc.mdcommd on all nodes of a nodelist change. 2951 * Start by suspending rpc.mdcommd (which drains it of all messages), 2952 * then change the nodelist followed by a reinit and resume. 2953 */ 2954 nd = sd->sd_nodelist; 2955 while (nd) { 2956 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 2957 nd = nd->nd_next; 2958 continue; 2959 } 2960 2961 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND, 2962 sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, ep)) { 2963 rval = -1; 2964 goto out; 2965 } 2966 suspendall_flag = 1; 2967 nd = nd->nd_next; 2968 } 2969 2970 /* 2971 * Withdraw the set - halt set. 2972 * This will fail if any I/O is occuring to any metadevice which 2973 * includes a resync to a mirror metadevice. 2974 */ 2975 set_halted = 1; 2976 if (halt_set(sp, ep)) { 2977 /* Was set actually halted? 
*/ 2978 if (own_set(sp, NULL, TRUE, ep) == MD_SETOWNER_YES) { 2979 set_halted = 0; 2980 } 2981 rval = -1; 2982 goto out; 2983 } 2984 2985 /* Change to nodelist so need to send reinit to rpc.mdcommd */ 2986 send_reinit = 1; 2987 2988 /* Reset master on withdrawn node */ 2989 if (clnt_mnsetmaster(sd->sd_mn_mynode->nd_nodename, sp, "", 2990 MD_MN_INVALID_NID, ep)) { 2991 rval = -1; 2992 goto out; 2993 } 2994 2995 /* Mark my node as withdrawn and send to other nodes */ 2996 nd = sd->sd_nodelist; 2997 my_nd = *(sd->sd_mn_mynode); /* structure copy */ 2998 my_nd.nd_next = NULL; 2999 while (nd) { 3000 if (!(nd->nd_flags & MD_MN_NODE_OWN)) { 3001 nd = nd->nd_next; 3002 continue; 3003 } 3004 if (clnt_upd_nr_flags(nd->nd_nodename, sp, &my_nd, 3005 MD_NR_WITHDRAW, NULL, ep)) { 3006 rval = -1; 3007 goto out; 3008 } 3009 nd = nd->nd_next; 3010 } 3011 3012 /* 3013 * If withdrawn node is a mirror owner, reset mirror owner 3014 * to NULL. If an error occurs, print a warning and continue. 3015 * Don't fail metaset because of mirror owner reset problem since 3016 * next node to grab mirror will resolve this issue. 3017 * Before next node grabs mirrors, metaset will show the withdrawn 3018 * node as owner which is why an attempt to reset the mirror owner 3019 * is made. 
3020 */ 3021 node_id_list[0] = sd->sd_mn_mynode->nd_nodeid; /* Setup my nodeid */ 3022 nd = sd->sd_nodelist; 3023 while (nd) { 3024 if (!(nd->nd_flags & MD_MN_NODE_OWN)) { 3025 nd = nd->nd_next; 3026 continue; 3027 } 3028 if (clnt_reset_mirror_owner(nd->nd_nodename, sp, 3029 1, &node_id_list[0], &xep) == 01) { 3030 mde_perror(&xep, dgettext(TEXT_DOMAIN, 3031 "Unable to reset mirror owner on node %s"), 3032 nd->nd_nodename); 3033 mdclrerror(&xep); 3034 } 3035 nd = nd->nd_next; 3036 } 3037 3038 out: 3039 if (rval == -1) { 3040 /* Rejoin node - Mark node as joined and send to other nodes */ 3041 nd = sd->sd_nodelist; 3042 my_nd = *(sd->sd_mn_mynode); /* structure copy */ 3043 my_nd.nd_next = NULL; 3044 while (nd) { 3045 if (!(nd->nd_flags & MD_MN_NODE_OWN)) { 3046 nd = nd->nd_next; 3047 continue; 3048 } 3049 if (clnt_upd_nr_flags(nd->nd_nodename, sp, &my_nd, 3050 MD_NR_JOIN, NULL, &xep)) { 3051 mdclrerror(&xep); 3052 } 3053 nd = nd->nd_next; 3054 } 3055 3056 /* Set master on withdrawn node */ 3057 if (clnt_mnsetmaster(sd->sd_mn_mynode->nd_nodename, sp, 3058 sd->sd_mn_master_nodenm, 3059 sd->sd_mn_master_nodeid, &xep)) { 3060 mdclrerror(&xep); 3061 } 3062 3063 /* Join set if halt_set had succeeded */ 3064 if (set_halted) { 3065 /* 3066 * Causes mddbs to be loaded into the kernel. 3067 * Set the force flag so that replica locations can be 3068 * loaded into the kernel even if a mediator node was 3069 * unavailable. This allows a node to join an MO 3070 * diskset when there are sufficient replicas available, 3071 * but a mediator node in unavailable. 3072 */ 3073 if (setup_db_bydd(sp, dd, TRUE, &xep) == -1) { 3074 mdclrerror(&xep); 3075 } 3076 /* If set previously stale - make it so at re-join */ 3077 if (snarf_set(sp, stale_bool, &xep) != 0) { 3078 mdclrerror(&xep); 3079 (void) halt_set(sp, &xep); 3080 mdclrerror(&xep); 3081 } 3082 } 3083 } 3084 3085 /* 3086 * Notify rpc.mdcommd on all nodes of a nodelist change. 
3087 * Send reinit command to mdcommd which forces it to get 3088 * fresh set description. 3089 */ 3090 if (send_reinit) { 3091 /* Send reinit */ 3092 nd = sd->sd_nodelist; 3093 while (nd) { 3094 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 3095 nd = nd->nd_next; 3096 continue; 3097 } 3098 3099 /* Class is ignored for REINIT */ 3100 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT, 3101 sp, NULL, MD_MSCF_NO_FLAGS, &xep)) { 3102 /* 3103 * We are here because we failed to resume 3104 * rpc.mdcommd. However we potentially have 3105 * an error from the previous call. 3106 * If the previous call did fail, we 3107 * capture that error and generate a perror 3108 * withthe string, "Unable to resume...". 3109 * Setting rval to -1 ensures that in the 3110 * next iteration of the loop, ep is not 3111 * clobbered. 3112 */ 3113 if (rval == 0) 3114 (void) mdstealerror(ep, &xep); 3115 else 3116 mdclrerror(&xep); 3117 rval = -1; 3118 mde_perror(ep, dgettext(TEXT_DOMAIN, 3119 "Unable to reinit rpc.mdcommd.")); 3120 } 3121 nd = nd->nd_next; 3122 } 3123 } 3124 3125 out2: 3126 /* 3127 * Unlock diskset by resuming messages across the diskset. 3128 * Just resume all classes so that resume is the same whether 3129 * just one class was locked or all classes were locked. 3130 */ 3131 if ((suspend1_flag) || (suspendall_flag)) { 3132 nd = sd->sd_nodelist; 3133 while (nd) { 3134 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 3135 nd = nd->nd_next; 3136 continue; 3137 } 3138 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME, 3139 sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) { 3140 /* 3141 * We are here because we failed to resume 3142 * rpc.mdcommd. However we potentially have 3143 * an error from the previous call 3144 * If the previous call did fail, we capture 3145 * that error and generate a perror with 3146 * the string, "Unable to resume...". 3147 * Setting rval to -1 ensures that in the 3148 * next iteration of the loop, ep is not 3149 * clobbered. 
3150 */ 3151 if (rval == 0) 3152 (void) mdstealerror(ep, &xep); 3153 else 3154 mdclrerror(&xep); 3155 rval = -1; 3156 mde_perror(ep, dgettext(TEXT_DOMAIN, 3157 "Unable to resume rpc.mdcommd.")); 3158 } 3159 nd = nd->nd_next; 3160 } 3161 meta_ping_mnset(sp->setno); 3162 } 3163 3164 /* 3165 * Unlock set. This flushes the caches on the servers. 3166 */ 3167 cl_sk = cl_get_setkey(sp->setno, sp->setname); 3168 nd = sd->sd_nodelist; 3169 while (nd) { 3170 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 3171 nd = nd->nd_next; 3172 continue; 3173 } 3174 if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) { 3175 if (rval == 0) 3176 (void) mdstealerror(ep, &xep); 3177 else 3178 mdclrerror(&xep); 3179 rval = -1; 3180 } 3181 nd = nd->nd_next; 3182 } 3183 3184 /* 3185 * call metaflushsetnames to reset local cache for master and 3186 * node information. 3187 */ 3188 metaflushsetname(sp); 3189 3190 /* release signals back to what they were on entry */ 3191 if (procsigs(FALSE, &oldsigs, &xep) < 0) 3192 mdclrerror(&xep); 3193 3194 return (rval); 3195 3196 } 3197 3198 /* 3199 * Update nodelist with cluster member information. 3200 * A node not in the member list will be marked 3201 * as not ALIVE and not OWN. 3202 * A node in the member list will be marked ALIVE, but 3203 * the OWN bit will not be changed. 3204 * 3205 * If mynode isn't in the membership list, fail causing 3206 * another reconfig cycle to be started since a non-member 3207 * node shouldn't be taking part in the reconfig cycle. 3208 * 3209 * Return values: 3210 * 0 - No problem. 3211 * 1 - Any failure including RPC failure to my node. 3212 */ 3213 int 3214 meta_reconfig_update_nodelist( 3215 mdsetname_t *sp, 3216 mndiskset_membershiplist_t *nl, 3217 md_set_desc *sd, 3218 md_error_t *ep 3219 ) 3220 { 3221 mndiskset_membershiplist_t *nl2; 3222 md_mnnode_desc *nd; 3223 md_error_t xep = mdnullerror; 3224 int rval = 0; 3225 3226 /* 3227 * Walk through nodelist, checking to see if each 3228 * node is in the member list. 
3229 * If node is not a member, reset ALIVE and OWN node flag. 3230 * If node is a member, set ALIVE. 3231 * If mynode's OWN flag gets reset, then halt the diskset on this node. 3232 */ 3233 nd = sd->sd_nodelist; 3234 while (nd) { 3235 nl2 = nl; 3236 while (nl2) { 3237 /* If node is in member list, set ALIVE */ 3238 if (nl2->msl_node_id == nd->nd_nodeid) { 3239 nd->nd_flags |= MD_MN_NODE_ALIVE; 3240 break; 3241 } else { 3242 nl2 = nl2->next; 3243 } 3244 /* node is not in member list, mark !ALIVE and !OWN */ 3245 if (nl2 == NULL) { 3246 /* If node is mynode, then halt set if needed */ 3247 if (strcmp(mynode(), nd->nd_nodename) == 0) { 3248 /* 3249 * This shouldn't happen, but just 3250 * in case... Any node not in the 3251 * membership list should be dead and 3252 * not running reconfig step1. 3253 */ 3254 if (nd->nd_flags & MD_MN_NODE_OWN) { 3255 if (halt_set(sp, &xep)) { 3256 mde_perror(&xep, ""); 3257 mdclrerror(&xep); 3258 } 3259 } 3260 /* 3261 * Return failure since this node 3262 * (mynode) is not in the membership 3263 * list, but process the rest of the 3264 * nodelist first so that rpc.metad 3265 * can be updated with the latest 3266 * membership information. 3267 */ 3268 (void) mddserror(ep, 3269 MDE_DS_NOTINMEMBERLIST, 3270 sp->setno, nd->nd_nodename, NULL, 3271 sp->setname); 3272 rval = 1; 3273 } 3274 nd->nd_flags &= ~MD_MN_NODE_ALIVE; 3275 nd->nd_flags &= ~MD_MN_NODE_OWN; 3276 } 3277 } 3278 nd = nd->nd_next; 3279 } 3280 3281 /* Send this information to rpc.metad */ 3282 if (clnt_upd_nr_flags(mynode(), sp, sd->sd_nodelist, 3283 MD_NR_SET, MNSET_IN_RECONFIG, &xep)) { 3284 /* Return failure if can't send node flags to rpc.metad */ 3285 if (rval == 0) { 3286 (void) mdstealerror(ep, &xep); 3287 rval = 1; 3288 } 3289 } 3290 return (rval); 3291 } 3292 3293 /* 3294 * Choose master determines the master for a diskset. 
3295 * Each node determines the master on its own and 3296 * adds this information to its local rpc.metad nodelist 3297 * and also sends it to the kernel. 3298 * 3299 * Nodelist in set descriptor (sd) is sorted in 3300 * monotonically increasing sequence of nodeid. 3301 * 3302 * Return values: 3303 * 0 - No problem. 3304 * 205 - There was an RPC problem to another node. 3305 * -1 - There was an error. This could be an RPC error to my node. 3306 * This is a catastrophic failure causing node to panic. 3307 */ 3308 int 3309 meta_reconfig_choose_master_for_set( 3310 mdsetname_t *sp, 3311 md_set_desc *sd, 3312 md_error_t *ep 3313 ) 3314 { 3315 int is_owner; 3316 md_mnset_record *mnsr = NULL; 3317 int lowest_alive_nodeid = 0; 3318 uint_t master_nodeid; 3319 md_mnnode_desc *nd, *nd2; 3320 md_mnnode_record *nr; 3321 md_drive_desc *dd; 3322 md_setkey_t *cl_sk; 3323 int rval = 0; 3324 md_error_t xep = mdnullerror; 3325 mddb_setflags_config_t sf; 3326 3327 /* 3328 * Is current node joined to diskset? 3329 * Don't trust flags, really check to see if mddb is snarfed. 3330 */ 3331 if (s_ownset(sp->setno, ep) == MD_SETOWNER_YES) { 3332 /* 3333 * If a node is joined to the diskset, this node checks 3334 * to see if the current master of the diskset is valid and 3335 * is still in the membership list (ALIVE) and is 3336 * still joined (OWN). Need to verify if master is 3337 * really joined - don't trust the flags. (Can trust 3338 * ALIVE since set during earlier part of reconfig cycle.) 3339 * If the current master is valid, still in the membership 3340 * list and joined, then master is not changed on this node. 3341 * Just return. 3342 * 3343 * Verify that nodeid is valid before accessing masternode. 
3344 */ 3345 if ((sd->sd_mn_master_nodeid != MD_MN_INVALID_NID) && 3346 (sd->sd_mn_masternode->nd_flags & MD_MN_NODE_ALIVE)) { 3347 if (clnt_ownset(sd->sd_mn_master_nodenm, sp, 3348 &is_owner, ep) == -1) { 3349 /* If RPC failure to another node return 205 */ 3350 if ((mdanyrpcerror(ep)) && 3351 (sd->sd_mn_mynode->nd_nodeid != 3352 sd->sd_mn_master_nodeid)) { 3353 return (205); 3354 } else { 3355 /* Any other failure */ 3356 return (-1); 3357 } 3358 } else { 3359 if (is_owner == TRUE) { 3360 3361 meta_mc_log(MC_LOG5, dgettext( 3362 TEXT_DOMAIN, "Set %s previous " 3363 "master chosen %s (%d): %s"), 3364 sp->setname, 3365 sd->sd_mn_master_nodenm, 3366 sd->sd_mn_master_nodeid, 3367 meta_print_hrtime(gethrtime() - 3368 start_time)); 3369 3370 /* Previous master is ok - done */ 3371 return (0); 3372 } 3373 } 3374 } 3375 3376 /* 3377 * If current master is no longer in the membership list or 3378 * is no longer joined, then this node uses the following 3379 * algorithm: 3380 * - node calls RPC routine clnt_ownset to get latest 3381 * information on which nodes are owners of diskset. 3382 * clnt_ownset checks on each node to see if its kernel 3383 * has that diskset snarfed. 3384 */ 3385 nd = sd->sd_nodelist; 3386 while (nd) { 3387 /* Don't consider node that isn't in member list */ 3388 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 3389 nd = nd->nd_next; 3390 continue; 3391 } 3392 3393 if (clnt_ownset(nd->nd_nodename, sp, 3394 &is_owner, ep) == -1) { 3395 /* If RPC failure to another node return 205 */ 3396 if ((mdanyrpcerror(ep)) && 3397 (sd->sd_mn_mynode->nd_nodeid != 3398 nd->nd_nodeid)) { 3399 return (205); 3400 } else { 3401 /* Any other failure */ 3402 return (-1); 3403 } 3404 } 3405 3406 /* 3407 * Set owner flag for each node based on whether 3408 * that node really has a diskset mddb snarfed in 3409 * or not. 
3410 */ 3411 if (is_owner == TRUE) 3412 nd->nd_flags |= MD_MN_NODE_OWN; 3413 else 3414 nd->nd_flags &= ~MD_MN_NODE_OWN; 3415 3416 nd = nd->nd_next; 3417 } 3418 3419 /* 3420 * - node walks through nodelist looking for nodes that are 3421 * owners of the diskset that are in the membership list. 3422 * - for each owner, node calls RPC routine clnt_getset to 3423 * see if that node has its node record set to OK. 3424 * - If so, master is chosen to be this owner node. 3425 */ 3426 nd = sd->sd_nodelist; 3427 while (nd) { 3428 /* Don't consider node that isn't in member list */ 3429 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 3430 nd = nd->nd_next; 3431 continue; 3432 } 3433 3434 /* Don't consider a node that isn't an owner */ 3435 if (!(nd->nd_flags & MD_MN_NODE_OWN)) { 3436 nd = nd->nd_next; 3437 continue; 3438 } 3439 3440 /* Does node has its own node record set to OK? */ 3441 if (clnt_mngetset(nd->nd_nodename, sp->setname, 3442 MD_SET_BAD, &mnsr, ep) == -1) { 3443 /* If RPC failure to another node return 205 */ 3444 if ((mdanyrpcerror(ep)) && 3445 (sd->sd_mn_mynode->nd_nodeid != 3446 nd->nd_nodeid)) { 3447 return (205); 3448 } else { 3449 /* Any other failure */ 3450 return (-1); 3451 } 3452 } 3453 nr = mnsr->sr_nodechain; 3454 while (nr) { 3455 if (nd->nd_nodeid == nr->nr_nodeid) { 3456 if (nr->nr_flags & MD_MN_NODE_OK) { 3457 /* Found a master */ 3458 free_sr( 3459 (md_set_record *)mnsr); 3460 goto found_master; 3461 } 3462 } 3463 nr = nr->nr_next; 3464 } 3465 free_sr((md_set_record *)mnsr); 3466 nd = nd->nd_next; 3467 } 3468 3469 /* 3470 * - If no owner node has its own node record on its own node 3471 * set to OK, then this node checks all of the non-owner 3472 * nodes that are in the membership list. 3473 * - for each non-owner, node calls RPC routine clnt_getset to 3474 * see if that node has its node record set to OK. 3475 * - If set doesn't exist, don't choose node for master. 3476 * - If so, master is chosen to be this non-owner node. 
3477 * 3478 */ 3479 nd = sd->sd_nodelist; 3480 while (nd) { 3481 /* Don't consider node that isn't in member list */ 3482 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 3483 nd = nd->nd_next; 3484 continue; 3485 } 3486 3487 /* Only checking non-owner nodes this time around */ 3488 if (nd->nd_flags & MD_MN_NODE_OWN) { 3489 nd = nd->nd_next; 3490 continue; 3491 } 3492 3493 /* Does node has its own node record set to OK? */ 3494 if (clnt_mngetset(nd->nd_nodename, sp->setname, 3495 MD_SET_BAD, &mnsr, ep) == -1) { 3496 /* 3497 * If set doesn't exist on non-owner node, 3498 * don't consider this node for master. 3499 */ 3500 if (mdiserror(ep, MDE_NO_SET)) { 3501 nd = nd->nd_next; 3502 continue; 3503 } else if ((mdanyrpcerror(ep)) && 3504 (sd->sd_mn_mynode->nd_nodeid != 3505 nd->nd_nodeid)) { 3506 /* RPC failure to another node */ 3507 return (205); 3508 } else { 3509 /* Any other failure */ 3510 return (-1); 3511 } 3512 } 3513 nr = mnsr->sr_nodechain; 3514 while (nr) { 3515 if (nd->nd_nodeid == nr->nr_nodeid) { 3516 if (nr->nr_flags & MD_MN_NODE_OK) { 3517 /* Found a master */ 3518 free_sr( 3519 (md_set_record *)mnsr); 3520 goto found_master; 3521 } 3522 } 3523 nr = nr->nr_next; 3524 } 3525 free_sr((md_set_record *)mnsr); 3526 nd = nd->nd_next; 3527 } 3528 3529 /* 3530 * - If no node can be found that has its own node record on 3531 * its node to be set to OK, then all alive nodes 3532 * were in the process of being added to or deleted 3533 * from set. Each alive node will remove all 3534 * information pertaining to this set from its node. 3535 * 3536 * If all nodes in set are ALIVE, then call sdssc end routines 3537 * since set was truly being initially created or destroyed. 3538 */ 3539 goto delete_set; 3540 } else { 3541 3542 /* 3543 * If node is not joined to diskset, then this 3544 * node uses the following algorithm: 3545 * - If unjoined node doesn't have a node record for itself, 3546 * just delete the diskset since diskset was in the 3547 * process of being created. 
3548 * - node needs to find master of diskset before 3549 * reconfig cycle, if a master existed. 3550 * - node calls RPC routine clnt_ownset to get latest 3551 * information on which nodes are owners of diskset. 3552 * clnt_ownset checks on each node to see if its 3553 * kernel has that diskset snarfed. 3554 */ 3555 3556 /* 3557 * Is my node in the set description? 3558 * If not, delete the set from this node. 3559 * sr2setdesc sets sd_mn_mynode pointer to the node 3560 * descriptor for this node if there was a node 3561 * record for this node. 3562 * 3563 */ 3564 if (sd->sd_mn_mynode == NULL) { 3565 goto delete_set; 3566 } 3567 3568 nd = sd->sd_nodelist; 3569 while (nd) { 3570 /* Don't consider node that isn't in member list */ 3571 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 3572 nd = nd->nd_next; 3573 continue; 3574 } 3575 3576 if (clnt_ownset(nd->nd_nodename, sp, 3577 &is_owner, ep) == -1) { 3578 /* If RPC failure to another node return 205 */ 3579 if ((mdanyrpcerror(ep)) && 3580 (sd->sd_mn_mynode->nd_nodeid != 3581 nd->nd_nodeid)) { 3582 return (205); 3583 } else { 3584 /* Any other failure */ 3585 return (-1); 3586 } 3587 } 3588 3589 /* 3590 * Set owner flag for each node based on whether 3591 * that node really has a diskset mddb snarfed in 3592 * or not. 3593 */ 3594 if (is_owner == TRUE) 3595 nd->nd_flags |= MD_MN_NODE_OWN; 3596 else 3597 nd->nd_flags &= ~MD_MN_NODE_OWN; 3598 3599 nd = nd->nd_next; 3600 } 3601 3602 /* 3603 * - node walks through nodelist looking for nodes that 3604 * are owners of the diskset that are in 3605 * the membership list. 3606 * - for each owner, node calls RPC routine clnt_getset to 3607 * see if that node has a master set and to get the 3608 * diskset description. 3609 * - If the owner node has a set description that doesn't 3610 * include the non-joined node in the nodelist, this node 3611 * removes its set description of that diskset 3612 * (i.e. removes the set from its local mddbs). 
This is 3613 * handling the case of when a node was removed from a 3614 * diskset while it was not in the cluster membership 3615 * list. 3616 * - If that node has a master set and the master is in the 3617 * membership list and is an owner, then either this was 3618 * the master from before the reconfig cycle or this 3619 * node has already chosen a new master - either way, 3620 * the master value is valid as long as it is in the 3621 * membership list and is an owner 3622 * - master is chosen to be owner node's master 3623 */ 3624 nd = sd->sd_nodelist; 3625 while (nd) { 3626 /* Don't consider node that isn't in member list */ 3627 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 3628 nd = nd->nd_next; 3629 continue; 3630 } 3631 3632 /* Don't consider a node that isn't an owner */ 3633 if (!(nd->nd_flags & MD_MN_NODE_OWN)) { 3634 nd = nd->nd_next; 3635 continue; 3636 } 3637 3638 /* Get owner node's set record */ 3639 if (clnt_mngetset(nd->nd_nodename, sp->setname, 3640 MD_SET_BAD, &mnsr, ep) == -1) { 3641 /* If RPC failure to another node return 205 */ 3642 if ((mdanyrpcerror(ep)) && 3643 (sd->sd_mn_mynode->nd_nodeid != 3644 nd->nd_nodeid)) { 3645 return (205); 3646 } else { 3647 /* Any other failure */ 3648 return (-1); 3649 } 3650 } 3651 3652 /* Is this node in the owner node's set record */ 3653 nr = mnsr->sr_nodechain; 3654 while (nr) { 3655 if (sd->sd_mn_mynode->nd_nodeid == 3656 nr->nr_nodeid) { 3657 break; 3658 } 3659 nr = nr->nr_next; 3660 } 3661 if (nr == NULL) { 3662 /* my node not found - delete set */ 3663 free_sr((md_set_record *)mnsr); 3664 goto delete_set; 3665 } 3666 3667 /* Is owner's node's master valid? 
*/ 3668 master_nodeid = mnsr->sr_master_nodeid; 3669 free_sr((md_set_record *)mnsr); 3670 if (master_nodeid == MD_MN_INVALID_NID) { 3671 nd = nd->nd_next; 3672 continue; 3673 } 3674 3675 nd2 = sd->sd_nodelist; 3676 while (nd2) { 3677 if ((nd2->nd_nodeid == master_nodeid) && 3678 (nd2->nd_flags & MD_MN_NODE_ALIVE) && 3679 (nd2->nd_flags & MD_MN_NODE_OWN)) { 3680 nd = nd2; 3681 goto found_master; 3682 } 3683 nd2 = nd2->nd_next; 3684 } 3685 nd = nd->nd_next; 3686 } 3687 3688 /* 3689 * - If no owner node has a valid master, then follow 3690 * algorithm of when a node is joined to the diskset. 3691 * - node walks through nodelist looking for nodes that are 3692 * owners of the diskset that are in the membership list. 3693 * - for each owner, node calls RPC routine clnt_getset to 3694 * see if that node has its node record set to OK. 3695 * - If so, master is chosen to be this owner node. 3696 */ 3697 nd = sd->sd_nodelist; 3698 while (nd) { 3699 /* Don't consider node that isn't in member list */ 3700 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 3701 nd = nd->nd_next; 3702 continue; 3703 } 3704 3705 /* Don't consider a node that isn't an owner */ 3706 if (!(nd->nd_flags & MD_MN_NODE_OWN)) { 3707 nd = nd->nd_next; 3708 continue; 3709 } 3710 3711 /* Does node has its own node record set to OK? 
*/ 3712 if (clnt_mngetset(nd->nd_nodename, sp->setname, 3713 MD_SET_BAD, &mnsr, ep) == -1) { 3714 /* If RPC failure to another node return 205 */ 3715 if ((mdanyrpcerror(ep)) && 3716 (sd->sd_mn_mynode->nd_nodeid != 3717 nd->nd_nodeid)) { 3718 return (205); 3719 } else { 3720 /* Any other failure */ 3721 return (-1); 3722 } 3723 } 3724 nr = mnsr->sr_nodechain; 3725 while (nr) { 3726 if (nd->nd_nodeid == nr->nr_nodeid) { 3727 if (nr->nr_flags & MD_MN_NODE_OK) { 3728 /* Found a master */ 3729 free_sr( 3730 (md_set_record *)mnsr); 3731 goto found_master; 3732 } 3733 } 3734 nr = nr->nr_next; 3735 } 3736 free_sr((md_set_record *)mnsr); 3737 nd = nd->nd_next; 3738 } 3739 3740 /* 3741 * - If no owner node has its own node record on its own node 3742 * set to OK, then this node checks all of the non-owner 3743 * nodes that are in the membership list. 3744 * - for each non-owner, node calls RPC routine clnt_getset to 3745 * see if that node has its node record set to OK. 3746 * - If set doesn't exist, don't choose node for master. 3747 * - If this node doesn't exist in the nodelist on any of the 3748 * non-owner nodes, this node removes its set description 3749 * of that diskset (i.e. removes the set from its local 3750 * mddbs). This is handling the case of when a node was 3751 * removed from a diskset while it was not in the 3752 * cluster membership list. 3753 * - If non-owner node has its node record set to OK and if 3754 * this node hasn't removed this diskset (step directly 3755 * before this one), then the master is chosen to be this 3756 * non-owner node. 
3757 */ 3758 nd = sd->sd_nodelist; 3759 while (nd) { 3760 /* Don't consider node that isn't in member list */ 3761 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 3762 nd->nd_flags |= MD_MN_NODE_DEL; 3763 nd = nd->nd_next; 3764 continue; 3765 } 3766 3767 /* Don't consider owner nodes since none are OK */ 3768 if (nd->nd_flags & MD_MN_NODE_OWN) { 3769 nd->nd_flags |= MD_MN_NODE_DEL; 3770 nd = nd->nd_next; 3771 continue; 3772 } 3773 3774 /* 3775 * Don't need to get nodelist from my node since 3776 * this is where sd_nodelist was obtained. 3777 */ 3778 if (sd->sd_mn_mynode->nd_nodeid == nd->nd_nodeid) { 3779 nd = nd->nd_next; 3780 continue; 3781 } 3782 3783 /* 3784 * If node has already been decided against for 3785 * master, then skip it. 3786 */ 3787 if (nd->nd_flags & MD_MN_NODE_DEL) { 3788 nd = nd->nd_next; 3789 continue; 3790 } 3791 3792 /* 3793 * Does node in my nodelist have its own node 3794 * record marked OK on its node? And does node 3795 * in my nodelist exist on all other nodes? 3796 * Don't want to choose a node for master unless 3797 * that node is marked OK on its own node and that 3798 * node exists on all other alive nodes. 3799 * 3800 * This is guarding against the case when several 3801 * nodes are down and one of the downed nodes is 3802 * deleted from the diskset. When the down nodes 3803 * are rebooted into the cluster, you don't want 3804 * any node to pick the deleted node as the master. 3805 */ 3806 if (clnt_mngetset(nd->nd_nodename, sp->setname, 3807 MD_SET_BAD, &mnsr, ep) == -1) { 3808 /* 3809 * If set doesn't exist on non-owner node, 3810 * don't consider this node for master. 3811 */ 3812 if (mdiserror(ep, MDE_NO_SET)) { 3813 nd->nd_flags |= MD_MN_NODE_DEL; 3814 nd = nd->nd_next; 3815 continue; 3816 } else if (mdanyrpcerror(ep)) { 3817 /* RPC failure to another node */ 3818 return (205); 3819 } else { 3820 /* Any other failure */ 3821 return (-1); 3822 } 3823 } 3824 /* 3825 * Is my node in the nodelist gotten from the other 3826 * node? 
If not, then remove the set from my node 3827 * since set was deleted from my node while my node 3828 * was out of the cluster. 3829 */ 3830 nr = mnsr->sr_nodechain; 3831 while (nr) { 3832 if (sd->sd_mn_mynode->nd_nodeid == 3833 nr->nr_nodeid) { 3834 break; 3835 } 3836 nr = nr->nr_next; 3837 } 3838 if (nr == NULL) { 3839 /* my node not found - delete set */ 3840 free_sr((md_set_record *)mnsr); 3841 goto delete_set; 3842 } 3843 3844 /* Is node being checked marked OK on its own node? */ 3845 nr = mnsr->sr_nodechain; 3846 while (nr) { 3847 if (nd->nd_nodeid == nr->nr_nodeid) { 3848 if (!(nr->nr_flags & MD_MN_NODE_OK)) { 3849 nd->nd_flags |= MD_MN_NODE_DEL; 3850 } 3851 break; 3852 } 3853 nr = nr->nr_next; 3854 } 3855 /* 3856 * If node being checked doesn't exist on its 3857 * own node - don't choose it as master. 3858 */ 3859 if (nr == NULL) { 3860 nd->nd_flags |= MD_MN_NODE_DEL; 3861 } 3862 3863 /* 3864 * Check every node in my node's nodelist against 3865 * the nodelist gotten from the other node. 3866 * If a node in my node's nodelist is not found in the 3867 * other node's nodelist, then set the DEL flag. 3868 */ 3869 nd2 = sd->sd_nodelist; 3870 while (nd2) { 3871 nr = mnsr->sr_nodechain; 3872 while (nr) { 3873 if (nd2->nd_nodeid == nr->nr_nodeid) { 3874 break; 3875 } 3876 nr = nr->nr_next; 3877 } 3878 /* nd2 not found in other node's nodelist */ 3879 if (nr == NULL) { 3880 nd2->nd_flags |= MD_MN_NODE_DEL; 3881 } 3882 nd2 = nd2->nd_next; 3883 } 3884 3885 free_sr((md_set_record *)mnsr); 3886 nd = nd->nd_next; 3887 } 3888 3889 /* 3890 * Rescan list look for node that has not been marked DEL. 3891 * First node found is the master. 
3892 */ 3893 nd = sd->sd_nodelist; 3894 while (nd) { 3895 if (!(nd->nd_flags & MD_MN_NODE_DEL)) { 3896 break; 3897 } 3898 nd = nd->nd_next; 3899 continue; 3900 } 3901 if (nd) { 3902 /* Found a master */ 3903 goto found_master; 3904 } 3905 3906 /* 3907 * - If no node can be found that has its own node record on 3908 * its node to be set to OK, then all alive nodes 3909 * were in the process of being added to or deleted 3910 * from set. Each alive node will remove all 3911 * information pertaining to this set from its node. 3912 * 3913 * If all nodes in set are ALIVE, then call sdssc end routines 3914 * since set was truly being initially created or destroyed. 3915 */ 3916 goto delete_set; 3917 } 3918 3919 found_master: 3920 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 3921 "Set %s master chosen %s (%d): %s"), 3922 sp->setname, nd->nd_nodename, nd->nd_nodeid, 3923 meta_print_hrtime(gethrtime() - start_time)); 3924 3925 if (clnt_lock_set(mynode(), sp, ep) == -1) { 3926 return (-1); 3927 } 3928 3929 cl_sk = cl_get_setkey(sp->setno, sp->setname); 3930 3931 if (clnt_mnsetmaster(mynode(), sp, 3932 nd->nd_nodename, nd->nd_nodeid, ep)) { 3933 rval = -1; 3934 } else if (sd->sd_mn_mynode->nd_nodeid == nd->nd_nodeid) { 3935 /* If this node is new master, set flag in this node's kernel */ 3936 (void) memset(&sf, 0, sizeof (sf)); 3937 sf.sf_setno = sp->setno; 3938 sf.sf_setflags = MD_SET_MN_NEWMAS_RC; 3939 /* Use magic to help protect ioctl against attack. */ 3940 sf.sf_magic = MDDB_SETFLAGS_MAGIC; 3941 sf.sf_flags = MDDB_NM_SET; 3942 3943 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 3944 "Setting new master flag for set %s: %s"), 3945 sp->setname, meta_print_hrtime(gethrtime() - start_time)); 3946 3947 /* 3948 * Fail reconfig cycle if ioctl fails since it is critical 3949 * to set new master flag. 
3950 */ 3951 if (metaioctl(MD_MN_SET_SETFLAGS, &sf, &sf.sf_mde, 3952 NULL) != NULL) { 3953 (void) mdstealerror(ep, &sf.sf_mde); 3954 rval = -1; 3955 } 3956 } 3957 3958 if (clnt_unlock_set(mynode(), cl_sk, &xep) == -1) { 3959 if (rval == 0) { 3960 (void) mdstealerror(ep, &xep); 3961 rval = -1; 3962 } 3963 } 3964 3965 cl_set_setkey(NULL); 3966 3967 metaflushsetname(sp); 3968 3969 return (rval); 3970 3971 delete_set: 3972 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 3973 "Master not chosen, deleting set %s: %s"), 3974 sp->setname, meta_print_hrtime(gethrtime() - start_time)); 3975 3976 /* 3977 * Remove all set information from this node: 3978 * - node records for this set 3979 * - drive records for this set 3980 * - set record for this set 3981 * (Only do this on this node since each node 3982 * will do it for its own local mddb.) 3983 * 3984 * If all nodes in set are ALIVE, then 3985 * the lowest numbered ALIVE nodeid in set 3986 * (irregardless of whether an owner node or not) will 3987 * call the DCS service to cleanup for create/delete of set. 3988 * sdssc_create_end(cleanup) if set was being created or 3989 * sdssc_delete_end(cleanup) if set was being deleted. 3990 * A node record with flag ADD denotes a set being 3991 * created. A node record with flag DEL denotes a 3992 * set being deleted. 3993 */ 3994 nd = sd->sd_nodelist; 3995 while (nd) { 3996 /* Found a node that isn't alive */ 3997 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) 3998 break; 3999 4000 /* Is my node the lowest numbered ALIVE node? */ 4001 if (nd->nd_nodeid < sd->sd_mn_mynode->nd_nodeid) { 4002 break; 4003 } 4004 nd = nd->nd_next; 4005 } 4006 if (nd == NULL) { 4007 /* All nodes ALIVE and this is the lowest nodeid */ 4008 lowest_alive_nodeid = 1; 4009 } 4010 4011 if (clnt_lock_set(mynode(), sp, ep) == -1) { 4012 return (-1); 4013 } 4014 4015 4016 /* 4017 * If this node had been joined, withdraw and reset master. 
4018 * 4019 * This could happen if a node was being added to or removed 4020 * from a diskset and the node doing the add/delete operation and 4021 * all other nodes in the diskset have left the cluster. 4022 */ 4023 if (sd->sd_mn_mynode) { 4024 nd = sd->sd_mn_mynode; 4025 if (nd->nd_flags & MD_MN_NODE_OWN) { 4026 if (clnt_withdrawset(mynode(), sp, ep)) { 4027 rval = -1; 4028 goto out; 4029 } 4030 if (clnt_mnsetmaster(mynode(), sp, "", 4031 MD_MN_INVALID_NID, ep)) { 4032 rval = -1; 4033 goto out; 4034 } 4035 } 4036 } 4037 4038 /* 4039 * Remove side records for this node (side) from local mddb 4040 * (clnt_deldrvs does this) if there are drives in the set. 4041 * 4042 * Don't need to mark this node as DEL since already marked as 4043 * ADD or DEL (or this node would have been chosen as master). 4044 * Don't need to mark other node records, drive records or 4045 * set records as DEL. If a panic occurs during clnt_delset, 4046 * these records will be deleted the next time this node 4047 * becomes a member and goes through the reconfig cycle. 4048 */ 4049 /* Get the drive descriptors for this set */ 4050 if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST), 4051 ep)) == NULL) { 4052 if (! mdisok(ep)) { 4053 /* 4054 * Ignore and clear out any failures from 4055 * metaget_drivedesc since a panic could have 4056 * occurred when a node was partially added to a set. 4057 */ 4058 mdclrerror(ep); 4059 } 4060 } else { 4061 if (clnt_deldrvs(mynode(), sp, dd, ep)) { 4062 rval = -1; 4063 goto out; 4064 } 4065 } 4066 4067 /* 4068 * Now, delete the set - this removes the node, drive 4069 * and set records from the local mddb. 4070 */ 4071 if (clnt_delset(mynode(), sp, ep)) { 4072 rval = -1; 4073 goto out; 4074 } 4075 4076 out: 4077 cl_sk = cl_get_setkey(sp->setno, sp->setname); 4078 4079 /* 4080 * Ignore errors from unlock of set since set is no longer 4081 * known (if clnt_delset worked). 
4082 */ 4083 if (clnt_unlock_set(mynode(), cl_sk, &xep) == -1) { 4084 mdclrerror(&xep); 4085 } 4086 4087 cl_set_setkey(NULL); 4088 4089 metaflushsetname(sp); 4090 4091 /* 4092 * If this node is the lowest numbered nodeid then 4093 * call sdssc_create/delete_end depending on whether 4094 * this node is marked as ADD or DEL in the node record. 4095 */ 4096 if (lowest_alive_nodeid) { 4097 if (nd->nd_flags & MD_MN_NODE_ADD) 4098 sdssc_create_end(sp->setname, SDSSC_CLEANUP); 4099 else if (nd->nd_flags & MD_MN_NODE_DEL) 4100 sdssc_delete_end(sp->setname, SDSSC_CLEANUP); 4101 } 4102 4103 /* Finished with this set -- return */ 4104 return (rval); 4105 } 4106 4107 /* 4108 * Reconfig step to choose a new master for all MN disksets. 4109 * Return values: 4110 * 0 - Everything is great. 4111 * 1 - This node failed to reconfig. 4112 * 205 - Cause another reconfig due to a nodelist problem 4113 * or RPC failure to another node 4114 */ 4115 int 4116 meta_reconfig_choose_master( 4117 md_error_t *ep 4118 ) 4119 { 4120 set_t max_sets, setno; 4121 int nodecnt; 4122 mndiskset_membershiplist_t *nl; 4123 md_set_desc *sd; 4124 mdsetname_t *sp; 4125 int rval = 0; 4126 mddb_setflags_config_t sf; 4127 int start_node_delayed = 0; 4128 4129 if ((max_sets = get_max_sets(ep)) == 0) { 4130 mde_perror(ep, dgettext(TEXT_DOMAIN, 4131 "Unable to get number of sets")); 4132 return (1); 4133 } 4134 4135 /* 4136 * Get membershiplist from API routine. If there's 4137 * an error, return a 205 to cause another reconfig. 4138 */ 4139 if (meta_read_nodelist(&nodecnt, &nl, ep) == -1) { 4140 mde_perror(ep, ""); 4141 return (205); 4142 } 4143 4144 for (setno = 1; setno < max_sets; setno++) { 4145 if ((sp = metasetnosetname(setno, ep)) == NULL) { 4146 if (mdiserror(ep, MDE_NO_SET)) { 4147 /* No set for this setno - continue */ 4148 mdclrerror(ep); 4149 continue; 4150 } else { 4151 /* 4152 * If encountered an RPC error from my node, 4153 * then immediately fail. 
4154 */ 4155 if (mdanyrpcerror(ep)) { 4156 mde_perror(ep, ""); 4157 return (1); 4158 } 4159 /* Can't get set information */ 4160 mde_perror(ep, dgettext(TEXT_DOMAIN, 4161 "Unable to get information for " 4162 "set number %d"), setno); 4163 mdclrerror(ep); 4164 continue; 4165 } 4166 } 4167 4168 /* If setname is there, set desc should exist. */ 4169 if ((sd = metaget_setdesc(sp, ep)) == NULL) { 4170 /* 4171 * If encountered an RPC error from my node, 4172 * then immediately fail. 4173 */ 4174 if (mdanyrpcerror(ep)) { 4175 mde_perror(ep, ""); 4176 return (1); 4177 } 4178 mde_perror(ep, dgettext(TEXT_DOMAIN, 4179 "Unable to get set %s desc information"), 4180 sp->setname); 4181 mdclrerror(ep); 4182 continue; 4183 } 4184 4185 /* Only reconfig MN disksets */ 4186 if (!MD_MNSET_DESC(sd)) { 4187 continue; 4188 } 4189 4190 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 4191 "Begin choose master for set %s: %s"), 4192 sp->setname, meta_print_hrtime(gethrtime() - start_time)); 4193 4194 /* Update nodelist with member information. */ 4195 if (meta_reconfig_update_nodelist(sp, nl, sd, ep)) { 4196 /* 4197 * If encountered an RPC error from my node, 4198 * then immediately fail. 4199 */ 4200 if (mdanyrpcerror(ep)) { 4201 mde_perror(ep, ""); 4202 return (1); 4203 } 4204 mde_perror(ep, ""); 4205 mdclrerror(ep); 4206 continue; 4207 } 4208 4209 /* 4210 * If all nodes in a cluster are starting, then 4211 * all nodes will attempt to contact all other nodes 4212 * to determine a master node. This can lead to a 4213 * problem where node 1 is trying to contact the rpc.metad 4214 * node 2 and node 2 is trying to contact the rpc.metad 4215 * on node 1 -- and this causes the rpc call to fail 4216 * on both nodes and causes a new reconfig cycle. 4217 * 4218 * In order to break this problem, a newly starting node 4219 * will delay a small amount of time (nodeid mod 4 seconds) 4220 * and will then run the code to choose a master for the 4221 * first set. 
Delay will only be done once regardless of the 4222 * number of sets. 4223 */ 4224 if (start_node_delayed == 0) { 4225 (void) memset(&sf, 0, sizeof (sf)); 4226 sf.sf_setno = sp->setno; 4227 sf.sf_flags = MDDB_NM_GET; 4228 /* Use magic to help protect ioctl against attack. */ 4229 sf.sf_magic = MDDB_SETFLAGS_MAGIC; 4230 if ((metaioctl(MD_MN_GET_SETFLAGS, &sf, 4231 &sf.sf_mde, NULL) == 0) && 4232 ((sf.sf_setflags & MD_SET_MN_START_RC) == 4233 MD_SET_MN_START_RC)) { 4234 (void) sleep(sd->sd_mn_mynode->nd_nodeid % 4); 4235 } 4236 start_node_delayed = 1; 4237 } 4238 4239 /* Choose master for this set */ 4240 rval = meta_reconfig_choose_master_for_set(sp, sd, ep); 4241 if (rval == -1) { 4242 mde_perror(ep, ""); 4243 return (1); 4244 } else if (rval == 205) { 4245 mde_perror(ep, ""); 4246 return (205); 4247 } 4248 4249 /* Send new nodelist to rpc.mdcommd */ 4250 (void) mdmn_reinit_set(sp->setno); 4251 4252 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 4253 "Choose master for set %s completed: %s"), 4254 sp->setname, meta_print_hrtime(gethrtime() - start_time)); 4255 } 4256 4257 /* 4258 * Each node turns on I/Os for all MN disksets. 4259 * This is to recover from the situation where the master died 4260 * during a previous reconfig cycle when I/Os were suspended 4261 * for a MN diskset. 4262 * If a failure occurs return a 1 which will force this node to 4263 * panic. Cannot leave node in the situation where I/Os are 4264 * not resumed. 4265 */ 4266 setno = 0; /* 0 means all MN sets */ 4267 if (metaioctl(MD_MN_RESUME_SET, &setno, ep, NULL)) { 4268 mde_perror(ep, ""); 4269 return (1); 4270 } 4271 4272 /* Free the nodelist */ 4273 if (nodecnt) 4274 meta_free_nodelist(nl); 4275 4276 return (0); 4277 } 4278 4279 /* 4280 * meta_mnsync_user_records will synchronize the diskset user records across 4281 * all nodes in the diskset. The diskset user records are stored in 4282 * each node's local set mddb. 
4283 * 4284 * This needs to be done even if there is no master change during the 4285 * reconfig cycle since this routine should clean up any mess left by 4286 * the untimely termination of a metaset or metadb command (due to a 4287 * node panic or to user intervention). 4288 * 4289 * Caller is the Master node. 4290 * 4291 * Returns 0 - Success 4292 * 205 - Failure during RPC to another node 4293 * -1 - Any other failure and ep is filled in. 4294 */ 4295 int 4296 meta_mnsync_user_records( 4297 mdsetname_t *sp, 4298 md_error_t *ep 4299 ) 4300 { 4301 md_set_desc *sd; 4302 md_mnnode_desc *master_nodelist, *nd, *nd2, *ndtail; 4303 md_mnset_record *mnsr; 4304 md_mnsr_node_t *master_mnsr_node = NULL, *mnsr_node = NULL; 4305 md_mnnode_record *nr; 4306 md_drive_record *dr; 4307 int dr_cnt, dd_cnt; 4308 int found_my_nr; 4309 md_drive_desc *dd, *dd_prev, *master_dd, *other_dd; 4310 int all_drives_ok; 4311 int rval = 0; 4312 int max_genid = 0; 4313 int num_alive_nodes, num_alive_nodes_del = 0; 4314 int set_locked = 0; 4315 md_setkey_t *cl_sk; 4316 md_error_t xep = mdnullerror; 4317 char *anode[1]; 4318 mddb_setflags_config_t sf; 4319 4320 /* 4321 * Sync up node records first. 4322 * Construct a master nodelist using the nodelist from this 4323 * node's rpc.metad node records and then setting the state of each 4324 * node following these rules: 4325 * - If a node record is marked OK on its node, mark it OK 4326 * in the master nodelist (and later OK on all nodes) 4327 * If a node record is also marked OWN on its node, 4328 * mark it OWN in the master nodelist. 
4329 * - If a node record is not marked OK on its node, then mark 4330 * it as DEL in the master list (later deleting it) 4331 * - If node record doesn't exist on that node, then mark it DEL 4332 * (later deleting it) 4333 * - If set record doesn't exist on that node, mark node as DEL 4334 * - If a node record doesn't exist on all nodes, then mark it DEL 4335 * - If a node is not ALIVE, then 4336 * - If that node marked DEL on any node - mark it DEL 4337 * in master list but leave in nodelist 4338 * - If that node is marked as ADD on any node, mark it 4339 * ADD in the master list but leave in nodelist 4340 * - When that node returns to the living, the DEL 4341 * node record will be removed and the ADD node 4342 * record may be removed if marked ADD on that 4343 * node. 4344 * The key rule is to not remove a node from the nodelist until 4345 * that node record is removed from its own node. Do not want to 4346 * remove a node's record from all other nodes and then have 4347 * that node have its own record marked OK so that a node will pick 4348 * a different master than the other nodes. 4349 * 4350 * Next, 4351 * If node is ALIVE and node record is marked DEL in master nodelist, 4352 * remove node from set. 4353 * If node is ALIVE and node record is marked OK in master nodelist, 4354 * mark it OK on all other nodes. 4355 * If node is not ALIVE and node record is marked DEL in master 4356 * nodelist, mark it DEL on all other nodes. 4357 * If node is not ALIVE and node record is marked ADD in master, 4358 * nodelist, mark it ADD on all other nodes. 4359 */ 4360 if ((sd = metaget_setdesc(sp, ep)) == NULL) { 4361 return (-1); 4362 } 4363 master_nodelist = sd->sd_nodelist; 4364 4365 /* 4366 * Walk through nodelist creating a master nodelist. 
4367 */ 4368 num_alive_nodes = 0; 4369 nd = master_nodelist; 4370 while (nd) { 4371 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 4372 nd = nd->nd_next; 4373 continue; 4374 } 4375 num_alive_nodes++; 4376 if (clnt_mngetset(nd->nd_nodename, sp->setname, 4377 MD_SET_BAD, &mnsr, ep) == -1) { 4378 if (mdiserror(ep, MDE_NO_SET)) { 4379 /* set doesn't exist, mark node as DEL */ 4380 nd->nd_flags &= ~MD_MN_NODE_OK; 4381 nd->nd_flags &= ~MD_MN_NODE_ADD; 4382 nd->nd_flags |= MD_MN_NODE_DEL; 4383 nd->nd_flags |= MD_MN_NODE_NOSET; 4384 nd = nd->nd_next; 4385 continue; 4386 } else { 4387 /* If RPC failure to another node return 205 */ 4388 if ((mdanyrpcerror(ep)) && 4389 (sd->sd_mn_mynode->nd_nodeid != 4390 nd->nd_nodeid)) { 4391 rval = 205; 4392 } else { 4393 /* Any other failure */ 4394 rval = -1; 4395 } 4396 goto out; 4397 } 4398 } 4399 /* Find biggest genid in records for this diskset */ 4400 if (mnsr->sr_genid > max_genid) 4401 max_genid = mnsr->sr_genid; 4402 4403 dr = mnsr->sr_drivechain; 4404 while (dr) { 4405 /* Find biggest genid in records for this diskset */ 4406 if (dr->dr_genid > max_genid) { 4407 max_genid = dr->dr_genid; 4408 } 4409 dr = dr->dr_next; 4410 } 4411 4412 found_my_nr = 0; 4413 nr = mnsr->sr_nodechain; 4414 /* nr is the list of node recs from nd_nodename node */ 4415 while (nr) { 4416 /* Find biggest genid in records for this diskset */ 4417 if (nr->nr_genid > max_genid) 4418 max_genid = nr->nr_genid; 4419 nd2 = master_nodelist; 4420 ndtail = NULL; 4421 /* For each node record, is it in master list? */ 4422 while (nd2) { 4423 if (nd2->nd_nodeid == nr->nr_nodeid) 4424 break; 4425 if (nd2->nd_next == NULL) 4426 ndtail = nd2; 4427 nd2 = nd2->nd_next; 4428 } 4429 /* 4430 * Found node record not in master list -- add it 4431 * to list marking it as DEL since node record 4432 * should exist on all nodes unless a panic occurred 4433 * during addition or deletion of host to diskset. 
4434 */ 4435 if (nd2 == NULL) { 4436 nd2 = Zalloc(sizeof (*nd2)); 4437 (void) strcpy(nd2->nd_nodename, 4438 nr->nr_nodename); 4439 nd2->nd_flags = nr->nr_flags; 4440 nd2->nd_flags |= MD_MN_NODE_DEL; 4441 nd2->nd_nodeid = nr->nr_nodeid; 4442 nd2->nd_next = NULL; 4443 ndtail->nd_next = nd2; 4444 nd2 = NULL; 4445 nr = nr->nr_next; 4446 continue; 4447 } 4448 /* 4449 * Is this the node record for the node that 4450 * we requested the set desc from? 4451 * If so, check if node has its own node record 4452 * marked OK. If marked OK, check for the OWN bit. 4453 */ 4454 if (nr->nr_nodeid == nd->nd_nodeid) { 4455 found_my_nr = 1; 4456 if (nr->nr_flags & MD_MN_NODE_OK) { 4457 /* 4458 * If node record is marked OK 4459 * on its own node, then mark it OK 4460 * in the master list. Node record 4461 * would have to exist on all nodes 4462 * in the ADD state before it could 4463 * be put into the OK state. 4464 */ 4465 nd->nd_flags |= MD_MN_NODE_OK; 4466 nd->nd_flags &= 4467 ~(MD_MN_NODE_ADD | MD_MN_NODE_DEL); 4468 /* 4469 * Mark own in master list as marked 4470 * on own node. 4471 */ 4472 if (nr->nr_flags & MD_MN_NODE_OWN) 4473 nd->nd_flags |= MD_MN_NODE_OWN; 4474 else 4475 nd->nd_flags &= ~MD_MN_NODE_OWN; 4476 } else { 4477 /* Otherwise, mark node as DEL */ 4478 nd->nd_flags &= ~MD_MN_NODE_OK; 4479 nd->nd_flags &= ~MD_MN_NODE_ADD; 4480 nd->nd_flags |= MD_MN_NODE_DEL; 4481 } 4482 } 4483 /* 4484 * If node is not ALIVE and marked DEL 4485 * on any node, make it DEL in master list. 4486 * If node is not ALIVE and marked ADD 4487 * on any node, make it ADD in master list 4488 * unless node record has already been marked DEL. 
4489 */ 4490 if (!(nr->nr_flags & MD_MN_NODE_ALIVE)) { 4491 if (nr->nr_flags & MD_MN_NODE_ADD) { 4492 if (!(nd->nd_flags & MD_MN_NODE_DEL)) { 4493 /* If not DEL - mark it ADD */ 4494 nd->nd_flags |= MD_MN_NODE_ADD; 4495 nd->nd_flags &= ~MD_MN_NODE_OK; 4496 } 4497 } 4498 if (nr->nr_flags & MD_MN_NODE_DEL) { 4499 nd->nd_flags |= MD_MN_NODE_DEL; 4500 nd->nd_flags &= ~MD_MN_NODE_OK; 4501 /* Could already be ADD - make it DEL */ 4502 nd->nd_flags &= ~MD_MN_NODE_ADD; 4503 } 4504 } 4505 nr = nr->nr_next; 4506 } 4507 /* 4508 * If a node record doesn't exist on its own node, 4509 * then mark node as DEL. 4510 */ 4511 if (found_my_nr == 0) { 4512 nd->nd_flags &= ~MD_MN_NODE_OK; 4513 nd->nd_flags |= MD_MN_NODE_DEL; 4514 } 4515 4516 /* 4517 * If node is OK - put mnsr onto master_mnsr_node list for 4518 * later use when syncing up the drive records in the set. 4519 */ 4520 if (nd->nd_flags & MD_MN_NODE_OK) { 4521 mnsr_node = Zalloc(sizeof (*mnsr_node)); 4522 mnsr_node->mmn_mnsr = mnsr; 4523 (void) strncpy(mnsr_node->mmn_nodename, 4524 nd->nd_nodename, MD_MAX_MNNODENAME_PLUS_1); 4525 mnsr_node->mmn_next = master_mnsr_node; 4526 master_mnsr_node = mnsr_node; 4527 } else { 4528 free_sr((struct md_set_record *)mnsr); 4529 } 4530 4531 nd = nd->nd_next; 4532 } 4533 4534 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 4535 "Master nodelist created for set %s: %s"), 4536 sp->setname, meta_print_hrtime(gethrtime() - start_time)); 4537 4538 /* 4539 * Send master nodelist to the rpc.metad on all nodes (including 4540 * myself) and each node will update itself. This will set the 4541 * ADD and DEL flags on each node as setup in the master nodelist. 4542 * Don't send nodelist to node where set doesn't exist. 
4543 */ 4544 nd = master_nodelist; 4545 while (nd) { 4546 if (!(nd->nd_flags & MD_MN_NODE_ALIVE) || 4547 (nd->nd_flags & MD_MN_NODE_NOSET)) { 4548 nd = nd->nd_next; 4549 continue; 4550 } 4551 if (clnt_upd_nr_flags(nd->nd_nodename, sp, 4552 master_nodelist, MD_NR_SET, MNSET_IN_RECONFIG, ep)) { 4553 /* If RPC failure to another node return 205 */ 4554 if ((mdanyrpcerror(ep)) && 4555 (sd->sd_mn_mynode->nd_nodeid != 4556 nd->nd_nodeid)) { 4557 rval = 205; 4558 } else { 4559 /* Any other failure */ 4560 rval = -1; 4561 } 4562 goto out; 4563 } 4564 nd = nd->nd_next; 4565 } 4566 4567 /* 4568 * Now, delete nodes that need to be deleted. 4569 */ 4570 if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST), 4571 ep)) == NULL) { 4572 if (! mdisok(ep)) { 4573 rval = -1; 4574 goto out; 4575 } 4576 } 4577 4578 /* 4579 * May be doing lots of RPC commands to the nodes, so lock the 4580 * ALIVE members of the set since most of the rpc.metad routines 4581 * require this for security reasons. 4582 */ 4583 nd = master_nodelist; 4584 while (nd) { 4585 /* Skip non-alive nodes and node without set */ 4586 if (!(nd->nd_flags & MD_MN_NODE_ALIVE) || 4587 (nd->nd_flags & MD_MN_NODE_NOSET)) { 4588 nd = nd->nd_next; 4589 continue; 4590 } 4591 if (clnt_lock_set(nd->nd_nodename, sp, ep)) { 4592 /* If RPC failure to another node return 205 */ 4593 if ((mdanyrpcerror(ep)) && 4594 (sd->sd_mn_mynode->nd_nodeid != 4595 nd->nd_nodeid)) { 4596 rval = 205; 4597 } else { 4598 /* Any other failure */ 4599 rval = -1; 4600 } 4601 goto out; 4602 } 4603 set_locked = 1; 4604 nd = nd->nd_next; 4605 } 4606 4607 nd = master_nodelist; 4608 while (nd) { 4609 /* Skip non-alive nodes */ 4610 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 4611 nd = nd->nd_next; 4612 continue; 4613 } 4614 if (nd->nd_flags & MD_MN_NODE_DEL) { 4615 num_alive_nodes_del++; 4616 /* 4617 * Delete this node rec from all ALIVE nodes in diskset. 
4618 */ 4619 nd2 = master_nodelist; 4620 while (nd2) { 4621 /* Skip non-alive nodes and node without set */ 4622 if (!(nd2->nd_flags & MD_MN_NODE_ALIVE) || 4623 (nd2->nd_flags & MD_MN_NODE_NOSET)) { 4624 nd2 = nd2->nd_next; 4625 continue; 4626 } 4627 4628 /* This is a node being deleted from set */ 4629 if (nd2->nd_nodeid == nd->nd_nodeid) { 4630 /* Mark set record as DEL */ 4631 if (clnt_upd_sr_flags(nd->nd_nodename, 4632 sp, MD_SR_DEL, ep)) { 4633 /* RPC failure to !my node */ 4634 if ((mdanyrpcerror(ep)) && 4635 (sd->sd_mn_mynode-> 4636 nd_nodeid 4637 != nd->nd_nodeid)) { 4638 rval = 205; 4639 } else { 4640 /* Any other failure */ 4641 rval = -1; 4642 } 4643 goto out; 4644 } 4645 if (clnt_deldrvs(nd->nd_nodename, sp, 4646 dd, ep)) { 4647 /* RPC failure to !my node */ 4648 if ((mdanyrpcerror(ep)) && 4649 (sd->sd_mn_mynode-> 4650 nd_nodeid 4651 != nd->nd_nodeid)) { 4652 rval = 205; 4653 } else { 4654 /* Any other failure */ 4655 rval = -1; 4656 } 4657 goto out; 4658 } 4659 if (clnt_delset(nd->nd_nodename, sp, 4660 ep) == -1) { 4661 /* RPC failure to !my node */ 4662 if ((mdanyrpcerror(ep)) && 4663 (sd->sd_mn_mynode-> 4664 nd_nodeid 4665 != nd->nd_nodeid)) { 4666 rval = 205; 4667 } else { 4668 /* Any other failure */ 4669 rval = -1; 4670 } 4671 goto out; 4672 } 4673 } else { 4674 /* 4675 * Delete host from sets on hosts 4676 * not being deleted. 
4677 */ 4678 anode[0] = Strdup(nd->nd_nodename); 4679 if (clnt_delhosts(nd2->nd_nodename, sp, 4680 1, anode, ep) == -1) { 4681 Free(anode[0]); 4682 /* RPC failure to !my node */ 4683 if ((mdanyrpcerror(ep)) && 4684 (sd->sd_mn_mynode-> 4685 nd_nodeid 4686 != nd2->nd_nodeid)) { 4687 rval = 205; 4688 } else { 4689 /* Any other failure */ 4690 rval = -1; 4691 } 4692 goto out; 4693 } 4694 4695 meta_mc_log(MC_LOG5, 4696 dgettext(TEXT_DOMAIN, 4697 "Deleted node %s (%d) on node %s " 4698 "from set %s: %s"), 4699 nd->nd_nodename, nd->nd_nodeid, 4700 nd2->nd_nodename, 4701 sp->setname, 4702 meta_print_hrtime( 4703 gethrtime() - start_time)); 4704 4705 Free(anode[0]); 4706 } 4707 nd2 = nd2->nd_next; 4708 } 4709 } 4710 nd = nd->nd_next; 4711 } 4712 4713 nd = master_nodelist; 4714 cl_sk = cl_get_setkey(sp->setno, sp->setname); 4715 while (nd) { 4716 /* Skip non-alive nodes and node without set */ 4717 if (!(nd->nd_flags & MD_MN_NODE_ALIVE) || 4718 (nd->nd_flags & MD_MN_NODE_NOSET)) { 4719 nd = nd->nd_next; 4720 continue; 4721 } 4722 if (clnt_unlock_set(nd->nd_nodename, cl_sk, ep)) { 4723 /* If RPC failure to another node return 205 */ 4724 if ((mdanyrpcerror(ep)) && 4725 (sd->sd_mn_mynode->nd_nodeid != 4726 nd->nd_nodeid)) { 4727 rval = 205; 4728 } else { 4729 /* Any other failure */ 4730 rval = -1; 4731 } 4732 goto out; 4733 } 4734 nd = nd->nd_next; 4735 } 4736 cl_set_setkey(NULL); 4737 set_locked = 0; 4738 4739 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 4740 "Nodelist syncronization complete for set %s: %s"), 4741 sp->setname, meta_print_hrtime(gethrtime() - start_time)); 4742 4743 metaflushsetname(sp); 4744 4745 /* 4746 * If all alive nodes have been deleted from set, just 4747 * return since nothing else can be done until non-alive 4748 * nodes (if there are any) rejoin the cluster. 4749 */ 4750 if (num_alive_nodes == num_alive_nodes_del) { 4751 rval = 0; 4752 goto out; 4753 } 4754 4755 /* 4756 * Sync up drive records. 
4757 * 4758 * If a node panic'd (or metaset command was killed) during the 4759 * addition or deletion of a drive to the diskset, the nodes 4760 * may have a different view of the drive list. During cleanup 4761 * of the drive list during reconfig, a drive will be deleted 4762 * from the list if the master node sees that the drive has been 4763 * marked in the ADD state on any node or is marked in the DEL state 4764 * on all nodes. 4765 * This cleanup must occur even if all nodes in the cluster are 4766 * not part of the cluster so that all nodes have the same view 4767 * of the drivelist. 4768 * Then if the entire cluster goes down and comes back up, the 4769 * new master node could be a node that wasn't in the cluster when 4770 * the node was deleted. This could lead to a situation where the 4771 * master node thinks that a drive is OK, but this drive isn't 4772 * known to the other nodes. 4773 * This situation can also occur during the addition of a drive 4774 * where a node has the drive marked OK, but the node executing the 4775 * metaset command encountered a failure before marking that drive OK 4776 * on the rest of the nodes. If the node with the OK drive then 4777 * panics, then the rest of the nodes will remove that drive marked ADD 4778 * and when the node with the OK drive rejoins the cluster, it will 4779 * have a drive marked OK that is unknown by the other nodes. 4780 * 4781 * There are 2 situations to consider: 4782 * A) Master knows about a drive that other nodes don't know about. 4783 * B) At least one slave node knows about a drive that the master 4784 * node doesn't know about. 4785 * 4786 * To handle these situations the following steps are followed: 4787 * 1) Count number of drives known by this master node and the 4788 * other slave nodes. 4789 * If all nodes have the same number of drives and the master has 4790 * all drives marked OK, then skip to step 4. 
4791 * 4792 * 2) If a node has fewer drives listed than the master, the master 4793 * must get the drive descriptor list from that node so that 4794 * master can determine which drive it needs to delete from that 4795 * node. Master must get the drive descriptor list since the 4796 * drive record list does not contain the name of the drive, but 4797 * only a key and the key can only be interpreted on that other 4798 * node. 4799 * 4800 * 3) The master will then create the master drive list by doing: 4801 * - Master starts with drive list known by master. 4802 * - Any drive marked ADD will be removed from the list. 4803 * - Any drive not known by another node (from step2) will be 4804 * removed from the drive list. 4805 * - If a drive is marked DEL on the master, the master must 4806 * verify that the drive record is marked DEL on all nodes. 4807 * If any node has the drive record marked OK, mark it OK 4808 * on the master. (The reason why is described below). 4809 * 4810 * 4) The master sends out the master drive list and the slave 4811 * nodes will force their drive lists to match the master 4812 * drive list by deleting drives, if necessary and by changing 4813 * the drive record states from ADD->OK if master has drive 4814 * marked OK and slave has drive marked ADD. 4815 * 4816 * Interesting scenarios: 4817 * 4818 * 1) System has 4 nodes with node 1 as the master. Node 3 starts 4819 * to delete a drive record (drive record on node 1 is marked DEL), 4820 * but is stopped when node 3 panics. Node 1 also panics. 4821 * During reconfig cycle, node 2 is picked as master and the drive 4822 * record is left alone since all nodes in the cluster have it 4823 * marked OK. User now sees drive as part of diskset. 4824 * Now, entire cluster is rebooted and node 1 rejoins the cluster. 4825 * Node 1 is picked as the master and node 1 has drive record 4826 * marked DEL. 
Node 1 contacts all other nodes in the cluster 4827 * and since at least one node has the drive record marked OK, 4828 * the master marks the drive record OK. 4829 * User continues to see the drive as part of the diskset. 4830 */ 4831 4832 /* Reget set descriptor since flushed above */ 4833 if ((sd = metaget_setdesc(sp, ep)) == NULL) { 4834 rval = -1; 4835 goto out; 4836 } 4837 4838 /* Has side effect of setting sd->sd_drvs to same as master_dd */ 4839 if ((master_dd = metaget_drivedesc_sideno(sp, 4840 sd->sd_mn_mynode->nd_nodeid, 4841 (MD_BASICNAME_OK | PRINT_FAST), ep)) == NULL) { 4842 /* No drives in list */ 4843 if (!mdisok(ep)) { 4844 /* 4845 * Can't get drive list for this node, so 4846 * return -1 causing this node to be removed 4847 * cluster config and fixed. 4848 */ 4849 rval = -1; 4850 goto out; 4851 } 4852 } 4853 4854 /* Count the number of drives for all nodes */ 4855 mnsr_node = master_mnsr_node; 4856 while (mnsr_node) { 4857 dr_cnt = 0; 4858 dr = mnsr_node->mmn_mnsr->sr_drivechain; 4859 while (dr) { 4860 dr_cnt++; 4861 dr = dr->dr_next; 4862 } 4863 mnsr_node->mmn_numdrives = dr_cnt; 4864 mnsr_node = mnsr_node->mmn_next; 4865 } 4866 4867 /* Count the number of drives for the master; also check flags */ 4868 all_drives_ok = 1; 4869 dd_cnt = 0; 4870 dd = master_dd; 4871 while (dd) { 4872 dd_cnt++; 4873 if (!(dd->dd_flags & MD_DR_OK)) 4874 all_drives_ok = 0; 4875 dd = dd->dd_next; 4876 } 4877 4878 /* If all drives are ok, do quick check against number of drives */ 4879 if (all_drives_ok) { 4880 /* If all nodes have same number of drives, almost done */ 4881 mnsr_node = master_mnsr_node; 4882 while (mnsr_node) { 4883 if (mnsr_node->mmn_numdrives != dd_cnt) 4884 break; 4885 mnsr_node = mnsr_node->mmn_next; 4886 } 4887 /* All nodes have same number of drives, just send flags */ 4888 if (mnsr_node == NULL) { 4889 goto send_drive_list; 4890 } 4891 } 4892 4893 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 4894 "Begin detailed drive synchronization for set %s: 
%s"), 4895 sp->setname, meta_print_hrtime(gethrtime() - start_time)); 4896 4897 /* Detailed check required */ 4898 mnsr_node = master_mnsr_node; 4899 while (mnsr_node) { 4900 /* Does slave node have less drives than master? */ 4901 if (mnsr_node->mmn_numdrives < dd_cnt) { 4902 /* Yes - must determine which drive is missing */ 4903 if (clnt_getdrivedesc(mnsr_node->mmn_nodename, sp, 4904 &other_dd, ep)) { 4905 /* RPC failure to !my node */ 4906 if ((mdanyrpcerror(ep)) && 4907 (strcmp(mynode(), mnsr_node->mmn_nodename) 4908 != 0)) { 4909 rval = 205; 4910 } else { 4911 /* Any other failure */ 4912 rval = -1; 4913 } 4914 mde_perror(ep, dgettext(TEXT_DOMAIN, 4915 "Master node %s unable to " 4916 "retrieve drive list from node %s"), 4917 mynode(), mnsr_node->mmn_nodename); 4918 goto out; 4919 } 4920 mnsr_node->mmn_dd = other_dd; 4921 dd = master_dd; 4922 while (dd) { 4923 if (!(dd->dd_flags & MD_DR_OK)) { 4924 dd = dd->dd_next; 4925 continue; 4926 } 4927 other_dd = mnsr_node->mmn_dd; 4928 while (other_dd) { 4929 /* Convert to devids, when available */ 4930 if (strcmp(other_dd->dd_dnp->cname, 4931 dd->dd_dnp->cname) == 0) { 4932 break; 4933 } 4934 other_dd = other_dd->dd_next; 4935 } 4936 /* 4937 * dd not found on slave so mark it 4938 * ADD for later deletion (drives in ADD 4939 * state are deleted later in this routine). 
4940 */ 4941 if (other_dd == NULL) { 4942 dd->dd_flags = MD_DR_ADD; 4943 } 4944 dd = dd->dd_next; 4945 } 4946 4947 } 4948 mnsr_node = mnsr_node->mmn_next; 4949 } 4950 4951 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 4952 "Drive check completed for set %s: %s"), 4953 sp->setname, meta_print_hrtime(gethrtime() - start_time)); 4954 4955 dd = master_dd; 4956 dd_prev = 0; 4957 while (dd) { 4958 /* Remove any ADD drives from list */ 4959 if (dd->dd_flags & MD_DR_ADD) { 4960 if (dd_prev) { 4961 dd_prev->dd_next = dd->dd_next; 4962 dd->dd_next = NULL; 4963 metafreedrivedesc(&dd); 4964 dd = dd_prev->dd_next; 4965 } else { 4966 /* 4967 * If removing drive descriptor from head 4968 * of linked list, also change sd->sd_drvs. 4969 */ 4970 master_dd = sd->sd_drvs = dd->dd_next; 4971 dd->dd_next = NULL; 4972 metafreedrivedesc(&dd); 4973 dd = master_dd; 4974 } 4975 /* dd setup in if/else above */ 4976 continue; 4977 } 4978 /* 4979 * If drive is marked DEL, check all other nodes. 4980 * If drive on another node is marked OK, mark drive OK 4981 * in master list. If drive is marked DEL or doesn't exist 4982 * on all nodes, remove drive from list. 
4983 */ 4984 if (dd->dd_flags & MD_DR_DEL) { 4985 mnsr_node = master_mnsr_node; 4986 while (mnsr_node) { 4987 if (mnsr_node->mmn_dd == NULL) { 4988 if (clnt_getdrivedesc( 4989 mnsr_node->mmn_nodename, sp, 4990 &other_dd, ep)) { 4991 /* RPC failure to !my node */ 4992 if ((mdanyrpcerror(ep)) && 4993 (strcmp(mynode(), 4994 mnsr_node->mmn_nodename) 4995 != 0)) { 4996 rval = 205; 4997 } else { 4998 /* Any other failure */ 4999 rval = -1; 5000 } 5001 mde_perror(ep, dgettext(TEXT_DOMAIN, 5002 "Master node %s unable " 5003 "to retrieve drive list from " 5004 "node %s"), mynode(), 5005 mnsr_node->mmn_nodename); 5006 goto out; 5007 } 5008 mnsr_node->mmn_dd = other_dd; 5009 } 5010 other_dd = mnsr_node->mmn_dd; 5011 while (other_dd) { 5012 /* Found drive (OK) from other node */ 5013 if (strcmp(dd->dd_dnp->cname, 5014 other_dd->dd_dnp->cname) 5015 == 0) { 5016 /* Drive marked OK */ 5017 if (other_dd->dd_flags & 5018 MD_DR_OK) { 5019 dd->dd_flags = MD_DR_OK; 5020 } 5021 break; 5022 } 5023 other_dd = other_dd->dd_next; 5024 } 5025 if (dd->dd_flags == MD_DR_OK) 5026 break; 5027 5028 mnsr_node = mnsr_node->mmn_next; 5029 } 5030 /* 5031 * If no node had this drive marked OK, delete it. 5032 */ 5033 if (dd->dd_flags & MD_DR_DEL) { 5034 if (dd_prev) { 5035 dd_prev->dd_next = dd->dd_next; 5036 dd->dd_next = NULL; 5037 metafreedrivedesc(&dd); 5038 dd = dd_prev->dd_next; 5039 } else { 5040 /* 5041 * If removing drive descriptor from 5042 * head of linked list, also change 5043 * sd->sd_drvs. 
5044 */ 5045 master_dd = sd->sd_drvs = dd->dd_next; 5046 dd->dd_next = NULL; 5047 metafreedrivedesc(&dd); 5048 dd = master_dd; 5049 } 5050 /* dd setup in if/else above */ 5051 continue; 5052 } 5053 } 5054 dd_prev = dd; 5055 dd = dd->dd_next; 5056 } 5057 5058 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 5059 "Setting drive states completed for set %s: %s"), 5060 sp->setname, meta_print_hrtime(gethrtime() - start_time)); 5061 5062 send_drive_list: 5063 /* 5064 * Set genid on all drives to be the highest value seen. 5065 */ 5066 dd = master_dd; 5067 while (dd) { 5068 dd->dd_genid = max_genid; 5069 dd = dd->dd_next; 5070 } 5071 /* 5072 * Send updated drive list to all alive nodes. 5073 * Will also set genid on set and node records to have same 5074 * as the drive records. 5075 */ 5076 nd = sd->sd_nodelist; 5077 while (nd) { 5078 /* Skip non-alive nodes */ 5079 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 5080 nd = nd->nd_next; 5081 continue; 5082 } 5083 if (clnt_upd_dr_reconfig(nd->nd_nodename, sp, master_dd, ep)) { 5084 /* RPC failure to another node */ 5085 if ((mdanyrpcerror(ep)) && 5086 (sd->sd_mn_mynode->nd_nodeid != nd->nd_nodeid)) { 5087 rval = 205; 5088 } else { 5089 /* Any other failure */ 5090 rval = -1; 5091 } 5092 goto out; 5093 } 5094 nd = nd->nd_next; 5095 } 5096 5097 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 5098 "Sent drive list to all nodes for set %s: %s"), 5099 sp->setname, meta_print_hrtime(gethrtime() - start_time)); 5100 5101 /* 5102 * If no drive records left in set and nodes had been joined, 5103 * withdraw the nodes. Always reset the master and mark 5104 * all nodes as withdrawn on all nodes. 5105 */ 5106 if (master_dd == NULL) { 5107 /* Reset new master flag since no longer master */ 5108 (void) memset(&sf, 0, sizeof (sf)); 5109 sf.sf_setno = sp->setno; 5110 sf.sf_setflags = MD_SET_MN_NEWMAS_RC; 5111 sf.sf_flags = MDDB_NM_RESET; 5112 /* Use magic to help protect ioctl against attack. 
*/ 5113 sf.sf_magic = MDDB_SETFLAGS_MAGIC; 5114 /* Ignore failure, failure to reset flag isn't catastrophic */ 5115 (void) metaioctl(MD_MN_SET_SETFLAGS, &sf, 5116 &sf.sf_mde, NULL); 5117 5118 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 5119 "Reset new master flag for " "set %s: %s"), 5120 sp->setname, meta_print_hrtime(gethrtime() - start_time)); 5121 5122 nd = sd->sd_nodelist; 5123 while (nd) { 5124 /* Skip non-alive nodes */ 5125 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 5126 nd = nd->nd_next; 5127 continue; 5128 } 5129 5130 if (clnt_lock_set(nd->nd_nodename, sp, ep)) { 5131 /* RPC failure to another node */ 5132 if ((mdanyrpcerror(ep)) && 5133 (sd->sd_mn_mynode->nd_nodeid != 5134 nd->nd_nodeid)) { 5135 rval = 205; 5136 } else { 5137 /* Any other failure */ 5138 rval = -1; 5139 } 5140 goto out; 5141 } 5142 set_locked = 1; 5143 5144 /* Withdraw node from set if owner */ 5145 if ((nd->nd_flags & MD_MN_NODE_OWN) && 5146 (clnt_withdrawset(nd->nd_nodename, sp, ep))) { 5147 /* RPC failure to another node */ 5148 if ((mdanyrpcerror(ep)) && 5149 (sd->sd_mn_mynode->nd_nodeid != 5150 nd->nd_nodeid)) { 5151 rval = 205; 5152 } else { 5153 /* Any other failure */ 5154 rval = -1; 5155 } 5156 goto out; 5157 } 5158 5159 /* Mark all nodes as withdrawn on this node */ 5160 if (clnt_upd_nr_flags(nd->nd_nodename, sp, 5161 sd->sd_nodelist, MD_NR_WITHDRAW, NULL, ep)) { 5162 /* RPC failure to another node */ 5163 if ((mdanyrpcerror(ep)) && 5164 (sd->sd_mn_mynode->nd_nodeid != 5165 nd->nd_nodeid)) { 5166 rval = 205; 5167 } else { 5168 /* Any other failure */ 5169 rval = -1; 5170 } 5171 goto out; 5172 } 5173 5174 /* Resets master to no-master on this node */ 5175 if (clnt_mnsetmaster(nd->nd_nodename, sp, 5176 "", MD_MN_INVALID_NID, ep)) { 5177 /* RPC failure to another node */ 5178 if ((mdanyrpcerror(ep)) && 5179 (sd->sd_mn_mynode->nd_nodeid != 5180 nd->nd_nodeid)) { 5181 rval = 205; 5182 } else { 5183 /* Any other failure */ 5184 rval = -1; 5185 } 5186 goto out; 5187 } 5188 5189 cl_sk = 
cl_get_setkey(sp->setno, sp->setname); 5190 if (clnt_unlock_set(nd->nd_nodename, cl_sk, ep)) { 5191 /* RPC failure to another node */ 5192 if ((mdanyrpcerror(ep)) && 5193 (sd->sd_mn_mynode->nd_nodeid != 5194 nd->nd_nodeid)) { 5195 rval = 205; 5196 } else { 5197 /* Any other failure */ 5198 rval = -1; 5199 } 5200 goto out; 5201 } 5202 set_locked = 0; 5203 nd = nd->nd_next; 5204 } 5205 } 5206 5207 out: 5208 /* 5209 * If got here and set is still locked, then an error has 5210 * occurred and master_nodelist is still valid. 5211 * If error is not an RPC error, then unlock. 5212 * If error is an RPC error, skip unlocks since this could cause 5213 * yet another RPC timeout if a node has failed. 5214 * Ignore failures in unlock since unlock is just trying to 5215 * clean things up. 5216 */ 5217 if ((set_locked) && !(mdanyrpcerror(ep))) { 5218 nd = master_nodelist; 5219 cl_sk = cl_get_setkey(sp->setno, sp->setname); 5220 while (nd) { 5221 /* Skip non-alive nodes */ 5222 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 5223 nd = nd->nd_next; 5224 continue; 5225 } 5226 /* 5227 * If clnt_unlock fails, just break out since next 5228 * reconfig cycle will reset the locks anyway. 5229 */ 5230 if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) { 5231 break; 5232 } 5233 nd = nd->nd_next; 5234 } 5235 cl_set_setkey(NULL); 5236 } 5237 /* Free master_mnsr and drive descs */ 5238 mnsr_node = master_mnsr_node; 5239 while (mnsr_node) { 5240 master_mnsr_node = mnsr_node->mmn_next; 5241 free_sr((md_set_record *)mnsr_node->mmn_mnsr); 5242 free_rem_dd(mnsr_node->mmn_dd); 5243 Free(mnsr_node); 5244 mnsr_node = master_mnsr_node; 5245 } 5246 5247 /* Frees sd->sd_drvs (which is also master_dd) */ 5248 metaflushsetname(sp); 5249 return (rval); 5250 } 5251 5252 /* 5253 * meta_mnsync_diskset_mddbs 5254 * Calling node is guaranteed to be an owner node. 5255 * Calling node is the master node. 5256 * 5257 * Master node verifies that ondisk mddb format matches its incore format. 
5258 * If no nodes are joined to set, remove the change log entries. 5259 * If a node is joined to set, play the change log. 5260 * 5261 * Returns 0 - Success 5262 * 1 - Master unable to join to set. 5263 * 205 - Failure during RPC to another node 5264 * -1 - Any other failure and ep is filled in. 5265 * -1 return will eventually cause node to panic 5266 * in a SunCluster environment. 5267 */ 5268 int 5269 meta_mnsync_diskset_mddbs( 5270 mdsetname_t *sp, 5271 md_error_t *ep 5272 ) 5273 { 5274 md_set_desc *sd; 5275 mddb_config_t c; 5276 md_mn_msgclass_t class; 5277 mddb_setflags_config_t sf; 5278 md_mnnode_desc *nd, *nd2; 5279 md_error_t xep = mdnullerror; 5280 int stale_set = 0; 5281 5282 /* If setname is there, set desc should exist. */ 5283 if ((sd = metaget_setdesc(sp, ep)) == NULL) { 5284 mde_perror(ep, dgettext(TEXT_DOMAIN, 5285 "Unable to get set %s desc information"), sp->setname); 5286 return (-1); 5287 } 5288 5289 /* Are there drives in the set? */ 5290 if (metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST), 5291 ep) == NULL) { 5292 if (! mdisok(ep)) { 5293 return (-1); 5294 } 5295 /* No drives in set -- nothing to sync up */ 5296 return (0); 5297 } 5298 5299 /* 5300 * Is master node (which is this node) joined to set? 5301 * If master node isn't joined (which means that no nodes 5302 * are joined to diskset), remove the change log entries 5303 * since no need to replay them - all nodes will have same 5304 * view of mddbs since all nodes are reading in the mddbs 5305 * from disk. 5306 * There is also no need to sync up the master and ondisk mddbs 5307 * since master has no incore knowledge. 5308 * Need to join master to set in order to flush the change 5309 * log entries. Don't need to block I/O during join of master 5310 * to set since no other nodes are joined to set and so no I/O 5311 * can be occurring. 
5312 */ 5313 if (!(sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN)) { 5314 /* Join master to set */ 5315 if (clnt_joinset(mynode(), sp, 5316 MNSET_IN_RECONFIG, ep)) { 5317 if (mdismddberror(ep, MDE_DB_STALE)) { 5318 /* 5319 * If STALE, print message and continue on. 5320 * Don't do any writes or reads to mddbs 5321 * so don't clear change log. 5322 */ 5323 mde_perror(ep, dgettext(TEXT_DOMAIN, 5324 "Join of master node to STALE set %s"), 5325 sp->setname); 5326 stale_set = 1; 5327 mdclrerror(ep); 5328 } else if (mdismddberror(ep, MDE_DB_ACCOK)) { 5329 /* ACCOK means mediator provided extra vote */ 5330 mdclrerror(ep); 5331 } else { 5332 /* 5333 * If master is unable to join set, print an 5334 * error message. Don't return failure or node 5335 * will panic during cluster reconfig cycle. 5336 * Also, withdraw node from set in order to 5337 * cleanup from failed join attempt. 5338 */ 5339 mde_perror(ep, dgettext(TEXT_DOMAIN, 5340 "Join of master node in set %s failed"), 5341 sp->setname); 5342 if (clnt_withdrawset(mynode(), sp, &xep)) 5343 mdclrerror(&xep); 5344 return (1); 5345 } 5346 } 5347 /* 5348 * Master node successfully joined. 5349 * Set local copy of flags to OWN and 5350 * send owner flag to rpc.metad. If not stale, 5351 * flush the change log. 
5352 */ 5353 sd->sd_mn_mynode->nd_flags |= MD_MN_NODE_OWN; 5354 if (clnt_upd_nr_flags(mynode(), sp, sd->sd_nodelist, MD_NR_SET, 5355 MNSET_IN_RECONFIG, ep)) { 5356 mde_perror(ep, dgettext(TEXT_DOMAIN, 5357 "Flag update of master node join in set %s failed"), 5358 sp->setname); 5359 return (-1); 5360 } 5361 5362 if (!stale_set) { 5363 if (mdmn_reset_changelog(sp, ep, 5364 MDMN_CLF_RESETLOG) != 0) { 5365 mde_perror(ep, dgettext(TEXT_DOMAIN, 5366 "Unable to reset changelog.")); 5367 return (-1); 5368 } 5369 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 5370 "Removed changelog entries for set %s: %s"), 5371 sp->setname, 5372 meta_print_hrtime(gethrtime() - start_time)); 5373 } 5374 /* Reset new master flag before return */ 5375 (void) memset(&sf, 0, sizeof (sf)); 5376 sf.sf_setno = sp->setno; 5377 sf.sf_setflags = MD_SET_MN_NEWMAS_RC; 5378 sf.sf_flags = MDDB_NM_RESET; 5379 /* Use magic to help protect ioctl against attack. */ 5380 sf.sf_magic = MDDB_SETFLAGS_MAGIC; 5381 /* Ignore failure, failure to reset flag isn't catastrophic */ 5382 (void) metaioctl(MD_MN_SET_SETFLAGS, &sf, 5383 &sf.sf_mde, NULL); 5384 5385 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 5386 "Reset new master flag for set %s: %s"), 5387 sp->setname, meta_print_hrtime(gethrtime() - start_time)); 5388 5389 return (0); 5390 } 5391 5392 /* 5393 * Is master already joined to STALE set (< 50% mddbs avail)? 5394 * If so, can make no config changes to mddbs so don't check or play 5395 * changelog and don't sync master node to ondisk mddbs. 5396 * To get out of the stale state all nodes must be withdrawn 5397 * from set. Then as nodes are re-joined, all nodes will 5398 * have same view of mddbs since all nodes are reading the 5399 * mddbs from disk. 
5400 */ 5401 (void) memset(&c, 0, sizeof (c)); 5402 c.c_id = 0; 5403 c.c_setno = sp->setno; 5404 if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) { 5405 (void) mdstealerror(ep, &c.c_mde); 5406 return (-1); 5407 } 5408 if (c.c_flags & MDDB_C_STALE) { 5409 return (0); 5410 } 5411 5412 /* 5413 * If this node is NOT a newly chosen master, then there's 5414 * nothing else to do since the change log should be empty and 5415 * the ondisk and incore mddbs are already consistent. 5416 * 5417 * A newly chosen master is a node that was not the master 5418 * at the beginning of the reconfig cycle. If a node is a new 5419 * master, then the new master state is reset after the ondisk 5420 * and incore mddbs are consistent and the change log has 5421 * been replayed. 5422 */ 5423 (void) memset(&sf, 0, sizeof (sf)); 5424 sf.sf_setno = sp->setno; 5425 sf.sf_flags = MDDB_NM_GET; 5426 /* Use magic to help protect ioctl against attack. */ 5427 sf.sf_magic = MDDB_SETFLAGS_MAGIC; 5428 if ((metaioctl(MD_MN_GET_SETFLAGS, &sf, &sf.sf_mde, NULL) == 0) && 5429 ((sf.sf_setflags & MD_SET_MN_NEWMAS_RC) == 0)) { 5430 return (0); 5431 } 5432 5433 /* 5434 * Now, sync up incore master view to ondisk mddbs. 5435 * This is needed in the case where a master node 5436 * had made a change to the mddb, but this change 5437 * may not have been relayed to the slaves yet. 5438 * So, the new master needs to verify that the ondisk 5439 * mddbs match what the new master has incore - 5440 * if different, new master rewrites all of the mddbs. 5441 * Then the new master will replay the changelog and the 5442 * new master will then execute what the old master had 5443 * done. 5444 * 5445 * Block all I/Os to disks in this diskset on all nodes in 5446 * the diskset. This will allow the rewriting of the mddbs 5447 * (if needed), to proceed in a timely manner. 5448 * 5449 * If block of I/Os fail, return a -1. 
5450 */ 5451 5452 nd = sd->sd_nodelist; 5453 while (nd) { 5454 /* Skip non-alive and non-owner nodes */ 5455 if ((!(nd->nd_flags & MD_MN_NODE_ALIVE)) || 5456 (!(nd->nd_flags & MD_MN_NODE_OWN))) { 5457 nd = nd->nd_next; 5458 continue; 5459 } 5460 if (clnt_mn_susp_res_io(nd->nd_nodename, sp->setno, 5461 MN_SUSP_IO, ep)) { 5462 mde_perror(ep, dgettext(TEXT_DOMAIN, 5463 "Unable to suspend I/O on node %s in set %s"), 5464 nd->nd_nodename, sp->setname); 5465 5466 /* 5467 * Resume all other nodes that had been suspended. 5468 * (Reconfig return step also resumes I/Os 5469 * for all sets.) 5470 */ 5471 nd2 = sd->sd_nodelist; 5472 while (nd2) { 5473 /* Stop when reaching failed node */ 5474 if (nd2->nd_nodeid == nd->nd_nodeid) 5475 break; 5476 /* Skip non-alive and non-owner nodes */ 5477 if ((!(nd2->nd_flags & MD_MN_NODE_ALIVE)) || 5478 (!(nd2->nd_flags & MD_MN_NODE_OWN))) { 5479 nd2 = nd2->nd_next; 5480 continue; 5481 } 5482 (void) (clnt_mn_susp_res_io(nd2->nd_nodename, 5483 sp->setno, MN_RES_IO, &xep)); 5484 nd2 = nd2->nd_next; 5485 } 5486 5487 /* 5488 * If an RPC failure on another node, return a 205. 5489 * Otherwise, exit with failure. 5490 */ 5491 if ((mdanyrpcerror(ep)) && 5492 (sd->sd_mn_mynode->nd_nodeid != 5493 nd->nd_nodeid)) { 5494 return (205); 5495 } else { 5496 return (-1); 5497 } 5498 5499 } 5500 nd = nd->nd_next; 5501 } 5502 5503 (void) memset(&c, 0, sizeof (c)); 5504 c.c_id = 0; 5505 c.c_setno = sp->setno; 5506 /* Master can't sync up to ondisk mddbs? Kick it out of cluster */ 5507 if (metaioctl(MD_MN_CHK_WRT_MDDB, &c, &c.c_mde, NULL) != 0) 5508 return (-1); 5509 5510 /* 5511 * Resume I/Os that were suspended above. 
5512 */ 5513 nd = sd->sd_nodelist; 5514 while (nd) { 5515 /* Skip non-alive and non-owner nodes */ 5516 if ((!(nd->nd_flags & MD_MN_NODE_ALIVE)) || 5517 (!(nd->nd_flags & MD_MN_NODE_OWN))) { 5518 nd = nd->nd_next; 5519 continue; 5520 } 5521 if (clnt_mn_susp_res_io(nd->nd_nodename, sp->setno, 5522 MN_RES_IO, ep)) { 5523 mde_perror(ep, dgettext(TEXT_DOMAIN, 5524 "Unable to resume I/O on node %s in set %s"), 5525 nd->nd_nodename, sp->setname); 5526 5527 /* 5528 * If an RPC failure then don't do any 5529 * more RPC calls, since one timeout is enough 5530 * to endure. If RPC failure to another node, return 5531 * 205. If RPC failure to my node, return -1. 5532 * If not an RPC failure, continue resuming the 5533 * rest of the nodes and then return -1. 5534 */ 5535 if (mdanyrpcerror(ep)) { 5536 if (sd->sd_mn_mynode->nd_nodeid == 5537 nd->nd_nodeid) { 5538 return (-1); 5539 } else { 5540 return (205); 5541 } 5542 } 5543 5544 /* 5545 * If not an RPC error, continue resuming rest of 5546 * nodes, ignoring any failures except for an 5547 * RPC failure which constitutes an immediate exit. 5548 * Start in middle of list with failing node. 5549 */ 5550 nd2 = nd->nd_next; 5551 while (nd2) { 5552 /* Skip non-alive and non-owner nodes */ 5553 if ((!(nd2->nd_flags & MD_MN_NODE_ALIVE)) || 5554 (!(nd2->nd_flags & MD_MN_NODE_OWN))) { 5555 nd2 = nd2->nd_next; 5556 continue; 5557 } 5558 (void) (clnt_mn_susp_res_io(nd2->nd_nodename, 5559 sp->setno, MN_RES_IO, &xep)); 5560 if (mdanyrpcerror(&xep)) { 5561 return (-1); 5562 } 5563 nd2 = nd2->nd_next; 5564 } 5565 } 5566 nd = nd->nd_next; 5567 } 5568 5569 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, "Master node has completed " 5570 "checking/writing the mddb for set %s: %s"), sp->setname, 5571 meta_print_hrtime(gethrtime() - start_time)); 5572 5573 /* 5574 * Send (aka replay) all messages we find in the changelog. 
5575 * Flag the messages with 5576 * MD_MSGF_REPLAY_MSG, so no new message ID is generated for them 5577 * MD_MSGF_OVERRIDE_SUSPEND so they can pass the suspended commd. 5578 */ 5579 for (class = MD_MN_NCLASSES - 1; class > 0; class--) { 5580 mdmn_changelog_record_t *lr; 5581 md_error_t xep = mdnullerror; 5582 md_mn_result_t *resultp = NULL; 5583 int ret; 5584 5585 lr = mdmn_get_changelogrec(sp->setno, class); 5586 if ((lr->lr_flags & MD_MN_LR_INUSE) == 0) { 5587 /* no entry for this class */ 5588 continue; 5589 } 5590 5591 meta_mc_log(MC_LOG1, dgettext(TEXT_DOMAIN, 5592 "replaying message ID=(%d, 0x%llx-%d)\n"), 5593 MSGID_ELEMS(lr->lr_msg.msg_msgid)); 5594 5595 ret = mdmn_send_message_with_msgid( 5596 lr->lr_msg.msg_setno, 5597 lr->lr_msg.msg_type, 5598 lr->lr_msg.msg_flags | MD_MSGF_REPLAY_MSG | 5599 MD_MSGF_OVERRIDE_SUSPEND, 5600 lr->lr_msg.msg_event_data, 5601 lr->lr_msg.msg_event_size, 5602 &resultp, 5603 &lr->lr_msg.msg_msgid, 5604 &xep); 5605 5606 meta_mc_log(MC_LOG1, dgettext(TEXT_DOMAIN, 5607 "mdmn_send_message returned %d\n"), ret); 5608 5609 if (resultp) 5610 free_result(resultp); 5611 } 5612 5613 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 5614 "Playing changelog completed for set %s: %s"), 5615 sp->setname, meta_print_hrtime(gethrtime() - start_time)); 5616 5617 /* 5618 * Now that new master has ondisk and incore mddbs in sync, reset 5619 * this node's new master kernel flag (for this set). If this node 5620 * re-enters another reconfig cycle before the completion of this 5621 * reconfig cycle, this master node won't need to check if the ondisk 5622 * and incore mddbs are in sync since this node won't be considered 5623 * a new master (since this flag is being reset here in the middle of 5624 * step2). This will save time during any subsequent reconfig 5625 * cycles as long as this node continues to be master. 
5626 */ 5627 (void) memset(&sf, 0, sizeof (sf)); 5628 sf.sf_setno = sp->setno; 5629 sf.sf_setflags = MD_SET_MN_NEWMAS_RC; 5630 sf.sf_flags = MDDB_NM_RESET; 5631 /* Use magic to help protect ioctl against attack. */ 5632 sf.sf_magic = MDDB_SETFLAGS_MAGIC; 5633 /* Ignore failure, since failure to reset flag isn't catastrophic */ 5634 (void) metaioctl(MD_MN_SET_SETFLAGS, &sf, &sf.sf_mde, NULL); 5635 5636 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 5637 "Reset new master flag for set %s: %s"), 5638 sp->setname, meta_print_hrtime(gethrtime() - start_time)); 5639 5640 return (0); 5641 } 5642 5643 /* 5644 * meta_mnjoin_all will join all starting nodes in the diskset. 5645 * A starting node is considered to be any node that is not 5646 * an owner of the set but is a member of the cluster. 5647 * Master node is already joined to set (done in meta_mnsync_diskset_mddbs). 5648 * 5649 * Caller is the Master node. 5650 * 5651 * Returns 0 - Success 5652 * 205 - Failure during RPC to another node 5653 * -1 - Any other failure and ep is filled in. 5654 */ 5655 int 5656 meta_mnjoin_all( 5657 mdsetname_t *sp, 5658 md_error_t *ep 5659 ) 5660 { 5661 md_set_desc *sd; 5662 md_mnnode_desc *nd, *nd2; 5663 int rval = 0; 5664 int stale_flag = 0; 5665 mddb_config_t c; 5666 int susp_res_flag = 0; 5667 md_error_t xep = mdnullerror; 5668 5669 /* If setname is there, set desc should exist. */ 5670 if ((sd = metaget_setdesc(sp, ep)) == NULL) { 5671 mde_perror(ep, dgettext(TEXT_DOMAIN, 5672 "Unable to get set %s desc information"), sp->setname); 5673 return (-1); 5674 } 5675 5676 /* Are there drives in the set? */ 5677 if (metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST), 5678 ep) == NULL) { 5679 if (! mdisok(ep)) { 5680 return (-1); 5681 } 5682 /* No drives in set -- nothing to join */ 5683 return (0); 5684 } 5685 5686 /* 5687 * Is set currently stale? 
5688 */ 5689 (void) memset(&c, 0, sizeof (c)); 5690 c.c_id = 0; 5691 c.c_setno = sp->setno; 5692 /* Ignore failure since master node may not be joined yet */ 5693 (void) metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL); 5694 if (c.c_flags & MDDB_C_STALE) { 5695 stale_flag = MNSET_IS_STALE; 5696 } 5697 5698 /* 5699 * If any nodes are going to be joined to diskset, then 5700 * suspend I/O to all disks in diskset so that nodes can join 5701 * (read in mddbs) in a reasonable amount of time even under 5702 * high I/O load. Don't need to do this if set is STALE since 5703 * no I/O can be occurring to a STALE set. 5704 */ 5705 if (stale_flag != MNSET_IS_STALE) { 5706 nd = sd->sd_nodelist; 5707 while (nd) { 5708 /* Found a node that will be joined to diskset */ 5709 if ((nd->nd_flags & MD_MN_NODE_ALIVE) && 5710 (!(nd->nd_flags & MD_MN_NODE_OWN))) { 5711 /* Set flag that diskset should be suspended */ 5712 susp_res_flag = 1; 5713 break; 5714 } 5715 nd = nd->nd_next; 5716 } 5717 } 5718 5719 if (susp_res_flag) { 5720 /* 5721 * Block all I/Os to disks in this diskset on all joined 5722 * nodes in the diskset. 5723 * If block of I/Os fails due to an RPC failure on another 5724 * node, return 205; otherwise, return -1. 5725 */ 5726 nd = sd->sd_nodelist; 5727 while (nd) { 5728 /* Skip non-alive and non-owner nodes */ 5729 if ((!(nd->nd_flags & MD_MN_NODE_ALIVE)) || 5730 (!(nd->nd_flags & MD_MN_NODE_OWN))) { 5731 nd = nd->nd_next; 5732 continue; 5733 } 5734 if (clnt_mn_susp_res_io(nd->nd_nodename, sp->setno, 5735 MN_SUSP_IO, ep)) { 5736 mde_perror(ep, dgettext(TEXT_DOMAIN, 5737 "Unable to suspend I/O on node %s" 5738 " in set %s"), nd->nd_nodename, 5739 sp->setname); 5740 /* 5741 * Resume other nodes that had been suspended. 5742 * (Reconfig return step also resumes I/Os 5743 * for all sets.) 
5744 */ 5745 nd2 = sd->sd_nodelist; 5746 while (nd2) { 5747 /* Stop when reaching failed node */ 5748 if (nd2->nd_nodeid == nd->nd_nodeid) 5749 break; 5750 /* Skip non-alive/non-owner nodes */ 5751 if ((!(nd2->nd_flags & 5752 MD_MN_NODE_ALIVE)) || 5753 (!(nd2->nd_flags & 5754 MD_MN_NODE_OWN))) { 5755 nd2 = nd2->nd_next; 5756 continue; 5757 } 5758 (void) (clnt_mn_susp_res_io( 5759 nd2->nd_nodename, sp->setno, 5760 MN_RES_IO, &xep)); 5761 nd2 = nd2->nd_next; 5762 } 5763 5764 /* 5765 * If the suspend failed due to an 5766 * RPC failure on another node, return 5767 * a 205. 5768 * Otherwise, exit with failure. 5769 * The return reconfig step will resume 5770 * I/Os for all disksets. 5771 */ 5772 if ((mdanyrpcerror(ep)) && 5773 (sd->sd_mn_mynode->nd_nodeid != 5774 nd->nd_nodeid)) { 5775 return (205); 5776 } else { 5777 return (-1); 5778 } 5779 } 5780 nd = nd->nd_next; 5781 } 5782 } 5783 5784 nd = sd->sd_nodelist; 5785 while (nd) { 5786 /* 5787 * If a node is in the membership list but isn't joined 5788 * to the set, try to join the node. 5789 */ 5790 if ((nd->nd_flags & MD_MN_NODE_ALIVE) && 5791 (!(nd->nd_flags & MD_MN_NODE_OWN))) { 5792 if (clnt_joinset(nd->nd_nodename, sp, 5793 (MNSET_IN_RECONFIG | stale_flag), ep)) { 5794 /* 5795 * If RPC failure to another node 5796 * then exit without attempting anything else. 5797 * (Reconfig return step will resume I/Os 5798 * for all sets.) 5799 */ 5800 if (mdanyrpcerror(ep)) { 5801 mde_perror(ep, ""); 5802 return (205); 5803 } 5804 /* 5805 * STALE and ACCOK failures aren't true 5806 * failures. STALE means that <50% mddbs 5807 * are available. ACCOK means that the 5808 * mediator provided the extra vote. 5809 * If a true failure, then print messasge 5810 * and withdraw node from set in order to 5811 * cleanup from failed join attempt. 
5812 */ 5813 if ((!mdismddberror(ep, MDE_DB_STALE)) && 5814 (!mdismddberror(ep, MDE_DB_ACCOK))) { 5815 mde_perror(ep, 5816 "WARNING: Unable to join node %s " 5817 "to set %s", nd->nd_nodename, 5818 sp->setname); 5819 mdclrerror(ep); 5820 if (clnt_withdrawset(nd->nd_nodename, 5821 sp, &xep)) 5822 mdclrerror(&xep); 5823 nd = nd->nd_next; 5824 continue; 5825 } 5826 } 5827 /* Set owner flag even if STALE or ACCOK */ 5828 nd->nd_flags |= MD_MN_NODE_OWN; 5829 } 5830 nd = nd->nd_next; 5831 } 5832 /* 5833 * Resume I/Os if suspended above. 5834 */ 5835 if (susp_res_flag) { 5836 nd = sd->sd_nodelist; 5837 while (nd) { 5838 /* 5839 * Skip non-alive and non-owner nodes 5840 * (this list doesn't include any of 5841 * the nodes that were joined). 5842 */ 5843 if ((!(nd->nd_flags & MD_MN_NODE_ALIVE)) || 5844 (!(nd->nd_flags & MD_MN_NODE_OWN))) { 5845 nd = nd->nd_next; 5846 continue; 5847 } 5848 if (clnt_mn_susp_res_io(nd->nd_nodename, sp->setno, 5849 MN_RES_IO, ep)) { 5850 mde_perror(ep, dgettext(TEXT_DOMAIN, 5851 "Unable to resume I/O on node %s" 5852 " in set %s"), nd->nd_nodename, 5853 sp->setname); 5854 5855 /* 5856 * If an RPC failure then don't do any 5857 * more RPC calls, since one timeout is enough 5858 * to endure. If RPC failure to another node, 5859 * return 205. If RPC failure to my node, 5860 * return -1. 5861 * (Reconfig return step will resume I/Os 5862 * for all sets.) 5863 * If not an RPC failure, continue resuming the 5864 * rest of the nodes and then return -1. 5865 */ 5866 if (mdanyrpcerror(ep)) { 5867 if (sd->sd_mn_mynode->nd_nodeid == 5868 nd->nd_nodeid) { 5869 return (-1); 5870 } else { 5871 return (205); 5872 } 5873 } 5874 5875 /* 5876 * If not an RPC error, continue resuming rest 5877 * of nodes, ignoring any failures except for 5878 * an RPC failure which constitutes an 5879 * immediate exit. 5880 * Start in middle of list with failing node. 
5881 */ 5882 nd2 = nd->nd_next; 5883 while (nd2) { 5884 /* Skip non-owner nodes */ 5885 if ((!(nd2->nd_flags & 5886 MD_MN_NODE_ALIVE)) || 5887 (!(nd2->nd_flags & 5888 MD_MN_NODE_OWN))) { 5889 nd2 = nd2->nd_next; 5890 continue; 5891 } 5892 (void) (clnt_mn_susp_res_io( 5893 nd2->nd_nodename, sp->setno, 5894 MN_RES_IO, &xep)); 5895 if (mdanyrpcerror(&xep)) { 5896 return (-1); 5897 } 5898 nd2 = nd2->nd_next; 5899 } 5900 } 5901 nd = nd->nd_next; 5902 } 5903 } 5904 5905 nd = sd->sd_nodelist; 5906 while (nd) { 5907 if (!(nd->nd_flags & MD_MN_NODE_OWN)) { 5908 nd = nd->nd_next; 5909 continue; 5910 } 5911 /* 5912 * If 1 node fails - go ahead and update the rest except 5913 * in the case of an RPC failure, fail immediately. 5914 */ 5915 if (clnt_upd_nr_flags(nd->nd_nodename, sp, 5916 sd->sd_nodelist, MD_NR_SET, MNSET_IN_RECONFIG, ep)) { 5917 /* RPC failure to another node */ 5918 if (mdanyrpcerror(ep)) { 5919 return (205); 5920 } 5921 nd = nd->nd_next; 5922 rval = -1; 5923 continue; 5924 } 5925 nd = nd->nd_next; 5926 } 5927 5928 meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, 5929 "Join of all nodes completed for set %s: %s"), 5930 sp->setname, meta_print_hrtime(gethrtime() - start_time)); 5931 5932 return (rval); 5933 } 5934