1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 #include <meta.h> 30 #include <sdssc.h> 31 #include <signal.h> 32 #include <syslog.h> 33 #include <sys/types.h> 34 #include <sys/wait.h> 35 #include <sys/lvm/md_mirror.h> 36 #include <metad.h> 37 38 #define MY_VERSION "1.0" /* the highest supported version */ 39 #define MAX_DEBUG_LEVEL 5 /* maximum verbosity level */ 40 41 #define RESET_OWNER 0x0001 42 #define CHOOSE_OWNER 0x0002 43 #define RESET_ABR 0x0004 44 #define UPDATE_ABR 0x0008 45 #define GET_MIRROR_STATE 0x0010 46 47 #define SET_INFO_NO_WR 0x0002 48 #define SET_INFO_MN 0x0004 49 50 /* 51 * This table defines all the metaclust reconfig steps we understand 52 */ 53 typedef enum stpnum { 54 MC_UNK = 0, 55 MC_START, 56 MC_STOP, 57 MC_ABORT, 58 MC_RETURN, 59 MC_STEP1, 60 MC_STEP2, 61 MC_STEP3, 62 MC_STEP4 63 } stepnum_t; 64 65 /* 66 * Structure for step_name -> step_number mapping 67 */ 68 struct step_t { 69 char *step_nam; 70 stepnum_t step_num; 71 }; 72 73 /* 74 * Step name to step number mapping table 75 * This table MUST be sorted alphabetically in ascending order of step name 76 */ 77 static struct step_t step_table[] = { 78 { "abort", MC_ABORT }, 79 { "return", MC_RETURN }, 80 { "start", MC_START }, 81 { "step1", MC_STEP1 }, 82 { "step2", MC_STEP2 }, 83 { "step3", MC_STEP3 }, 84 { "step4", MC_STEP4 }, 85 { "stop", MC_STOP } 86 }; 87 88 /* 89 * If support for a different version is added, the new version number should 90 * be appended to the version_table below. This list will be searched to 91 * determine if a version requested via the -V option is supported or not. 92 */ 93 static char *version_table[] = { 94 MY_VERSION 95 }; 96 97 uint_t timeout = 0; /* disable timeout by default */ 98 char *version = MY_VERSION; /* use latest version by default */ 99 int stepnum = MC_UNK; /* reconfiguration step number */ 100 pid_t c_pid; /* child process id */ 101 102 /* 103 * Binary search comparison routine 104 */ 105 static int 106 mc_compare(const void *stp1, const void *stp2) 107 { 108 return (strcmp((const char *)stp1, 109 ((const struct step_t *)stp2)->step_nam)); 110 } 111 112 /* 113 * Timeout expiry alarm signal handler 114 */ 115 /*ARGSUSED*/ 116 static void 117 sigalarmhandler(int sig) 118 { 119 int i, n, ret, stat_loc = 0; 120 121 n = sizeof (step_table) / sizeof (step_table[0]); 122 for (i = 0; i < n; i++) { 123 if (stepnum == step_table[i].step_num) 124 break; 125 } 126 127 assert(i != n); 128 129 meta_mc_log(MC_LOG1, gettext("Timeout expired in %s: %s"), 130 step_table[i].step_nam, 131 meta_print_hrtime(gethrtime() - start_time)); 132 133 if ((ret = kill(c_pid, SIGKILL)) == 0) { 134 /* 135 * The child will wait forever until the status is retrieved 136 * so get it now. Keep retrying if the call is interrupted. 137 * 138 * The possible results are, 139 * 140 * - child killed successfully 141 * - signal sent but child not killed 142 * - waitpid failed/interrupted 143 */ 144 sleep(2); 145 while ((ret = waitpid(c_pid, &stat_loc, WNOHANG)) < 0) { 146 if (errno != EINTR) { 147 break; 148 } 149 } 150 if ((ret == c_pid) || (errno == ECHILD)) { 151 ret = 0; 152 } else { 153 ret = 1; 154 } 155 } else if (errno == ESRCH) { 156 /* 157 * If the kill did not catch the child then it means the child 158 * exited immediately after the timeout occured. 159 */ 160 ret = 0; 161 } 162 163 /* 164 * make sure not to exit with 205 for any steps other than step1-step4. 165 * Suncluster reconfiguration can't handle it otherwise. 166 */ 167 switch (stepnum) { 168 case MC_STEP1: 169 case MC_STEP2: 170 case MC_STEP3: 171 case MC_STEP4: 172 /* 173 * If the child was killed successfully return 205 for a 174 * new reconfig cycle otherwise send 1 to panic the node. 175 */ 176 if (ret != 0) { 177 md_eprintf(gettext("Could not kill child\n")); 178 exit(1); 179 } else { 180 exit(205); 181 } 182 break; 183 case MC_START: 184 case MC_STOP: 185 case MC_ABORT: 186 case MC_RETURN: 187 default: 188 exit(1); 189 break; 190 } 191 } 192 193 /* 194 * Attempt to load local set. 195 * Returns: 196 * pointer to mdsetname_t for local set (local_sp) is successful. 197 * 0 if failure 198 * if there are no local set mddbs, no error message is printed. 199 * Otherwise, error message is printed so that user 200 * can determine why the local set didn't start. 201 */ 202 mdsetname_t * 203 load_local_set(md_error_t *ep) 204 { 205 mdsetname_t *local_sp = NULL; 206 207 /* Does local set exist? If not, give no error */ 208 if ((local_sp = metasetname(MD_LOCAL_NAME, ep)) == NULL) { 209 return (0); 210 } 211 212 /* 213 * snarf local set 214 * If fails with MDE_DB_NODB, then just return 1 printing 215 * no failure. 216 * Otherwise, print error message, and return 1. 217 */ 218 if (meta_setup_db_locations(ep) != 0) { 219 if (!(mdismddberror(ep, MDE_DB_NODB))) 220 mde_perror(ep, ""); 221 return (0); 222 } 223 224 /* local set loaded successfully */ 225 return (local_sp); 226 } 227 228 /* 229 * Purpose: Compose a full path name for a metadevice 230 * 231 * On entry: sp - setname pointer 232 * mnum - minor number of metadevice 233 * pathname - pointer to array to return path string 234 * pathlen - max length of pathname array 235 */ 236 static int 237 compose_path(mdsetname_t *sp, int mnum, char *pathname, int pathlen) 238 { 239 int rtn; 240 mdname_t *np; 241 md_error_t status = mdnullerror; 242 243 if (MD_MIN2SET(mnum) != sp->setno) { 244 md_eprintf(gettext("minor number 0x%x invalid for set %d\n"), 245 mnum, sp->setno); 246 return (-1); 247 } 248 249 if ((np = metamnumname(&sp, mnum, 0, &status)) == NULL) { 250 return (-1); 251 } 252 253 rtn = snprintf(pathname, pathlen, "%s", np->rname); 254 255 if ((pathname[0] == '\0') || (rtn >= pathlen)) { 256 md_eprintf(gettext( 257 "Could not create path for device %s\n"), 258 get_mdname(sp, mnum)); 259 return (-1); 260 } 261 return (0); 262 } 263 264 /* 265 * Purpose: Walk through all the devices specified for the given set 266 * and do the action specified in mode 267 */ 268 static int 269 reset_state(uint_t mode, mdsetname_t *sp, char *drivername, md_error_t *ep) 270 { 271 mdnamelist_t *devnlp = NULL; 272 mdnamelist_t *p; 273 mdname_t *devnp = NULL; 274 md_set_mmown_params_t ownpar_p; 275 md_set_mmown_params_t *ownpar = &ownpar_p; 276 md_unit_t *mm; 277 int mirror_dev = 0; 278 mndiskset_membershiplist_t *nl; 279 int cnt; 280 int has_parent; 281 md_mn_get_mir_state_t mir_state_p; 282 md_mn_get_mir_state_t *mir_state = &mir_state_p; 283 284 /* 285 * if we are choosing or resetting the owners then make sure 286 * we are only doing it for mirror devices 287 */ 288 mirror_dev = (strcmp(MD_MIRROR, drivername) == 0); 289 if ((mode & (RESET_OWNER | CHOOSE_OWNER)) && !mirror_dev) { 290 return (-1); 291 } 292 293 /* get a list of all the metadevices for current set */ 294 if (mirror_dev && meta_get_mirror_names(sp, &devnlp, 0, ep) < 0) { 295 mde_perror(ep, gettext("Could not get mirrors for set %s"), 296 sp->setname); 297 return (-1); 298 } else if (meta_get_sp_names(sp, &devnlp, 0, ep) < 0) { 299 mde_perror(ep, gettext( 300 "Could not get soft partitions for set %s"), sp->setname); 301 return (-1); 302 } 303 304 /* If resetting the owner, get the known membership list */ 305 if (mode & RESET_OWNER) { 306 if (meta_read_nodelist(&cnt, &nl, ep)) { 307 mde_perror(ep, "Could not get nodelist"); 308 return (-1); 309 } 310 } 311 312 /* for each metadevice */ 313 for (p = devnlp; (p != NULL); p = p->next) { 314 devnp = p->namep; 315 316 /* 317 * Get the current setting for mirror ABR state and all of the 318 * submirror state and flags from the master node. We only 319 * perform this when going through a 'start' cycle. 320 */ 321 if ((mode & GET_MIRROR_STATE) && mirror_dev) { 322 char *miscname; 323 324 /* 325 * Ensure that we ignore soft-parts that are returned 326 * from the meta_get_mirror_names() call 327 */ 328 if ((miscname = metagetmiscname(devnp, ep)) == NULL) 329 goto out; 330 if (strcmp(miscname, MD_MIRROR) != 0) 331 continue; 332 333 mir_state->mnum = meta_getminor(devnp->dev); 334 MD_SETDRIVERNAME(mir_state, MD_MIRROR, sp->setno); 335 meta_mc_log(MC_LOG4, gettext("Getting mirror state" 336 " for %s: %s"), get_mdname(sp, mir_state->mnum), 337 meta_print_hrtime(gethrtime() - start_time)); 338 339 if (metaioctl(MD_MN_GET_MIRROR_STATE, mir_state, ep, 340 "MD_MN_GET_MIRROR_STATE") != 0) { 341 mde_perror(ep, gettext("Unable to get " 342 "mirror state for %s"), 343 get_mdname(sp, mir_state->mnum)); 344 goto out; 345 } else { 346 continue; 347 } 348 } 349 350 /* check if this is a top level metadevice */ 351 if ((mm = meta_get_mdunit(sp, devnp, ep)) == NULL) 352 goto out; 353 if (MD_HAS_PARENT(MD_PARENT(mm))) { 354 has_parent = 1; 355 } else { 356 has_parent = 0; 357 } 358 Free(mm); 359 360 if (mode & (RESET_OWNER | CHOOSE_OWNER)) { 361 char *miscname; 362 363 /* 364 * we can only do these for mirrors so make sure we 365 * really have a mirror device and not a softpartition 366 * imitating one. meta_get_mirror_names seems to think 367 * softparts on top of a mirror are mirrors! 368 */ 369 if ((miscname = metagetmiscname(devnp, ep)) == NULL) 370 goto out; 371 if (strcmp(miscname, MD_MIRROR) != 0) 372 continue; 373 374 (void) memset(ownpar, 0, sizeof (*ownpar)); 375 ownpar->d.mnum = meta_getminor(devnp->dev); 376 MD_SETDRIVERNAME(ownpar, MD_MIRROR, sp->setno); 377 378 meta_mc_log(MC_LOG4, gettext("Setting owner " 379 "for %s: %s"), get_mdname(sp, ownpar->d.mnum), 380 meta_print_hrtime(gethrtime() - start_time)); 381 382 /* get the current owner id */ 383 if (metaioctl(MD_MN_GET_MM_OWNER, ownpar, ep, 384 "MD_MN_GET_MM_OWNER") != 0) { 385 mde_perror(ep, gettext("Unable to get " 386 "mirror owner for %s"), 387 get_mdname(sp, ownpar->d.mnum)); 388 goto out; 389 } 390 } 391 392 if (mode & RESET_OWNER) { 393 if (ownpar->d.owner == MD_MN_MIRROR_UNOWNED) { 394 mdclrerror(ep); 395 continue; 396 } 397 398 /* 399 * reset owner only if the current owner is 400 * not in the membership list 401 * Also kill the resync thread so that when the resync 402 * is started, it will perform an optimized resync 403 * for any resync regions that were dirty when the 404 * current owner left the membership. 405 */ 406 if (meta_is_member(NULL, ownpar->d.owner, nl) != 1) { 407 if (meta_mn_change_owner(&ownpar, 408 sp->setno, ownpar->d.mnum, 409 MD_MN_MIRROR_UNOWNED, 410 MD_MN_MM_ALLOW_CHANGE) == -1) { 411 md_eprintf(gettext( 412 "Unable to reset mirror owner " 413 "for %s\n"), 414 get_mdname(sp, ownpar->d.mnum)); 415 goto out; 416 } 417 if (meta_mirror_resync(sp, devnp, 0, ep, 418 MD_RESYNC_KILL_NO_WAIT) != 0) { 419 md_eprintf(gettext( 420 "Unable to kill resync for" 421 " %s\n"), 422 get_mdname(sp, ownpar->d.mnum)); 423 goto out; 424 } 425 } 426 } 427 428 if (mode & CHOOSE_OWNER) { 429 /* 430 * only orphaned resyncs will have no owner. 431 * if that is the case choose a new owner. Otherwise 432 * re-establish the existing owner. This covers the 433 * case where a node that owned the mirror 434 * reboots/panics and comes back into the cluster before 435 * the reconfig cycle has completed. In this case the 436 * other cluster nodes will have the mirror owner marked 437 * as the rebooted node while it has the owner marked 438 * as 'None'. We have to reestablish the ownership so 439 * that the subsequent resync can continue. 440 */ 441 if (meta_mn_change_owner(&ownpar, sp->setno, 442 ownpar->d.mnum, ownpar->d.owner, 443 MD_MN_MM_CHOOSE_OWNER) == -1) { 444 md_eprintf(gettext("Unable to choose " 445 "mirror owner for %s\n"), 446 get_mdname(sp, ownpar->d.mnum)); 447 goto out; 448 } 449 } 450 451 /* 452 * For RESET_ABR and UPDATE_ABR - only handle top 453 * level metadevices. 454 */ 455 if (has_parent) 456 continue; 457 458 if (mode & RESET_ABR) { 459 /* 460 * Reset the ABR (application based recovery) 461 * value on all nodes. We are dealing with 462 * the possibility that we have ABR set but the 463 * only node that had the device open with ABR has 464 * left the cluster. We simply open and close the 465 * device and if this is the last close in the 466 * cluster, ABR will be cleared on all nodes. 467 */ 468 char *miscname; 469 char name[MAXPATHLEN]; 470 int mnum, fd; 471 472 name[0] = '\0'; 473 mnum = meta_getminor(devnp->dev); 474 475 /* 476 * Ensure that we don't include soft-parts in the 477 * mirror-only call to RESET_ABR. meta_get_mirror_names 478 * returns a bogus list that includes all soft-parts 479 * built on mirrors. 480 */ 481 if ((miscname = metagetmiscname(devnp, ep)) == NULL) 482 goto out; 483 if (mirror_dev && (strcmp(miscname, MD_MIRROR) != 0)) 484 continue; 485 486 meta_mc_log(MC_LOG4, gettext("Re-setting ABR state " 487 "for %s: %s"), get_mdname(sp, mnum), 488 meta_print_hrtime(gethrtime() - start_time)); 489 490 /* compose the absolute device path and open it */ 491 if (compose_path(sp, mnum, &name[0], 492 sizeof (name)) != 0) 493 goto out; 494 if ((fd = open(name, O_RDWR, 0)) < 0) { 495 md_perror(gettext("Could not open device %s"), 496 name); 497 continue; 498 } 499 500 (void) close(fd); 501 } 502 503 if (mode & UPDATE_ABR) { 504 /* 505 * Update the ABR value on this node. We obtain the 506 * current ABR state from the master node. 507 */ 508 509 char *miscname; 510 char name[MAXPATHLEN]; 511 int mnum, fd; 512 volcap_t vc; 513 uint_t tstate; 514 515 name[0] = '\0'; 516 mnum = meta_getminor(devnp->dev); 517 518 /* 519 * Ensure that we don't include soft-parts in the 520 * mirror-only call to UPDATE_ABR. meta_get_mirror_names 521 * returns a bogus list that includes all soft-parts 522 * built on mirrors. 523 */ 524 if ((miscname = metagetmiscname(devnp, ep)) == NULL) 525 goto out; 526 if (mirror_dev && (strcmp(miscname, MD_MIRROR) != 0)) 527 continue; 528 529 /* Get tstate from Master */ 530 if (meta_mn_send_get_tstate(devnp->dev, &tstate, ep) 531 != 0) 532 continue; 533 /* If not set on the master, nothing to do */ 534 if (!(tstate & MD_ABR_CAP)) 535 continue; 536 537 meta_mc_log(MC_LOG4, gettext("Updating ABR state " 538 "for %s: %s"), get_mdname(sp, mnum), 539 meta_print_hrtime(gethrtime() - start_time)); 540 541 /* compose the absolute device path and open it */ 542 if (compose_path(sp, mnum, &name[0], 543 sizeof (name)) != 0) 544 goto out; 545 if ((fd = open(name, O_RDWR, 0)) < 0) { 546 md_perror(gettext("Could not open device %s"), 547 name); 548 continue; 549 } 550 551 /* set ABR state */ 552 vc.vc_info = 0; 553 vc.vc_set = 0; 554 if (ioctl(fd, DKIOCGETVOLCAP, &vc) < 0) { 555 /* 556 * Ignore if device does not support this 557 * ioctl 558 */ 559 if ((errno != ENOTTY) && (errno != ENOTSUP)) { 560 md_perror(gettext("Could not get " 561 "ABR/DMR state for device %s"), 562 name); 563 } 564 (void) close(fd); 565 continue; 566 } 567 if (!(vc.vc_info & (DKV_ABR_CAP | DKV_DMR_CAP))) { 568 (void) close(fd); 569 continue; 570 } 571 572 vc.vc_set = DKV_ABR_CAP; 573 if (ioctl(fd, DKIOCSETVOLCAP, &vc) < 0) { 574 md_perror(gettext( 575 "Could not set ABR state for " 576 "device %s"), name); 577 (void) close(fd); 578 goto out; 579 } else { 580 md_eprintf(gettext( 581 "Setting ABR state on device %s\n"), name); 582 } 583 584 (void) close(fd); 585 } 586 } 587 588 /* cleanup */ 589 if (mode & RESET_OWNER) { 590 meta_free_nodelist(nl); 591 } 592 metafreenamelist(devnlp); 593 return (0); 594 595 out: 596 /* cleanup */ 597 if (mode & RESET_OWNER) { 598 meta_free_nodelist(nl); 599 } 600 metafreenamelist(devnlp); 601 return (-1); 602 } 603 604 /* 605 * Print usage message 606 */ 607 static void 608 usage(mdsetname_t *sp, int eval) 609 { 610 (void) fprintf(stderr, gettext("usage:" 611 "\t%s [-V version] [-t timeout] [-d level] start localnodeid\n" 612 "\t%s [-V version] [-t timeout] [-d level] step nodelist...\n" 613 "\t%s [-V version] [-t timeout] [-d level] abort | stop\n" 614 "\t%s [-V | -? | -h]\n"), 615 myname, myname, myname, myname); 616 if (!eval) { 617 fprintf(stderr, gettext("\n" 618 "\tValid debug (-d) levels are 1-%d for increasing " 619 "verbosity.\n\tDefault is -d 3.\n\n" 620 "\tValid step values are: return | step1 | step2 | " 621 "step3 | step4\n\n" 622 "\tNodelist is a space-separated list of node id's\n\n"), 623 MAX_DEBUG_LEVEL); 624 } 625 md_exit(sp, eval); 626 } 627 628 /* 629 * Input: Input takes a config step name followed by a list of 630 * possible node id's. 631 * 632 * Returns: 0 - Success 633 * 1 - Fail 634 * Node will be removed from cluster membership 635 * by forcing node to panic. 636 * 205 - Unsuccessful. Start another reconfig cycle. 637 * Problem was encountered that could be fixed by 638 * running another reconfig cycle. 639 * Problem could be a result of a failure to read 640 * the nodelist file or that all work could not be 641 * accomplished in a reconfig step in the amount of 642 * time given so another reconfig cycle is needed in 643 * order to finish the current step. 644 */ 645 int 646 main(int argc, char **argv) 647 { 648 mdsetname_t *sp = NULL; 649 md_error_t status = mdnullerror; 650 md_error_t *ep = &status; 651 set_t max_sets, setno; 652 int c, clust = 0; 653 struct sigaction nsa, osa; 654 struct step_t *step_ptr; 655 mdsetname_t *local_sp = NULL; 656 md_drive_desc *dd; 657 int rval = 0; 658 md_set_desc *sd; 659 mddb_block_parm_t mbp; 660 uint_t debug = 3; /* log upto MC_LOG3 by default */ 661 int version_table_size; 662 mddb_setflags_config_t sf; 663 int ret_val; 664 mddb_config_t cfg; 665 int set_info[MD_MAXSETS]; 666 long commd_timeout = 0; 667 668 /* 669 * Get the locale set up before calling any other routines 670 * with messages to ouput. Just in case we're not in a build 671 * environment, make sure that TEXT_DOMAIN gets set to 672 * something. 673 */ 674 #if !defined(TEXT_DOMAIN) 675 #define TEXT_DOMAIN "SYS_TEST" 676 #endif 677 (void) setlocale(LC_ALL, ""); 678 (void) textdomain(TEXT_DOMAIN); 679 680 if ((clust = sdssc_bind_library()) == SDSSC_ERROR) { 681 md_eprintf(gettext("Interface error with libsds_sc.so\n")); 682 exit(1); 683 } 684 685 if (md_init(argc, argv, 1, 1, ep) != 0 || meta_check_root(ep) != 0) { 686 mde_perror(ep, ""); 687 md_exit(sp, 1); 688 } 689 690 /* 691 * open log and enable libmeta logging. Do it here explicitly 692 * rather than letting md_init() do it because we are not really 693 * a daemon and that is what md_init() opens the log as. 694 */ 695 openlog("metaclust", LOG_CONS, LOG_USER); 696 697 version_table_size = sizeof (version_table) / sizeof (version_table[0]); 698 699 optind = 1; 700 opterr = 0; 701 while ((c = getopt(argc, argv, "hd:V:t:?")) != -1) { 702 switch (c) { 703 case 'h': 704 usage(sp, 0); 705 break; 706 707 case 'd': 708 if (sscanf(optarg, "%u", &debug) != 1) { 709 md_eprintf(gettext("Invalid debug level\n")); 710 md_exit(sp, 1); 711 } else if ((debug < 1) || (debug > MAX_DEBUG_LEVEL)) { 712 debug = min(max(debug, 1), MAX_DEBUG_LEVEL); 713 md_eprintf(gettext("Debug level must be " 714 "between 1 and %d inclusive.\n"), 715 MAX_DEBUG_LEVEL); 716 md_eprintf(gettext("Debug level set to %d.\n"), 717 debug); 718 } 719 break; 720 721 case 'V': 722 version = Strdup(optarg); 723 break; 724 725 case 't': 726 if (sscanf(optarg, "%u", &timeout) != 1) { 727 md_eprintf(gettext("Invalid timeout value\n")); 728 md_exit(sp, 1); 729 } 730 break; 731 732 case '?': 733 if (optopt == '?') { 734 usage(sp, 0); 735 } else if (optopt == 'V') { 736 int i; 737 738 fprintf(stdout, gettext( 739 "%s: Versions Supported:"), myname); 740 for (i = 0; i < version_table_size; i++) { 741 fprintf(stdout, " %s", 742 version_table[i]); 743 } 744 fprintf(stdout, "\n"); 745 md_exit(sp, 0); 746 } 747 /*FALLTHROUGH*/ 748 749 default: 750 usage(sp, 1); 751 break; 752 } 753 } 754 755 /* initialise the debug level and start time */ 756 setup_mc_log(debug); 757 758 /* 759 * check that the version specified (if any) is supported. 760 */ 761 if (version != NULL) { 762 int i, found = 0; 763 764 for (i = 0; i < version_table_size; i++) { 765 if (strcmp(version, version_table[i]) == 0) { 766 found = 1; 767 break; 768 } 769 } 770 if (!found) { 771 md_eprintf(gettext("Version %s not supported\n"), 772 version); 773 md_exit(sp, 1); 774 } 775 } 776 777 argc -= optind; 778 argv += optind; 779 780 /* parse arguments */ 781 if (argc <= 0) { 782 usage(sp, 1); 783 } 784 785 /* convert the step name to the corresponding number */ 786 step_ptr = bsearch(argv[0], step_table, (sizeof (step_table) / 787 sizeof (step_table[0])), sizeof (step_table[0]), mc_compare); 788 if (step_ptr != NULL) { 789 stepnum = step_ptr->step_num; 790 } 791 792 --argc; 793 ++argv; 794 795 /* set timeout alarm signal, a value of 0 will disable timeout */ 796 if (timeout > 0) { 797 int stat_loc = 0; 798 commd_timeout = (long)(timeout * .75); 799 800 c_pid = fork(); 801 802 if (c_pid == (pid_t)-1) { 803 md_perror(gettext("Unable to fork")); 804 md_exit(sp, 1); 805 } else if (c_pid) { 806 /* parent */ 807 nsa.sa_flags = 0; 808 if (sigfillset(&nsa.sa_mask) < 0) { 809 md_perror(gettext("Unable to set signal mask")); 810 md_exit(sp, 1); 811 } 812 813 nsa.sa_handler = sigalarmhandler; 814 if (sigaction(SIGALRM, &nsa, &osa) == -1) { 815 md_perror(gettext("Unable to set alarm " 816 "handler")); 817 md_exit(sp, 1); 818 } 819 820 (void) alarm(timeout); 821 822 /* 823 * wait for child to exit or timeout to expire. 824 * keep retrying if the call is interrupted 825 */ 826 while ((ret_val = waitpid(c_pid, &stat_loc, 0)) < 0) { 827 if (errno != EINTR) { 828 break; 829 } 830 } 831 if (ret_val == c_pid) { 832 /* exit with the childs exit value */ 833 exit(WEXITSTATUS(stat_loc)); 834 } else if (errno == ECHILD) { 835 md_exit(sp, 0); 836 } else { 837 perror(myname); 838 md_exit(sp, 1); 839 } 840 } 841 } 842 843 /* 844 * If a timeout value is given, everything from this point onwards is 845 * executed in the child process. 846 */ 847 848 switch (stepnum) { 849 case MC_START: 850 /* 851 * Start Step 852 * 853 * - Suspend all rpc.mdcommd messages 854 */ 855 856 /* expect the local node id to be given only */ 857 if (argc != 1) 858 usage(sp, 1); 859 860 meta_mc_log(MC_LOG2, gettext("Starting Start step: %s"), 861 meta_print_hrtime(0)); 862 863 /* 864 * Does local set exist? If not, exit with 0 865 * since there's no reason to have this node panic if 866 * the local set cannot be started. 867 */ 868 if ((local_sp = load_local_set(ep)) == NULL) { 869 md_exit(local_sp, 0); 870 } 871 872 if ((max_sets = get_max_sets(ep)) == 0) { 873 mde_perror(ep, ""); 874 md_exit(sp, 1); 875 } 876 877 /* start walking through all possible disksets */ 878 for (setno = 1; setno < max_sets; setno++) { 879 if ((sp = metasetnosetname(setno, ep)) == NULL) { 880 if (mdiserror(ep, MDE_NO_SET)) { 881 /* No set for this setno - continue */ 882 mdclrerror(ep); 883 continue; 884 } else { 885 mde_perror(ep, gettext("Unable to " 886 "get set %d information"), setno); 887 md_exit(sp, 1); 888 } 889 } 890 891 /* only check multi-node disksets */ 892 if (!meta_is_mn_set(sp, ep)) { 893 mdclrerror(ep); 894 continue; 895 } 896 897 meta_mc_log(MC_LOG3, gettext("Start - block parse " 898 "messages for set %s: %s"), sp->setname, 899 meta_print_hrtime(gethrtime() - start_time)); 900 901 /* 902 * Mddb parse messages are sent amongst the nodes 903 * in a diskset whenever the locator block or 904 * locator names structure has been changed. 905 * A locator block change could occur as a result 906 * of a disk failure during the reconfig cycle, 907 * so block the mddb parse messages while the 908 * rpc.mdcommd is suspended during the reconfig cycle. 909 */ 910 if (s_ownset(sp->setno, ep) == MD_SETOWNER_YES) { 911 (void) memset(&mbp, 0, sizeof (mbp)); 912 mbp.c_setno = setno; 913 mbp.c_blk_flags = MDDB_BLOCK_PARSE; 914 if (metaioctl(MD_MN_MDDB_BLOCK, &mbp, 915 &mbp.c_mde, NULL)) { 916 mdstealerror(ep, &mbp.c_mde); 917 mde_perror(ep, gettext("Could not " 918 "block set %s"), sp->setname); 919 md_exit(sp, 1); 920 } 921 } 922 923 /* suspend commd and spin waiting for drain */ 924 while ((ret_val = mdmn_suspend(setno, 925 MD_COMM_ALL_CLASSES, commd_timeout)) == 926 MDE_DS_COMMDCTL_SUSPEND_NYD) { 927 sleep(1); 928 } 929 930 if (ret_val) { 931 md_eprintf(gettext("Could not suspend " 932 "rpc.mdcommd for set %s\n"), sp->setname); 933 md_exit(sp, 1); 934 } 935 936 /* 937 * Set start step flag for set. This is set to indicate 938 * that this node entered the reconfig cycle through 939 * the start step. This is used during the reconfig 940 * cycle to determine whether the node had entered 941 * through the start step or the return step. 942 */ 943 (void) memset(&sf, 0, sizeof (sf)); 944 sf.sf_setno = sp->setno; 945 sf.sf_setflags = MD_SET_MN_START_RC; 946 sf.sf_flags = MDDB_NM_SET; 947 /* Use magic to help protect ioctl against attack. */ 948 sf.sf_magic = MDDB_SETFLAGS_MAGIC; 949 if (metaioctl(MD_MN_SET_SETFLAGS, &sf, 950 &sf.sf_mde, NULL)) { 951 mdstealerror(ep, &sf.sf_mde); 952 mde_perror(ep, gettext("Could not set " 953 "start_step flag for set %s"), sp->setname); 954 md_exit(sp, 1); 955 } 956 957 } 958 959 meta_mc_log(MC_LOG2, gettext("Start step completed: %s"), 960 meta_print_hrtime(gethrtime() - start_time)); 961 962 break; 963 964 case MC_STOP: 965 /* 966 * Stop Step 967 * 968 * - ??? 969 */ 970 971 /* don't expect any more arguments to follow the step name */ 972 if (argc != 0) 973 usage(sp, 1); 974 975 break; 976 977 case MC_ABORT: 978 /* 979 * Abort Step 980 * 981 * - Abort rpc.mdcommd 982 */ 983 984 /* don't expect any more arguments to follow the step name */ 985 if (argc != 0) 986 usage(sp, 1); 987 988 meta_mc_log(MC_LOG2, gettext("Starting Abort step: %s"), 989 meta_print_hrtime(0)); 990 991 /* 992 * Does local set exist? If not, exit with 0 993 * since there's no reason to have this node panic if 994 * the local set cannot be started. 995 */ 996 if ((local_sp = load_local_set(ep)) == NULL) { 997 md_exit(local_sp, 0); 998 } 999 1000 /* 1001 * abort the rpc.mdcommd. The abort is only issued on this node 1002 * meaning that the abort reconfig step is called on this 1003 * node before a panic while the rest of the cluster will 1004 * undergo a reconfig cycle. 1005 * There is no time relation between this node running a 1006 * reconfig abort and the the rest of the cluster 1007 * running a reconfig cycle meaning that this node may 1008 * panic before, during or after the cluster has run 1009 * a reconfig cycle. 1010 */ 1011 mdmn_abort(); 1012 1013 meta_mc_log(MC_LOG2, gettext("Abort step completed: %s"), 1014 meta_print_hrtime(gethrtime() - start_time)); 1015 1016 break; 1017 1018 case MC_RETURN: 1019 /* 1020 * Return Step 1021 * 1022 * - Grab local set lock, issue rpc.mdcommd DRAIN ALL 1023 * and release local set lock. Grabbing the local set 1024 * lock allows any active metaset/metadb commands to 1025 * terminate gracefully and will keep a metaset/metadb 1026 * command from starting until the DRAIN ALL is issued. 1027 * The metaset/metadb commands can issue 1028 * DRAIN ALL/RESUME ALL commands to rpc.mdcommd, 1029 * so the return step must not issue the DRAIN ALL command 1030 * until metaset/metadb have finished or metaset may issue 1031 * a RESUME ALL after this return reconfig step has issued 1032 * the DRAIN ALL command. 1033 * After this reconfig step has issued the DRAIN_ALL and 1034 * released the local set lock, metaset/metadb will fail 1035 * when attempting to contact the rpc.mdcommd and will 1036 * terminate without making any configuration changes. 1037 * The DRAIN ALL command will keep all other meta* commands 1038 * from running during the reconfig cycle (these commands 1039 * will wait until the rpc.mdcommd is resumed) since the 1040 * reconfig cycle may be changing the diskset configuration. 1041 */ 1042 1043 /* expect the nodelist to follow the step name */ 1044 if (argc < 1) 1045 usage(sp, 1); 1046 1047 meta_mc_log(MC_LOG2, gettext("Starting Return step: %s"), 1048 meta_print_hrtime(0)); 1049 1050 /* 1051 * Does local set exist? If not, exit with 0 1052 * since there's no reason to have this node panic if 1053 * the local set cannot be started. 1054 */ 1055 if ((local_sp = load_local_set(ep)) == NULL) { 1056 md_exit(local_sp, 0); 1057 } 1058 1059 /* 1060 * Suspend any mirror resyncs that are in progress. This 1061 * stops unnecessary timeouts. 1062 */ 1063 meta_mirror_resync_block_all(); 1064 1065 if (meta_lock(local_sp, TRUE, ep) != 0) { 1066 mde_perror(ep, ""); 1067 md_exit(local_sp, 1); 1068 } 1069 1070 /* 1071 * All metaset and metadb commands on this node have now 1072 * terminated gracefully. Now, issue a drain all to 1073 * the rpc.mdcommd. Any meta command issued after the 1074 * drain all will either spin sending the command to the 1075 * master until after the reconfig cycle has finished OR 1076 * will terminate gracefully (metaset/metadb). 1077 */ 1078 if ((max_sets = get_max_sets(ep)) == 0) { 1079 mde_perror(ep, ""); 1080 md_exit(sp, 1); 1081 } 1082 1083 /* start walking through all possible disksets */ 1084 for (setno = 1; setno < max_sets; setno++) { 1085 if ((sp = metasetnosetname(setno, ep)) == NULL) { 1086 if (mdiserror(ep, MDE_NO_SET)) { 1087 /* No set for this setno - continue */ 1088 mdclrerror(ep); 1089 continue; 1090 } else { 1091 mde_perror(ep, gettext("Unable to " 1092 "get set %d information"), setno); 1093 md_exit(sp, 1); 1094 } 1095 } 1096 1097 /* only check multi-node disksets */ 1098 if (!meta_is_mn_set(sp, ep)) { 1099 mdclrerror(ep); 1100 continue; 1101 } 1102 1103 meta_mc_log(MC_LOG3, gettext("Return - block parse " 1104 "messages for set %s: %s"), sp->setname, 1105 meta_print_hrtime(gethrtime() - start_time)); 1106 1107 /* 1108 * Mddb parse messages are sent amongst the nodes 1109 * in a diskset whenever the locator block or 1110 * locator names structure has been changed. 1111 * A locator block change could occur as a result 1112 * of a disk failure during the reconfig cycle, 1113 * so block the mddb parse messages while the 1114 * rpc.commd is suspended during the reconfig cycle. 1115 */ 1116 if (s_ownset(sp->setno, ep) == MD_SETOWNER_YES) { 1117 (void) memset(&mbp, 0, sizeof (mbp)); 1118 mbp.c_setno = setno; 1119 mbp.c_blk_flags = MDDB_BLOCK_PARSE; 1120 if (metaioctl(MD_MN_MDDB_BLOCK, &mbp, 1121 &mbp.c_mde, NULL)) { 1122 mdstealerror(ep, &mbp.c_mde); 1123 mde_perror(ep, gettext("Could not " 1124 "block set %s"), sp->setname); 1125 md_exit(sp, 1); 1126 } 1127 } 1128 1129 /* suspend commd and spin waiting for drain */ 1130 while ((ret_val = mdmn_suspend(setno, 1131 MD_COMM_ALL_CLASSES, commd_timeout)) == 1132 MDE_DS_COMMDCTL_SUSPEND_NYD) { 1133 sleep(1); 1134 } 1135 1136 if (ret_val) { 1137 md_eprintf(gettext("Could not suspend " 1138 "rpc.mdcommd for set %s\n"), sp->setname); 1139 md_exit(sp, 1); 1140 } 1141 } 1142 /* 1143 * Resume all I/Os for this node for all MN sets in 1144 * case master node had suspended I/Os but panic'd 1145 * before resuming I/Os. In case of failure, exit 1146 * with a 1 since unable to resume I/Os on this node. 1147 */ 1148 if (clnt_mn_susp_res_io(mynode(), 0, MN_RES_IO, ep)) { 1149 mde_perror(ep, gettext( 1150 "Unable to resume I/O on node %s for all sets"), 1151 mynode()); 1152 md_exit(sp, 1); 1153 } 1154 1155 1156 /* 1157 * Can now unlock local set lock. New metaset/metadb 1158 * commands are now held off using drain all. 1159 */ 1160 (void) meta_unlock(local_sp, ep); 1161 1162 meta_mc_log(MC_LOG2, gettext("Return step completed: %s"), 1163 meta_print_hrtime(gethrtime() - start_time)); 1164 1165 break; 1166 1167 case MC_STEP1: 1168 /* 1169 * Step 1 1170 * 1171 * - Populate nodelist file if we are on clustering 1172 * and pick a master node for each MN diskset. 1173 */ 1174 1175 /* expect the nodelist to follow the step name */ 1176 if (argc < 1) 1177 usage(sp, 1); 1178 1179 meta_mc_log(MC_LOG2, gettext("Starting Step1: %s"), 1180 meta_print_hrtime(0)); 1181 1182 /* Always write nodelist file even if no local set exists */ 1183 if (clust == SDSSC_OKAY) { 1184 /* skip to the nodelist args */ 1185 if (meta_write_nodelist(argc, argv, ep) != 0) { 1186 mde_perror(ep, gettext( 1187 "Could not populate nodelist file")); 1188 md_exit(sp, 1); 1189 } 1190 } 1191 1192 /* 1193 * Does local set exist? If not, exit with 0 1194 * since there's no reason to have this node panic if 1195 * the local set cannot be started. 1196 */ 1197 if ((local_sp = load_local_set(ep)) == NULL) { 1198 md_exit(local_sp, 0); 1199 } 1200 1201 /* 1202 * At this point, all meta* commands are blocked across 1203 * all disksets since the master rpc.mdcommd has drained or 1204 * the master node has died. 1205 * If a metaset or metadb command had been in progress 1206 * at the start of the reconfig cycle, this command has 1207 * either completed or it has been terminated due to 1208 * the death of the master node. 1209 * 1210 * This means that that it is now ok to remove any 1211 * outstanding clnt_locks associated with multinode 1212 * disksets on this node due to a node panic during 1213 * a metaset operation. This allows the routines that 1214 * choose the master to use rpc.metad to determine the 1215 * master of the diskset. 1216 */ 1217 if (clnt_clr_mnsetlock(mynode(), ep) != 0) { 1218 meta_mc_log(MC_LOG2, gettext("Step1 aborted:" 1219 "clear locks failed %s"), 1220 meta_print_hrtime(gethrtime() - start_time)); 1221 md_exit(local_sp, 1); 1222 } 1223 1224 /* 1225 * Call reconfig_choose_master to choose a master for 1226 * each MN diskset, update the nodelist for each diskset 1227 * given the member information and send a reinit message 1228 * to rpc.mdcommd to reload the nodelist. 1229 */ 1230 rval = meta_reconfig_choose_master(commd_timeout, ep); 1231 if (rval == 205) { 1232 /* 1233 * NOTE: Should issue call to reboot remote host that 1234 * is causing the RPC failure. Clustering to 1235 * provide interface in the future. This should 1236 * stop a never-ending set of 205 reconfig cycles. 1237 * Remote host causing failure is stored in 1238 * ep->host if ep is an RPC error. 1239 * if (mdanyrpcerror(ep)) 1240 * reboot (ep->host); 1241 */ 1242 meta_mc_log(MC_LOG2, gettext("Step1 aborted:" 1243 "choose master failure of 205 %s"), 1244 meta_print_hrtime(gethrtime() - start_time)); 1245 md_exit(local_sp, 205); 1246 } else if (rval != 0) { 1247 meta_mc_log(MC_LOG2, gettext("Step1 failure: " 1248 "choose master failure %s"), 1249 meta_print_hrtime(gethrtime() - start_time)); 1250 md_exit(local_sp, 1); 1251 } 1252 1253 meta_mc_log(MC_LOG2, gettext("Step1 completed: %s"), 1254 meta_print_hrtime(gethrtime() - start_time)); 1255 1256 md_exit(local_sp, rval); 1257 break; 1258 1259 case MC_STEP2: 1260 /* 1261 * Step 2 1262 * 1263 * In Step 2, each node walks the list of disksets. If a 1264 * node is a master of a MN diskset, it synchronizes 1265 * the local set USER records for that diskset. 1266 * 1267 * If disks exist in the diskset and there is a joined 1268 * (owner) node in the diskset, the master will also: 1269 * - synchronize the diskset mddbs to the master 1270 * - play the change log 1271 * 1272 * The master node will now attempt to join any unjoined 1273 * nodes that are currently members in the membership list. 1274 */ 1275 1276 /* expect the nodelist to follow the step name */ 1277 if (argc < 1) 1278 usage(sp, 1); 1279 1280 meta_mc_log(MC_LOG2, gettext("Starting Step2: %s"), 1281 meta_print_hrtime(0)); 1282 1283 /* 1284 * Does local set exist? If not, exit with 0 1285 * since there's no reason to have this node panic if 1286 * the local set cannot be started. 1287 */ 1288 if ((local_sp = load_local_set(ep)) == NULL) { 1289 md_exit(local_sp, 0); 1290 } 1291 1292 if ((max_sets = get_max_sets(ep)) == 0) { 1293 mde_perror(ep, ""); 1294 md_exit(local_sp, 1); 1295 } 1296 1297 /* start walking through all possible disksets */ 1298 for (setno = 1; setno < max_sets; setno++) { 1299 if ((sp = metasetnosetname(setno, ep)) == NULL) { 1300 if (mdiserror(ep, MDE_NO_SET)) { 1301 /* No set for this setno - continue */ 1302 mdclrerror(ep); 1303 continue; 1304 } else if (mdanyrpcerror(ep)) { 1305 /* Fail on RPC failure to self */ 1306 mde_perror(ep, gettext( 1307 "Unable to get information for " 1308 "set number %d"), setno); 1309 md_exit(local_sp, 1); 1310 } else { 1311 mde_perror(ep, gettext( 1312 "Unable to get information for " 1313 "set number %d"), setno); 1314 mdclrerror(ep); 1315 continue; 1316 } 1317 } 1318 1319 if ((sd = metaget_setdesc(sp, ep)) == NULL) { 1320 if (mdanyrpcerror(ep)) { 1321 /* Fail on RPC failure to self */ 1322 mde_perror(ep, gettext( 1323 "Unable to get information for " 1324 "set number %d"), setno); 1325 md_exit(local_sp, 1); 1326 } 1327 mde_perror(ep, gettext("Unable to get set " 1328 "%s desc information"), sp->setname); 1329 mdclrerror(ep); 1330 continue; 1331 } 1332 1333 /* Only check MN disksets */ 1334 if (!(MD_MNSET_DESC(sd))) { 1335 continue; 1336 } 1337 1338 /* All actions in step 2 are driven by master */ 1339 if (!(sd->sd_mn_am_i_master)) { 1340 continue; 1341 } 1342 1343 meta_mc_log(MC_LOG3, gettext("Step2 - begin record " 1344 "synchronization for set %s: %s"), sp->setname, 1345 meta_print_hrtime(gethrtime() - start_time)); 1346 1347 /* 1348 * Synchronize the USER records in the local mddbs 1349 * for hosts that are members. The USER records 1350 * contain set, drive and host information. 1351 */ 1352 rval = meta_mnsync_user_records(sp, ep); 1353 if (rval != 0) { 1354 mde_perror(ep, gettext( 1355 "Synchronization of user records " 1356 "in set %s failed\n"), sp->setname); 1357 if (rval == 205) { 1358 /* 1359 * NOTE: Should issue call to reboot 1360 * remote host that is causing the RPC 1361 * failure. Clustering to provide 1362 * interface in the future. This 1363 * should stop a never-ending set of 1364 * 205 reconfig cycles. 1365 * Remote host causing failure is 1366 * stored in ep->host if ep is an 1367 * RPC error. 1368 * if (mdanyrpcerror(ep)) 1369 * reboot (ep->host); 1370 */ 1371 md_exit(local_sp, 205); 1372 } else { 1373 md_exit(local_sp, 1); 1374 } 1375 } 1376 1377 /* Reget sd since sync_user_recs may have flushed it */ 1378 if ((sd = metaget_setdesc(sp, ep)) == NULL) { 1379 mde_perror(ep, gettext("Unable to get set " 1380 "%s desc information"), sp->setname); 1381 md_exit(local_sp, 1); 1382 } 1383 1384 dd = metaget_drivedesc(sp, 1385 (MD_BASICNAME_OK | PRINT_FAST), ep); 1386 if (! mdisok(ep)) { 1387 mde_perror(ep, gettext("Unable to get set " 1388 "%s drive information"), sp->setname); 1389 md_exit(local_sp, 1); 1390 } 1391 1392 /* 1393 * No drives in set, continue to next set. 1394 */ 1395 if (dd == NULL) { 1396 /* Done with this set */ 1397 continue; 1398 } 1399 1400 meta_mc_log(MC_LOG3, gettext("Step2 - local set user " 1401 "records completed for set %s: %s"), sp->setname, 1402 meta_print_hrtime(gethrtime() - start_time)); 1403 1404 /* 1405 * Synchronize the diskset mddbs for hosts 1406 * that are members. This may involve 1407 * playing the changelog and writing out 1408 * to the diskset mddbs. 1409 */ 1410 rval = meta_mnsync_diskset_mddbs(sp, ep); 1411 if (rval != 0) { 1412 mde_perror(ep, gettext( 1413 "Synchronization of diskset mddbs " 1414 "in set %s failed\n"), sp->setname); 1415 meta_mc_log(MC_LOG3, gettext("Step2 - diskset " 1416 "mddb synchronization failed for " 1417 "set %s: %s"), sp->setname, 1418 meta_print_hrtime(gethrtime() - 1419 start_time)); 1420 if (rval == 205) { 1421 /* 1422 * NOTE: Should issue call to reboot 1423 * remote host that is causing the RPC 1424 * failure. Clustering to provide 1425 * interface in the future. This 1426 * should stop a never-ending set of 1427 * 205 reconfig cycles. 1428 * Remote host causing failure is 1429 * stored in ep->host if ep is an 1430 * RPC error. 1431 * if (mdanyrpcerror(ep)) 1432 * reboot (ep->host); 1433 */ 1434 md_exit(local_sp, 205); 1435 } else if (rval == 1) { 1436 continue; 1437 } else { 1438 md_exit(local_sp, 1); 1439 } 1440 } 1441 1442 meta_mc_log(MC_LOG3, gettext("Step2 - diskset mddb " 1443 "synchronization completed for set %s: %s"), 1444 sp->setname, 1445 meta_print_hrtime(gethrtime() - start_time)); 1446 1447 /* Join the starting nodes to the diskset */ 1448 rval = meta_mnjoin_all(sp, ep); 1449 if (rval != 0) { 1450 mde_perror(ep, gettext( 1451 "Join of non-owner (starting) nodes " 1452 "in set %s failed\n"), sp->setname); 1453 meta_mc_log(MC_LOG3, gettext("Step2 - non owner" 1454 "nodes joined for set %s: %s"), 1455 sp->setname, 1456 meta_print_hrtime(gethrtime() - 1457 start_time)); 1458 if (rval == 205) { 1459 /* 1460 * NOTE: Should issue call to reboot 1461 * remote host that is causing the RPC 1462 * failure. Clustering to provide 1463 * interface in the future. This 1464 * should stop a never-ending set of 1465 * 205 reconfig cycles. 1466 * Remote host causing failure is 1467 * stored in ep->host if ep is an 1468 * RPC error. 1469 * if (mdanyrpcerror(ep)) 1470 * reboot (ep->host); 1471 */ 1472 md_exit(local_sp, 205); 1473 } else { 1474 md_exit(local_sp, 1); 1475 } 1476 } 1477 1478 meta_mc_log(MC_LOG3, gettext("Step2 - non owner nodes " 1479 "joined for set %s: %s"), sp->setname, 1480 meta_print_hrtime(gethrtime() - start_time)); 1481 1482 } 1483 1484 meta_mc_log(MC_LOG2, gettext("Step2 completed: %s"), 1485 meta_print_hrtime(gethrtime() - start_time)); 1486 1487 break; 1488 1489 case MC_STEP3: 1490 /* 1491 * Step 3 1492 * 1493 * For all multinode sets do, 1494 * - Reinitialise rpc.mdcommd 1495 * - Reset mirror owners to null if the current owner is 1496 * no longer in the membership list 1497 */ 1498 1499 /* expect the nodelist to follow the step name */ 1500 if (argc < 1) 1501 usage(sp, 1); 1502 1503 meta_mc_log(MC_LOG2, gettext("Starting Step3: %s"), 1504 meta_print_hrtime(0)); 1505 1506 /* 1507 * Does local set exist? If not, exit with 0 1508 * since there's no reason to have this node panic if 1509 * the local set cannot be started. 1510 */ 1511 if ((local_sp = load_local_set(ep)) == NULL) { 1512 md_exit(local_sp, 0); 1513 } 1514 1515 /* 1516 * walk through all sets on this node which could include: 1517 * - MN disksets 1518 * - traditional disksets 1519 * - non-existent disksets 1520 * start mirror resync for all MN sets 1521 */ 1522 if ((max_sets = get_max_sets(ep)) == 0) { 1523 mde_perror(ep, ""); 1524 md_exit(local_sp, 1); 1525 } 1526 1527 /* start walking through all possible disksets */ 1528 for (setno = 1; setno < max_sets; setno++) { 1529 if ((sp = metasetnosetname(setno, ep)) == NULL) { 1530 if (mdiserror(ep, MDE_NO_SET)) { 1531 /* No set for this setno - continue */ 1532 mdclrerror(ep); 1533 continue; 1534 } else { 1535 mde_perror(ep, gettext("Unable to " 1536 "get set %d information"), setno); 1537 md_exit(local_sp, 1); 1538 } 1539 } 1540 1541 /* only check multi-node disksets */ 1542 if (!meta_is_mn_set(sp, ep)) { 1543 mdclrerror(ep); 1544 continue; 1545 } 1546 1547 if (meta_lock(sp, TRUE, ep) != 0) { 1548 mde_perror(ep, ""); 1549 md_exit(local_sp, 1); 1550 } 1551 1552 /* If this node isn't joined to set, do nothing */ 1553 if (s_ownset(sp->setno, ep) != MD_SETOWNER_YES) { 1554 if (!mdisok(ep)) { 1555 mde_perror(ep, gettext("Could " 1556 "not get set %s ownership"), 1557 sp->setname); 1558 md_exit(sp, 1); 1559 } 1560 mdclrerror(ep); 1561 meta_unlock(sp, ep); 1562 continue; 1563 } 1564 1565 meta_mc_log(MC_LOG3, gettext("Step3 - begin " 1566 "re-initialising rpc.mdcommd and resetting mirror " 1567 "owners for set %s: %s"), sp->setname, 1568 meta_print_hrtime(gethrtime() - start_time)); 1569 1570 /* reinitialzse rpc.mdcommd with new nodelist */ 1571 if (mdmn_reinit_set(setno, commd_timeout)) { 1572 md_eprintf(gettext( 1573 "Could not re-initialise rpc.mdcommd for " 1574 "set %s\n"), sp->setname); 1575 md_exit(sp, 1); 1576 } 1577 1578 (void) memset(&cfg, 0, sizeof (cfg)); 1579 cfg.c_id = 0; 1580 cfg.c_setno = sp->setno; 1581 if (metaioctl(MD_DB_GETDEV, &cfg, &cfg.c_mde, 1582 NULL) != 0) { 1583 mdstealerror(ep, &cfg.c_mde); 1584 mde_perror(ep, gettext("Could " 1585 "not get set %s information"), 1586 sp->setname); 1587 md_exit(sp, 1); 1588 } 1589 1590 /* Don't do anything else if set is stale */ 1591 if (cfg.c_flags & MDDB_C_STALE) { 1592 meta_unlock(sp, ep); 1593 mdclrerror(ep); 1594 continue; 1595 } 1596 1597 /* reset mirror owners */ 1598 if (reset_state(RESET_OWNER, sp, MD_MIRROR, ep) == -1) { 1599 md_exit(sp, 1); 1600 } 1601 1602 meta_unlock(sp, ep); 1603 1604 meta_mc_log(MC_LOG3, gettext("Step3 - rpc.mdcommd " 1605 "re-initialised and mirror owners reset for " 1606 "set %s: %s"), sp->setname, 1607 meta_print_hrtime(gethrtime() - start_time)); 1608 } 1609 1610 meta_mc_log(MC_LOG2, gettext("Step3 completed: %s"), 1611 meta_print_hrtime(gethrtime() - start_time)); 1612 1613 break; 1614 1615 case MC_STEP4: 1616 /* 1617 * Step 4 1618 * 1619 * For all multinode sets do: 1620 * - Resume the rpc.mdcommd messages. Must resume all 1621 * sets before issuing I/O to any set since an error 1622 * encountered in a commd suspended set could be 1623 * blocked waiting for commd in another set to resume. 1624 * (This happens since the daemon queues service 1625 * all sets). An open of a soft partition causes 1626 * a read of the watermarks during the open. 1627 * - If set is non-writable (not an owner or STALE), then 1628 * continue to next set. 1629 * 1630 * For all multinode sets do, 1631 * - Reset ABR states for all mirrors, ie clear ABR if not 1632 * open on any node. 1633 * - Reset ABR states for all soft partitions, ie clear ABR if 1634 * not open on any node. 1635 * - For all slave nodes that have entered through the start 1636 * step, update the ABR state to that of the master and 1637 * get the submirror state from the master 1638 * - meta_lock set 1639 * - Resync all mirrors 1640 * - unlock meta_lock for this set. 1641 * - Choose a new owner for any orphaned resyncs 1642 * 1643 * There is one potential issue here. when concurrently 1644 * resetting and updating the ABR state. If the master has ABR 1645 * set, but should no longer have because the only node that 1646 * had the metadevice open and had ABR set has paniced, the 1647 * master will send a message to all nodes to clear the ABR 1648 * state. Meanwhile any node that has come through the 1649 * start step will get tstate from the master and will update 1650 * ABR if it was set in tstate. So, we appear to have a problem 1651 * if the following sequence occurs:- 1652 * - The slave gets tstate with ABR set 1653 * - The master sends a message to clear ABR 1654 * - The slave updates ABR with the value it got from tstate. 1655 * We now have the master with ABR clear and the slave with ABR 1656 * set. Fortunately, having set ABR, the slave will close the 1657 * metadevice after setting ABR and as there are no nodes with 1658 * the device open, the close will send a message to clear ABR 1659 * on all nodes. So, the nodes will all have ABR unset. 1660 */ 1661 1662 /* expect the nodelist to follow the step name */ 1663 if (argc < 1) 1664 usage(sp, 1); 1665 1666 meta_mc_log(MC_LOG2, gettext("Starting Step4: %s"), 1667 meta_print_hrtime(0)); 1668 1669 /* 1670 * Does local set exist? If not, exit with 0 1671 * since there's no reason to have this node panic if 1672 * the local set cannot be started. 1673 */ 1674 if ((local_sp = load_local_set(ep)) == NULL) { 1675 md_exit(local_sp, 0); 1676 } 1677 1678 /* 1679 * walk through all sets on this node which could include: 1680 * - MN disksets 1681 * - traditional disksets 1682 * - non-existent disksets 1683 * start mirror resync for all MN sets 1684 */ 1685 if ((max_sets = get_max_sets(ep)) == 0) { 1686 mde_perror(ep, ""); 1687 md_exit(local_sp, 1); 1688 } 1689 1690 /* Clear set_info structure */ 1691 for (setno = 1; setno < max_sets; setno++) { 1692 set_info[setno] = 0; 1693 } 1694 1695 /* start walking through all possible disksets */ 1696 for (setno = 1; setno < max_sets; setno++) { 1697 if ((sp = metasetnosetname(setno, ep)) == NULL) { 1698 if (mdiserror(ep, MDE_NO_SET)) { 1699 /* No set for this setno - continue */ 1700 mdclrerror(ep); 1701 continue; 1702 } else { 1703 mde_perror(ep, gettext("Unable to " 1704 "get set %d information"), setno); 1705 md_exit(local_sp, 1); 1706 } 1707 } 1708 1709 if ((sd = metaget_setdesc(sp, ep)) == NULL) { 1710 mde_perror(ep, gettext("Unable to get set " 1711 "%s desc information"), sp->setname); 1712 mdclrerror(ep); 1713 continue; 1714 } 1715 1716 /* only check multi-node disksets */ 1717 if (!meta_is_mn_set(sp, ep)) { 1718 mdclrerror(ep); 1719 continue; 1720 } 1721 1722 set_info[setno] |= SET_INFO_MN; 1723 1724 /* 1725 * If not an owner (all mddbs failed) or stale 1726 * (< 50% mddbs operational), then set is 1727 * non-writable so just resume commd and 1728 * unblock mddb messages. 1729 */ 1730 mdclrerror(ep); 1731 if (s_ownset(sp->setno, ep) != MD_SETOWNER_YES) { 1732 set_info[setno] |= SET_INFO_NO_WR; 1733 } 1734 if (!mdisok(ep)) { 1735 mde_perror(ep, gettext("Could " 1736 "not get set %s ownership"), 1737 sp->setname); 1738 md_exit(local_sp, 1); 1739 } 1740 /* Set is owned - is it stale? */ 1741 if (!set_info[setno] & SET_INFO_NO_WR) { 1742 (void) memset(&cfg, 0, sizeof (cfg)); 1743 cfg.c_id = 0; 1744 cfg.c_setno = sp->setno; 1745 if (metaioctl(MD_DB_GETDEV, &cfg, &cfg.c_mde, 1746 NULL) != 0) { 1747 mdstealerror(ep, &cfg.c_mde); 1748 mde_perror(ep, gettext("Could " 1749 "not get set %s information"), 1750 sp->setname); 1751 md_exit(local_sp, 1); 1752 } 1753 if (cfg.c_flags & MDDB_C_STALE) { 1754 set_info[setno] |= SET_INFO_NO_WR; 1755 } 1756 } 1757 1758 /* resume rpc.mdcommd */ 1759 if (mdmn_resume(setno, MD_COMM_ALL_CLASSES, 0, 1760 commd_timeout)) { 1761 md_eprintf(gettext("Unable to resume " 1762 "rpc.mdcommd for set %s\n"), sp->setname); 1763 md_exit(local_sp, 1); 1764 } 1765 meta_ping_mnset(setno); 1766 1767 /* Unblock mddb parse messages */ 1768 if (s_ownset(sp->setno, ep) == MD_SETOWNER_YES) { 1769 (void) memset(&mbp, 0, sizeof (mbp)); 1770 mbp.c_setno = setno; 1771 mbp.c_blk_flags = MDDB_UNBLOCK_PARSE; 1772 if (metaioctl(MD_MN_MDDB_BLOCK, &mbp, 1773 &mbp.c_mde, NULL)) { 1774 mdstealerror(ep, &mbp.c_mde); 1775 mde_perror(ep, gettext("Could not " 1776 "unblock set %s"), sp->setname); 1777 md_exit(local_sp, 1); 1778 } 1779 } 1780 meta_mc_log(MC_LOG3, gettext("Step4 - rpc.mdcommd " 1781 "resumed and messages unblocked for set %s: %s"), 1782 sp->setname, 1783 meta_print_hrtime(gethrtime() - start_time)); 1784 } 1785 1786 for (setno = 1; setno < max_sets; setno++) { 1787 int start_step; 1788 1789 /* Skip traditional disksets. */ 1790 if ((set_info[setno] & SET_INFO_MN) == 0) 1791 continue; 1792 1793 /* 1794 * If already determined that this set is 1795 * a non-writable set, then just continue 1796 * to next set since there's nothing else 1797 * to do for a non-writable set. 1798 */ 1799 if (set_info[setno] & SET_INFO_NO_WR) 1800 continue; 1801 1802 if ((sp = metasetnosetname(setno, ep)) == NULL) { 1803 if (mdiserror(ep, MDE_NO_SET)) { 1804 /* No set for this setno - continue */ 1805 mdclrerror(ep); 1806 continue; 1807 } else { 1808 mde_perror(ep, gettext("Unable to " 1809 "get set %d information"), setno); 1810 md_exit(local_sp, 1); 1811 } 1812 } 1813 1814 if ((sd = metaget_setdesc(sp, ep)) == NULL) { 1815 mde_perror(ep, gettext("Unable to get set " 1816 "%s desc information"), sp->setname); 1817 mdclrerror(ep); 1818 continue; 1819 } 1820 1821 /* See if this node came through the start step */ 1822 (void) memset(&sf, 0, sizeof (sf)); 1823 sf.sf_setno = sp->setno; 1824 sf.sf_flags = MDDB_NM_GET; 1825 /* Use magic to help protect ioctl against attack. */ 1826 sf.sf_magic = MDDB_SETFLAGS_MAGIC; 1827 if (metaioctl(MD_MN_SET_SETFLAGS, &sf, 1828 &sf.sf_mde, NULL)) { 1829 mdstealerror(ep, &sf.sf_mde); 1830 mde_perror(ep, gettext("Could not get " 1831 "start_step flag for set %s"), sp->setname); 1832 md_exit(local_sp, 1); 1833 } 1834 start_step = 1835 (sf.sf_setflags & MD_SET_MN_START_RC)? 1: 0; 1836 1837 /* 1838 * We can now reset the start_step flag for the set 1839 * if it was already set. 1840 */ 1841 if (start_step) { 1842 (void) memset(&sf, 0, sizeof (sf)); 1843 sf.sf_setno = sp->setno; 1844 sf.sf_setflags = MD_SET_MN_START_RC; 1845 sf.sf_flags = MDDB_NM_RESET; 1846 /* 1847 * Use magic to help protect ioctl 1848 * against attack. 1849 */ 1850 sf.sf_magic = MDDB_SETFLAGS_MAGIC; 1851 if (metaioctl(MD_MN_SET_SETFLAGS, &sf, 1852 &sf.sf_mde, NULL)) { 1853 mdstealerror(ep, &sf.sf_mde); 1854 mde_perror(ep, 1855 gettext("Could not reset " 1856 "start_step flag for set %s"), 1857 sp->setname); 1858 } 1859 } 1860 1861 meta_mc_log(MC_LOG3, gettext("Step4 - begin setting " 1862 "ABR state and restarting io's for " 1863 "set %s: %s"), sp->setname, 1864 meta_print_hrtime(gethrtime() - start_time)); 1865 1866 1867 /* 1868 * If we are not the master and we have come through 1869 * the start step, we must update the ABR states 1870 * for mirrors and soft partitions. Also the submirror 1871 * states need to be synchronised so that we see the 1872 * same status as other previously joined members. 1873 * This _must_ be done before starting the resync. 1874 */ 1875 if (!(sd->sd_mn_am_i_master) && start_step) { 1876 if (reset_state(GET_MIRROR_STATE, sp, MD_MIRROR, 1877 ep) == -1) { 1878 md_exit(local_sp, 1); 1879 } 1880 if (reset_state(UPDATE_ABR, sp, MD_SP, 1881 ep) == -1) { 1882 md_exit(local_sp, 1); 1883 } 1884 /* 1885 * Mark the fact that we've got the mirror 1886 * state. This allows the resync thread to 1887 * determine if _it_ needs to issue this. This 1888 * can happen if a node is added to a set after 1889 * a reconfig cycle has completed. 1890 */ 1891 (void) memset(&sf, 0, sizeof (sf)); 1892 sf.sf_setno = sp->setno; 1893 sf.sf_setflags = MD_SET_MN_MIR_STATE_RC; 1894 sf.sf_flags = MDDB_NM_SET; 1895 /* 1896 * Use magic to help protect ioctl 1897 * against attack. 1898 */ 1899 sf.sf_magic = MDDB_SETFLAGS_MAGIC; 1900 if (metaioctl(MD_MN_SET_SETFLAGS, &sf, 1901 &sf.sf_mde, NULL)) { 1902 mdstealerror(ep, &sf.sf_mde); 1903 mde_perror(ep, 1904 gettext("Could not set " 1905 "submirror state flag for set %s"), 1906 sp->setname); 1907 } 1908 } 1909 1910 /* 1911 * All remaining actions are only performed by the 1912 * master 1913 */ 1914 if (!(sd->sd_mn_am_i_master)) { 1915 if (meta_lock(sp, TRUE, ep) != 0) { 1916 mde_perror(ep, ""); 1917 md_exit(local_sp, 1); 1918 } 1919 meta_mirror_resync_unblock(sp); 1920 meta_unlock(sp, ep); 1921 continue; 1922 } 1923 1924 /* 1925 * If the master came through the start step, this 1926 * implies that all of the nodes must have done the 1927 * same and hence there can be no applications 1928 * running. Hence no need to reset ABR 1929 */ 1930 if (!start_step) { 1931 /* Reset ABR state for mirrors */ 1932 if (reset_state(RESET_ABR, sp, MD_MIRROR, 1933 ep) == -1) { 1934 md_exit(local_sp, 1); 1935 } 1936 /* ...and now the same for soft partitions */ 1937 if (reset_state(RESET_ABR, sp, MD_SP, 1938 ep) == -1) { 1939 md_exit(local_sp, 1); 1940 } 1941 } 1942 1943 /* 1944 * choose owners for orphaned resyncs and reset 1945 * non-orphaned resyncs so that an owner node that 1946 * reboots will restart the resync if needed. 1947 */ 1948 if (reset_state(CHOOSE_OWNER, sp, MD_MIRROR, ep) == -1) 1949 md_exit(local_sp, 1); 1950 1951 /* 1952 * Must unlock set lock before meta_mirror_resync_all 1953 * sends a message to run the metasync command 1954 * which also grabs the meta_lock. 1955 */ 1956 if (meta_lock(sp, TRUE, ep) != 0) { 1957 mde_perror(ep, ""); 1958 md_exit(local_sp, 1); 1959 } 1960 meta_mirror_resync_unblock(sp); 1961 meta_unlock(sp, ep); 1962 1963 /* resync all mirrors in set */ 1964 if (meta_mirror_resync_all(sp, 0, ep) != 0) { 1965 mde_perror(ep, gettext("Mirror resyncs " 1966 "failed for set %s"), sp->setname); 1967 md_exit(local_sp, 1); 1968 } 1969 1970 meta_mc_log(MC_LOG3, gettext("Step4 - io's restarted " 1971 "for set %s: %s"), sp->setname, 1972 meta_print_hrtime(gethrtime() - start_time)); 1973 } 1974 1975 meta_mc_log(MC_LOG2, gettext("Step4 completed: %s"), 1976 meta_print_hrtime(gethrtime() - start_time)); 1977 1978 break; 1979 1980 default: 1981 usage(sp, 1); 1982 break; 1983 } 1984 1985 md_exit(sp, 0); 1986 /* NOTREACHED */ 1987 return (0); 1988 } 1989