1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 #include <meta.h> 30 #include <sdssc.h> 31 #include <signal.h> 32 #include <syslog.h> 33 #include <sys/types.h> 34 #include <sys/wait.h> 35 #include <sys/lvm/md_mirror.h> 36 #include <metad.h> 37 38 #define MY_VERSION "1.0" /* the highest supported version */ 39 #define MAX_DEBUG_LEVEL 5 /* maximum verbosity level */ 40 41 #define RESET_OWNER 0x0001 42 #define CHOOSE_OWNER 0x0002 43 #define RESET_ABR 0x0004 44 #define UPDATE_ABR 0x0008 45 #define GET_MIRROR_STATE 0x0010 46 47 #define SET_INFO_NO_WR 0x0002 48 #define SET_INFO_MN 0x0004 49 50 /* 51 * This table defines all the metaclust reconfig steps we understand 52 */ 53 typedef enum stpnum { 54 MC_UNK = 0, 55 MC_START, 56 MC_STOP, 57 MC_ABORT, 58 MC_RETURN, 59 MC_STEP1, 60 MC_STEP2, 61 MC_STEP3, 62 MC_STEP4 63 } stepnum_t; 64 65 /* 66 * Structure for step_name -> step_number mapping 67 */ 68 struct step_t { 69 char *step_nam; 70 stepnum_t step_num; 71 }; 72 73 /* 74 * Step name to step number mapping table 75 * This table MUST be sorted alphabetically in ascending order of step name 76 */ 77 static struct step_t step_table[] = { 78 { "abort", MC_ABORT }, 79 { "return", MC_RETURN }, 80 { "start", MC_START }, 81 { "step1", MC_STEP1 }, 82 { "step2", MC_STEP2 }, 83 { "step3", MC_STEP3 }, 84 { "step4", MC_STEP4 }, 85 { "stop", MC_STOP } 86 }; 87 88 /* 89 * If support for a different version is added, the new version number should 90 * be appended to the version_table below. This list will be searched to 91 * determine if a version requested via the -V option is supported or not. 92 */ 93 static char *version_table[] = { 94 MY_VERSION 95 }; 96 97 uint_t timeout = 0; /* disable timeout by default */ 98 char *version = MY_VERSION; /* use latest version by default */ 99 int stepnum = MC_UNK; /* reconfiguration step number */ 100 pid_t c_pid; /* child process id */ 101 102 /* 103 * Binary search comparison routine 104 */ 105 static int 106 mc_compare(const void *stp1, const void *stp2) 107 { 108 return (strcmp((const char *)stp1, 109 ((const struct step_t *)stp2)->step_nam)); 110 } 111 112 /* 113 * Timeout expiry alarm signal handler 114 */ 115 /*ARGSUSED*/ 116 static void 117 sigalarmhandler(int sig) 118 { 119 int i, n, ret, stat_loc = 0; 120 121 n = sizeof (step_table) / sizeof (step_table[0]); 122 for (i = 0; i < n; i++) { 123 if (stepnum == step_table[i].step_num) 124 break; 125 } 126 127 assert(i != n); 128 129 meta_mc_log(MC_LOG1, gettext("Timeout expired in %s: %s"), 130 step_table[i].step_nam, 131 meta_print_hrtime(gethrtime() - start_time)); 132 133 if ((ret = kill(c_pid, SIGKILL)) == 0) { 134 /* 135 * The child will wait forever until the status is retrieved 136 * so get it now. Keep retrying if the call is interrupted. 137 * 138 * The possible results are, 139 * 140 * - child killed successfully 141 * - signal sent but child not killed 142 * - waitpid failed/interrupted 143 */ 144 sleep(2); 145 while ((ret = waitpid(c_pid, &stat_loc, WNOHANG)) < 0) { 146 if (errno != EINTR) { 147 break; 148 } 149 } 150 if ((ret == c_pid) || (errno == ECHILD)) { 151 ret = 0; 152 } else { 153 ret = 1; 154 } 155 } else if (errno == ESRCH) { 156 /* 157 * If the kill did not catch the child then it means the child 158 * exited immediately after the timeout occured. 159 */ 160 ret = 0; 161 } 162 163 /* 164 * make sure not to exit with 205 for any steps other than step1-step4. 165 * Suncluster reconfiguration can't handle it otherwise. 166 */ 167 switch (stepnum) { 168 case MC_STEP1: 169 case MC_STEP2: 170 case MC_STEP3: 171 case MC_STEP4: 172 /* 173 * If the child was killed successfully return 205 for a 174 * new reconfig cycle otherwise send 1 to panic the node. 175 */ 176 if (ret != 0) { 177 md_eprintf(gettext("Could not kill child\n")); 178 exit(1); 179 } else { 180 exit(205); 181 } 182 break; 183 case MC_START: 184 case MC_STOP: 185 case MC_ABORT: 186 case MC_RETURN: 187 default: 188 exit(1); 189 break; 190 } 191 } 192 193 /* 194 * Attempt to load local set. 195 * Returns: 196 * pointer to mdsetname_t for local set (local_sp) is successful. 197 * 0 if failure 198 * if there are no local set mddbs, no error message is printed. 199 * Otherwise, error message is printed so that user 200 * can determine why the local set didn't start. 201 */ 202 mdsetname_t * 203 load_local_set(md_error_t *ep) 204 { 205 mdsetname_t *local_sp = NULL; 206 207 /* Does local set exist? If not, give no error */ 208 if ((local_sp = metasetname(MD_LOCAL_NAME, ep)) == NULL) { 209 return (0); 210 } 211 212 /* 213 * snarf local set 214 * If fails with MDE_DB_NODB, then just return 1 printing 215 * no failure. 216 * Otherwise, print error message, and return 1. 217 */ 218 if (meta_setup_db_locations(ep) != 0) { 219 if (!(mdismddberror(ep, MDE_DB_NODB))) 220 mde_perror(ep, ""); 221 return (0); 222 } 223 224 /* local set loaded successfully */ 225 return (local_sp); 226 } 227 228 /* 229 * Purpose: Compose a full path name for a metadevice 230 * 231 * On entry: sp - setname pointer 232 * mnum - minor number of metadevice 233 * pathname - pointer to array to return path string 234 * pathlen - max length of pathname array 235 */ 236 static int 237 compose_path(mdsetname_t *sp, int mnum, char *pathname, int pathlen) 238 { 239 int rtn; 240 mdname_t *np; 241 md_error_t status = mdnullerror; 242 243 if (MD_MIN2SET(mnum) != sp->setno) { 244 md_eprintf(gettext("minor number 0x%x invalid for set %d\n"), 245 mnum, sp->setno); 246 return (-1); 247 } 248 249 if ((np = metamnumname(&sp, mnum, 0, &status)) == NULL) { 250 return (-1); 251 } 252 253 rtn = snprintf(pathname, pathlen, "%s", np->rname); 254 255 if ((pathname[0] == '\0') || (rtn >= pathlen)) { 256 md_eprintf(gettext( 257 "Could not create path for device %s\n"), 258 get_mdname(sp, mnum)); 259 return (-1); 260 } 261 return (0); 262 } 263 264 /* 265 * Purpose: Walk through all the devices specified for the given set 266 * and do the action specified in mode 267 */ 268 static int 269 reset_state(uint_t mode, mdsetname_t *sp, char *drivername, md_error_t *ep) 270 { 271 mdnamelist_t *devnlp = NULL; 272 mdnamelist_t *p; 273 mdname_t *devnp = NULL; 274 md_set_mmown_params_t ownpar_p; 275 md_set_mmown_params_t *ownpar = &ownpar_p; 276 md_unit_t *mm; 277 int mirror_dev = 0; 278 mndiskset_membershiplist_t *nl; 279 int cnt; 280 int has_parent; 281 md_mn_get_mir_state_t mir_state_p; 282 md_mn_get_mir_state_t *mir_state = &mir_state_p; 283 284 /* 285 * if we are choosing or resetting the owners then make sure 286 * we are only doing it for mirror devices 287 */ 288 mirror_dev = (strcmp(MD_MIRROR, drivername) == 0); 289 if ((mode & (RESET_OWNER | CHOOSE_OWNER)) && !mirror_dev) { 290 return (-1); 291 } 292 293 /* get a list of all the metadevices for current set */ 294 if (mirror_dev && meta_get_mirror_names(sp, &devnlp, 0, ep) < 0) { 295 mde_perror(ep, gettext("Could not get mirrors for set %s"), 296 sp->setname); 297 return (-1); 298 } else if (meta_get_sp_names(sp, &devnlp, 0, ep) < 0) { 299 mde_perror(ep, gettext( 300 "Could not get soft partitions for set %s"), sp->setname); 301 return (-1); 302 } 303 304 /* If resetting the owner, get the known membership list */ 305 if (mode & RESET_OWNER) { 306 if (meta_read_nodelist(&cnt, &nl, ep)) { 307 mde_perror(ep, "Could not get nodelist"); 308 return (-1); 309 } 310 } 311 312 /* for each metadevice */ 313 for (p = devnlp; (p != NULL); p = p->next) { 314 devnp = p->namep; 315 316 /* 317 * Get the current setting for mirror ABR state and all of the 318 * submirror state and flags from the master node. We only 319 * perform this when going through a 'start' cycle. 320 */ 321 if ((mode & GET_MIRROR_STATE) && mirror_dev) { 322 char *miscname; 323 324 /* 325 * Ensure that we ignore soft-parts that are returned 326 * from the meta_get_mirror_names() call 327 */ 328 if ((miscname = metagetmiscname(devnp, ep)) == NULL) 329 goto out; 330 if (strcmp(miscname, MD_MIRROR) != 0) 331 continue; 332 333 mir_state->mnum = meta_getminor(devnp->dev); 334 MD_SETDRIVERNAME(mir_state, MD_MIRROR, sp->setno); 335 meta_mc_log(MC_LOG4, gettext("Getting mirror state" 336 " for %s: %s"), get_mdname(sp, mir_state->mnum), 337 meta_print_hrtime(gethrtime() - start_time)); 338 339 if (metaioctl(MD_MN_GET_MIRROR_STATE, mir_state, ep, 340 "MD_MN_GET_MIRROR_STATE") != 0) { 341 mde_perror(ep, gettext("Unable to get " 342 "mirror state for %s"), 343 get_mdname(sp, mir_state->mnum)); 344 goto out; 345 } else { 346 continue; 347 } 348 } 349 350 /* check if this is a top level metadevice */ 351 if ((mm = meta_get_mdunit(sp, devnp, ep)) == NULL) 352 goto out; 353 if (MD_HAS_PARENT(MD_PARENT(mm))) { 354 has_parent = 1; 355 } else { 356 has_parent = 0; 357 } 358 Free(mm); 359 360 if (mode & (RESET_OWNER | CHOOSE_OWNER)) { 361 char *miscname; 362 363 /* 364 * we can only do these for mirrors so make sure we 365 * really have a mirror device and not a softpartition 366 * imitating one. meta_get_mirror_names seems to think 367 * softparts on top of a mirror are mirrors! 368 */ 369 if ((miscname = metagetmiscname(devnp, ep)) == NULL) 370 goto out; 371 if (strcmp(miscname, MD_MIRROR) != 0) 372 continue; 373 374 (void) memset(ownpar, 0, sizeof (*ownpar)); 375 ownpar->d.mnum = meta_getminor(devnp->dev); 376 MD_SETDRIVERNAME(ownpar, MD_MIRROR, sp->setno); 377 378 meta_mc_log(MC_LOG4, gettext("Setting owner " 379 "for %s: %s"), get_mdname(sp, ownpar->d.mnum), 380 meta_print_hrtime(gethrtime() - start_time)); 381 382 /* get the current owner id */ 383 if (metaioctl(MD_MN_GET_MM_OWNER, ownpar, ep, 384 "MD_MN_GET_MM_OWNER") != 0) { 385 mde_perror(ep, gettext("Unable to get " 386 "mirror owner for %s"), 387 get_mdname(sp, ownpar->d.mnum)); 388 goto out; 389 } 390 } 391 392 if (mode & RESET_OWNER) { 393 if (ownpar->d.owner == MD_MN_MIRROR_UNOWNED) { 394 mdclrerror(ep); 395 continue; 396 } 397 398 /* 399 * reset owner only if the current owner is 400 * not in the membership list 401 * Also kill the resync thread so that when the resync 402 * is started, it will perform an optimized resync 403 * for any resync regions that were dirty when the 404 * current owner left the membership. 405 */ 406 if (meta_is_member(NULL, ownpar->d.owner, nl) != 1) { 407 if (meta_mn_change_owner(&ownpar, 408 sp->setno, ownpar->d.mnum, 409 MD_MN_MIRROR_UNOWNED, 410 MD_MN_MM_ALLOW_CHANGE) == -1) { 411 md_eprintf(gettext( 412 "Unable to reset mirror owner " 413 "for %s\n"), 414 get_mdname(sp, ownpar->d.mnum)); 415 goto out; 416 } 417 if (meta_mirror_resync(sp, devnp, 0, ep, 418 MD_RESYNC_KILL_NO_WAIT) != 0) { 419 md_eprintf(gettext( 420 "Unable to kill resync for" 421 " %s\n"), 422 get_mdname(sp, ownpar->d.mnum)); 423 goto out; 424 } 425 } 426 } 427 428 if (mode & CHOOSE_OWNER) { 429 /* 430 * only orphaned resyncs will have no owner. 431 * if that is the case choose a new owner. Otherwise 432 * re-establish the existing owner. This covers the 433 * case where a node that owned the mirror 434 * reboots/panics and comes back into the cluster before 435 * the reconfig cycle has completed. In this case the 436 * other cluster nodes will have the mirror owner marked 437 * as the rebooted node while it has the owner marked 438 * as 'None'. We have to reestablish the ownership so 439 * that the subsequent resync can continue. 440 */ 441 if (meta_mn_change_owner(&ownpar, sp->setno, 442 ownpar->d.mnum, ownpar->d.owner, 443 MD_MN_MM_CHOOSE_OWNER) == -1) { 444 md_eprintf(gettext("Unable to choose " 445 "mirror owner for %s\n"), 446 get_mdname(sp, ownpar->d.mnum)); 447 goto out; 448 } 449 } 450 451 /* 452 * For RESET_ABR and UPDATE_ABR - only handle top 453 * level metadevices. 454 */ 455 if (has_parent) 456 continue; 457 458 if (mode & RESET_ABR) { 459 /* 460 * Reset the ABR (application based recovery) 461 * value on all nodes. We are dealing with 462 * the possibility that we have ABR set but the 463 * only node that had the device open with ABR has 464 * left the cluster. We simply open and close the 465 * device and if this is the last close in the 466 * cluster, ABR will be cleared on all nodes. 467 */ 468 char *miscname; 469 char name[MAXPATHLEN]; 470 int mnum, fd; 471 472 name[0] = '\0'; 473 mnum = meta_getminor(devnp->dev); 474 475 /* 476 * Ensure that we don't include soft-parts in the 477 * mirror-only call to RESET_ABR. meta_get_mirror_names 478 * returns a bogus list that includes all soft-parts 479 * built on mirrors. 480 */ 481 if ((miscname = metagetmiscname(devnp, ep)) == NULL) 482 goto out; 483 if (mirror_dev && (strcmp(miscname, MD_MIRROR) != 0)) 484 continue; 485 486 meta_mc_log(MC_LOG4, gettext("Re-setting ABR state " 487 "for %s: %s"), get_mdname(sp, mnum), 488 meta_print_hrtime(gethrtime() - start_time)); 489 490 /* compose the absolute device path and open it */ 491 if (compose_path(sp, mnum, &name[0], 492 sizeof (name)) != 0) 493 goto out; 494 if ((fd = open(name, O_RDWR, 0)) < 0) { 495 md_perror(gettext("Could not open device %s"), 496 name); 497 continue; 498 } 499 500 (void) close(fd); 501 } 502 503 if (mode & UPDATE_ABR) { 504 /* 505 * Update the ABR value on this node. We obtain the 506 * current ABR state from the master node. 507 */ 508 509 char *miscname; 510 char name[MAXPATHLEN]; 511 int mnum, fd; 512 volcap_t vc; 513 uint_t tstate; 514 515 name[0] = '\0'; 516 mnum = meta_getminor(devnp->dev); 517 518 /* 519 * Ensure that we don't include soft-parts in the 520 * mirror-only call to UPDATE_ABR. meta_get_mirror_names 521 * returns a bogus list that includes all soft-parts 522 * built on mirrors. 523 */ 524 if ((miscname = metagetmiscname(devnp, ep)) == NULL) 525 goto out; 526 if (mirror_dev && (strcmp(miscname, MD_MIRROR) != 0)) 527 continue; 528 529 /* Get tstate from Master */ 530 if (meta_mn_send_get_tstate(devnp->dev, &tstate, ep) 531 != 0) 532 continue; 533 /* If not set on the master, nothing to do */ 534 if (!(tstate & MD_ABR_CAP)) 535 continue; 536 537 meta_mc_log(MC_LOG4, gettext("Updating ABR state " 538 "for %s: %s"), get_mdname(sp, mnum), 539 meta_print_hrtime(gethrtime() - start_time)); 540 541 /* compose the absolute device path and open it */ 542 if (compose_path(sp, mnum, &name[0], 543 sizeof (name)) != 0) 544 goto out; 545 if ((fd = open(name, O_RDWR, 0)) < 0) { 546 md_perror(gettext("Could not open device %s"), 547 name); 548 continue; 549 } 550 551 /* set ABR state */ 552 vc.vc_info = 0; 553 vc.vc_set = 0; 554 if (ioctl(fd, DKIOCGETVOLCAP, &vc) < 0) { 555 /* 556 * Ignore if device does not support this 557 * ioctl 558 */ 559 if ((errno != ENOTTY) && (errno != ENOTSUP)) { 560 md_perror(gettext("Could not get " 561 "ABR/DMR state for device %s"), 562 name); 563 } 564 (void) close(fd); 565 continue; 566 } 567 if (!(vc.vc_info & (DKV_ABR_CAP | DKV_DMR_CAP))) { 568 (void) close(fd); 569 continue; 570 } 571 572 vc.vc_set = DKV_ABR_CAP; 573 if (ioctl(fd, DKIOCSETVOLCAP, &vc) < 0) { 574 md_perror(gettext( 575 "Could not set ABR state for " 576 "device %s"), name); 577 (void) close(fd); 578 goto out; 579 } else { 580 md_eprintf(gettext( 581 "Setting ABR state on device %s\n"), name); 582 } 583 584 (void) close(fd); 585 } 586 } 587 588 /* cleanup */ 589 if (mode & RESET_OWNER) { 590 meta_free_nodelist(nl); 591 } 592 metafreenamelist(devnlp); 593 return (0); 594 595 out: 596 /* cleanup */ 597 if (mode & RESET_OWNER) { 598 meta_free_nodelist(nl); 599 } 600 metafreenamelist(devnlp); 601 return (-1); 602 } 603 604 /* 605 * Print usage message 606 */ 607 static void 608 usage(mdsetname_t *sp, int eval) 609 { 610 (void) fprintf(stderr, gettext("usage:" 611 "\t%s [-V version] [-t timeout] [-d level] start localnodeid\n" 612 "\t%s [-V version] [-t timeout] [-d level] step nodelist...\n" 613 "\t%s [-V version] [-t timeout] [-d level] abort | stop\n" 614 "\t%s [-V | -? | -h]\n"), 615 myname, myname, myname, myname); 616 if (!eval) { 617 fprintf(stderr, gettext("\n" 618 "\tValid debug (-d) levels are 1-%d for increasing " 619 "verbosity.\n\tDefault is -d 3.\n\n" 620 "\tValid step values are: return | step1 | step2 | " 621 "step3 | step4\n\n" 622 "\tNodelist is a space-separated list of node id's\n\n"), 623 MAX_DEBUG_LEVEL); 624 } 625 md_exit(sp, eval); 626 } 627 628 /* 629 * Input: Input takes a config step name followed by a list of 630 * possible node id's. 631 * 632 * Returns: 0 - Success 633 * 1 - Fail 634 * Node will be removed from cluster membership 635 * by forcing node to panic. 636 * 205 - Unsuccessful. Start another reconfig cycle. 637 * Problem was encountered that could be fixed by 638 * running another reconfig cycle. 639 * Problem could be a result of a failure to read 640 * the nodelist file or that all work could not be 641 * accomplished in a reconfig step in the amount of 642 * time given so another reconfig cycle is needed in 643 * order to finish the current step. 644 */ 645 int 646 main(int argc, char **argv) 647 { 648 mdsetname_t *sp = NULL; 649 md_error_t status = mdnullerror; 650 md_error_t *ep = &status; 651 set_t max_sets, setno; 652 int c, clust = 0; 653 struct sigaction nsa, osa; 654 struct step_t *step_ptr; 655 mdsetname_t *local_sp = NULL; 656 md_drive_desc *dd; 657 int rval = 0; 658 md_set_desc *sd; 659 mddb_block_parm_t mbp; 660 uint_t debug = 3; /* log upto MC_LOG3 by default */ 661 int version_table_size; 662 mddb_setflags_config_t sf; 663 int ret_val; 664 mddb_config_t cfg; 665 int set_info[MD_MAXSETS]; 666 667 /* 668 * Get the locale set up before calling any other routines 669 * with messages to ouput. Just in case we're not in a build 670 * environment, make sure that TEXT_DOMAIN gets set to 671 * something. 672 */ 673 #if !defined(TEXT_DOMAIN) 674 #define TEXT_DOMAIN "SYS_TEST" 675 #endif 676 (void) setlocale(LC_ALL, ""); 677 (void) textdomain(TEXT_DOMAIN); 678 679 if ((clust = sdssc_bind_library()) == SDSSC_ERROR) { 680 md_eprintf(gettext("Interface error with libsds_sc.so\n")); 681 exit(1); 682 } 683 684 if (md_init(argc, argv, 1, 1, ep) != 0 || meta_check_root(ep) != 0) { 685 mde_perror(ep, ""); 686 md_exit(sp, 1); 687 } 688 689 /* 690 * open log and enable libmeta logging. Do it here explicitly 691 * rather than letting md_init() do it because we are not really 692 * a daemon and that is what md_init() opens the log as. 693 */ 694 openlog("metaclust", LOG_CONS, LOG_USER); 695 696 version_table_size = sizeof (version_table) / sizeof (version_table[0]); 697 698 optind = 1; 699 opterr = 0; 700 while ((c = getopt(argc, argv, "hd:V:t:?")) != -1) { 701 switch (c) { 702 case 'h': 703 usage(sp, 0); 704 break; 705 706 case 'd': 707 if (sscanf(optarg, "%u", &debug) != 1) { 708 md_eprintf(gettext("Invalid debug level\n")); 709 md_exit(sp, 1); 710 } else if ((debug < 1) || (debug > MAX_DEBUG_LEVEL)) { 711 debug = min(max(debug, 1), MAX_DEBUG_LEVEL); 712 md_eprintf(gettext("Debug level must be " 713 "between 1 and %d inclusive.\n"), 714 MAX_DEBUG_LEVEL); 715 md_eprintf(gettext("Debug level set to %d.\n"), 716 debug); 717 } 718 break; 719 720 case 'V': 721 version = Strdup(optarg); 722 break; 723 724 case 't': 725 if (sscanf(optarg, "%u", &timeout) != 1) { 726 md_eprintf(gettext("Invalid timeout value\n")); 727 md_exit(sp, 1); 728 } 729 break; 730 731 case '?': 732 if (optopt == '?') { 733 usage(sp, 0); 734 } else if (optopt == 'V') { 735 int i; 736 737 fprintf(stdout, gettext( 738 "%s: Versions Supported:"), myname); 739 for (i = 0; i < version_table_size; i++) { 740 fprintf(stdout, " %s", 741 version_table[i]); 742 } 743 fprintf(stdout, "\n"); 744 md_exit(sp, 0); 745 } 746 /*FALLTHROUGH*/ 747 748 default: 749 usage(sp, 1); 750 break; 751 } 752 } 753 754 /* initialise the debug level and start time */ 755 setup_mc_log(debug); 756 757 /* 758 * check that the version specified (if any) is supported. 759 */ 760 if (version != NULL) { 761 int i, found = 0; 762 763 for (i = 0; i < version_table_size; i++) { 764 if (strcmp(version, version_table[i]) == 0) { 765 found = 1; 766 break; 767 } 768 } 769 if (!found) { 770 md_eprintf(gettext("Version %s not supported\n"), 771 version); 772 md_exit(sp, 1); 773 } 774 } 775 776 argc -= optind; 777 argv += optind; 778 779 /* parse arguments */ 780 if (argc <= 0) { 781 usage(sp, 1); 782 } 783 784 /* convert the step name to the corresponding number */ 785 step_ptr = bsearch(argv[0], step_table, (sizeof (step_table) / 786 sizeof (step_table[0])), sizeof (step_table[0]), mc_compare); 787 if (step_ptr != NULL) { 788 stepnum = step_ptr->step_num; 789 } 790 791 --argc; 792 ++argv; 793 794 /* set timeout alarm signal, a value of 0 will disable timeout */ 795 if (timeout > 0) { 796 int stat_loc = 0; 797 798 c_pid = fork(); 799 800 if (c_pid == (pid_t)-1) { 801 md_perror(gettext("Unable to fork")); 802 md_exit(sp, 1); 803 } else if (c_pid) { 804 /* parent */ 805 nsa.sa_flags = 0; 806 if (sigfillset(&nsa.sa_mask) < 0) { 807 md_perror(gettext("Unable to set signal mask")); 808 md_exit(sp, 1); 809 } 810 811 nsa.sa_handler = sigalarmhandler; 812 if (sigaction(SIGALRM, &nsa, &osa) == -1) { 813 md_perror(gettext("Unable to set alarm " 814 "handler")); 815 md_exit(sp, 1); 816 } 817 818 (void) alarm(timeout); 819 820 /* 821 * wait for child to exit or timeout to expire. 822 * keep retrying if the call is interrupted 823 */ 824 while ((ret_val = waitpid(c_pid, &stat_loc, 0)) < 0) { 825 if (errno != EINTR) { 826 break; 827 } 828 } 829 if (ret_val == c_pid) { 830 /* exit with the childs exit value */ 831 exit(WEXITSTATUS(stat_loc)); 832 } else if (errno == ECHILD) { 833 md_exit(sp, 0); 834 } else { 835 perror(myname); 836 md_exit(sp, 1); 837 } 838 } 839 } 840 841 /* 842 * If a timeout value is given, everything from this point onwards is 843 * executed in the child process. 844 */ 845 846 switch (stepnum) { 847 case MC_START: 848 /* 849 * Start Step 850 * 851 * - Suspend all rpc.mdcommd messages 852 */ 853 854 /* expect the local node id to be given only */ 855 if (argc != 1) 856 usage(sp, 1); 857 858 meta_mc_log(MC_LOG2, gettext("Starting Start step: %s"), 859 meta_print_hrtime(0)); 860 861 /* 862 * Does local set exist? If not, exit with 0 863 * since there's no reason to have this node panic if 864 * the local set cannot be started. 865 */ 866 if ((local_sp = load_local_set(ep)) == NULL) { 867 md_exit(local_sp, 0); 868 } 869 870 if ((max_sets = get_max_sets(ep)) == 0) { 871 mde_perror(ep, ""); 872 md_exit(sp, 1); 873 } 874 875 /* start walking through all possible disksets */ 876 for (setno = 1; setno < max_sets; setno++) { 877 if ((sp = metasetnosetname(setno, ep)) == NULL) { 878 if (mdiserror(ep, MDE_NO_SET)) { 879 /* No set for this setno - continue */ 880 mdclrerror(ep); 881 continue; 882 } else { 883 mde_perror(ep, gettext("Unable to " 884 "get set %d information"), setno); 885 md_exit(sp, 1); 886 } 887 } 888 889 /* only check multi-node disksets */ 890 if (!meta_is_mn_set(sp, ep)) { 891 mdclrerror(ep); 892 continue; 893 } 894 895 meta_mc_log(MC_LOG3, gettext("Start - block parse " 896 "messages for set %s: %s"), sp->setname, 897 meta_print_hrtime(gethrtime() - start_time)); 898 899 /* 900 * Mddb parse messages are sent amongst the nodes 901 * in a diskset whenever the locator block or 902 * locator names structure has been changed. 903 * A locator block change could occur as a result 904 * of a disk failure during the reconfig cycle, 905 * so block the mddb parse messages while the 906 * rpc.mdcommd is suspended during the reconfig cycle. 907 */ 908 if (s_ownset(sp->setno, ep) == MD_SETOWNER_YES) { 909 (void) memset(&mbp, 0, sizeof (mbp)); 910 mbp.c_setno = setno; 911 mbp.c_blk_flags = MDDB_BLOCK_PARSE; 912 if (metaioctl(MD_MN_MDDB_BLOCK, &mbp, 913 &mbp.c_mde, NULL)) { 914 mdstealerror(ep, &mbp.c_mde); 915 mde_perror(ep, gettext("Could not " 916 "block set %s"), sp->setname); 917 md_exit(sp, 1); 918 } 919 } 920 921 /* suspend commd and spin waiting for drain */ 922 while ((ret_val = mdmn_suspend(setno, 923 MD_COMM_ALL_CLASSES)) == 924 MDE_DS_COMMDCTL_SUSPEND_NYD) { 925 sleep(1); 926 } 927 928 if (ret_val) { 929 md_eprintf(gettext("Could not suspend " 930 "rpc.mdcommd for set %s\n"), sp->setname); 931 md_exit(sp, 1); 932 } 933 934 /* 935 * Set start step flag for set. This is set to indicate 936 * that this node entered the reconfig cycle through 937 * the start step. This is used during the reconfig 938 * cycle to determine whether the node had entered 939 * through the start step or the return step. 940 */ 941 (void) memset(&sf, 0, sizeof (sf)); 942 sf.sf_setno = sp->setno; 943 sf.sf_setflags = MD_SET_MN_START_RC; 944 sf.sf_flags = MDDB_NM_SET; 945 /* Use magic to help protect ioctl against attack. */ 946 sf.sf_magic = MDDB_SETFLAGS_MAGIC; 947 if (metaioctl(MD_MN_SET_SETFLAGS, &sf, 948 &sf.sf_mde, NULL)) { 949 mdstealerror(ep, &sf.sf_mde); 950 mde_perror(ep, gettext("Could not set " 951 "start_step flag for set %s"), sp->setname); 952 md_exit(sp, 1); 953 } 954 955 } 956 957 meta_mc_log(MC_LOG2, gettext("Start step completed: %s"), 958 meta_print_hrtime(gethrtime() - start_time)); 959 960 break; 961 962 case MC_STOP: 963 /* 964 * Stop Step 965 * 966 * - ??? 967 */ 968 969 /* don't expect any more arguments to follow the step name */ 970 if (argc != 0) 971 usage(sp, 1); 972 973 break; 974 975 case MC_ABORT: 976 /* 977 * Abort Step 978 * 979 * - Abort rpc.mdcommd 980 */ 981 982 /* don't expect any more arguments to follow the step name */ 983 if (argc != 0) 984 usage(sp, 1); 985 986 meta_mc_log(MC_LOG2, gettext("Starting Abort step: %s"), 987 meta_print_hrtime(0)); 988 989 /* 990 * Does local set exist? If not, exit with 0 991 * since there's no reason to have this node panic if 992 * the local set cannot be started. 993 */ 994 if ((local_sp = load_local_set(ep)) == NULL) { 995 md_exit(local_sp, 0); 996 } 997 998 /* 999 * abort the rpc.mdcommd. The abort is only issued on this node 1000 * meaning that the abort reconfig step is called on this 1001 * node before a panic while the rest of the cluster will 1002 * undergo a reconfig cycle. 1003 * There is no time relation between this node running a 1004 * reconfig abort and the the rest of the cluster 1005 * running a reconfig cycle meaning that this node may 1006 * panic before, during or after the cluster has run 1007 * a reconfig cycle. 1008 */ 1009 mdmn_abort(); 1010 1011 meta_mc_log(MC_LOG2, gettext("Abort step completed: %s"), 1012 meta_print_hrtime(gethrtime() - start_time)); 1013 1014 break; 1015 1016 case MC_RETURN: 1017 /* 1018 * Return Step 1019 * 1020 * - Grab local set lock, issue rpc.mdcommd DRAIN ALL 1021 * and release local set lock. Grabbing the local set 1022 * lock allows any active metaset/metadb commands to 1023 * terminate gracefully and will keep a metaset/metadb 1024 * command from starting until the DRAIN ALL is issued. 1025 * The metaset/metadb commands can issue 1026 * DRAIN ALL/RESUME ALL commands to rpc.mdcommd, 1027 * so the return step must not issue the DRAIN ALL command 1028 * until metaset/metadb have finished or metaset may issue 1029 * a RESUME ALL after this return reconfig step has issued 1030 * the DRAIN ALL command. 1031 * After this reconfig step has issued the DRAIN_ALL and 1032 * released the local set lock, metaset/metadb will fail 1033 * when attempting to contact the rpc.mdcommd and will 1034 * terminate without making any configuration changes. 1035 * The DRAIN ALL command will keep all other meta* commands 1036 * from running during the reconfig cycle (these commands 1037 * will wait until the rpc.mdcommd is resumed) since the 1038 * reconfig cycle may be changing the diskset configuration. 1039 */ 1040 1041 /* expect the nodelist to follow the step name */ 1042 if (argc < 1) 1043 usage(sp, 1); 1044 1045 meta_mc_log(MC_LOG2, gettext("Starting Return step: %s"), 1046 meta_print_hrtime(0)); 1047 1048 /* 1049 * Does local set exist? If not, exit with 0 1050 * since there's no reason to have this node panic if 1051 * the local set cannot be started. 1052 */ 1053 if ((local_sp = load_local_set(ep)) == NULL) { 1054 md_exit(local_sp, 0); 1055 } 1056 1057 /* 1058 * Suspend any mirror resyncs that are in progress. This 1059 * stops unnecessary timeouts. 1060 */ 1061 meta_mirror_resync_block_all(); 1062 1063 if (meta_lock(local_sp, TRUE, ep) != 0) { 1064 mde_perror(ep, ""); 1065 md_exit(local_sp, 1); 1066 } 1067 1068 /* 1069 * All metaset and metadb commands on this node have now 1070 * terminated gracefully. Now, issue a drain all to 1071 * the rpc.mdcommd. Any meta command issued after the 1072 * drain all will either spin sending the command to the 1073 * master until after the reconfig cycle has finished OR 1074 * will terminate gracefully (metaset/metadb). 1075 */ 1076 if ((max_sets = get_max_sets(ep)) == 0) { 1077 mde_perror(ep, ""); 1078 md_exit(sp, 1); 1079 } 1080 1081 /* start walking through all possible disksets */ 1082 for (setno = 1; setno < max_sets; setno++) { 1083 if ((sp = metasetnosetname(setno, ep)) == NULL) { 1084 if (mdiserror(ep, MDE_NO_SET)) { 1085 /* No set for this setno - continue */ 1086 mdclrerror(ep); 1087 continue; 1088 } else { 1089 mde_perror(ep, gettext("Unable to " 1090 "get set %d information"), setno); 1091 md_exit(sp, 1); 1092 } 1093 } 1094 1095 /* only check multi-node disksets */ 1096 if (!meta_is_mn_set(sp, ep)) { 1097 mdclrerror(ep); 1098 continue; 1099 } 1100 1101 meta_mc_log(MC_LOG3, gettext("Return - block parse " 1102 "messages for set %s: %s"), sp->setname, 1103 meta_print_hrtime(gethrtime() - start_time)); 1104 1105 /* 1106 * Mddb parse messages are sent amongst the nodes 1107 * in a diskset whenever the locator block or 1108 * locator names structure has been changed. 1109 * A locator block change could occur as a result 1110 * of a disk failure during the reconfig cycle, 1111 * so block the mddb parse messages while the 1112 * rpc.commd is suspended during the reconfig cycle. 1113 */ 1114 if (s_ownset(sp->setno, ep) == MD_SETOWNER_YES) { 1115 (void) memset(&mbp, 0, sizeof (mbp)); 1116 mbp.c_setno = setno; 1117 mbp.c_blk_flags = MDDB_BLOCK_PARSE; 1118 if (metaioctl(MD_MN_MDDB_BLOCK, &mbp, 1119 &mbp.c_mde, NULL)) { 1120 mdstealerror(ep, &mbp.c_mde); 1121 mde_perror(ep, gettext("Could not " 1122 "block set %s"), sp->setname); 1123 md_exit(sp, 1); 1124 } 1125 } 1126 1127 /* suspend commd and spin waiting for drain */ 1128 while ((ret_val = mdmn_suspend(setno, 1129 MD_COMM_ALL_CLASSES)) == 1130 MDE_DS_COMMDCTL_SUSPEND_NYD) { 1131 sleep(1); 1132 } 1133 1134 if (ret_val) { 1135 md_eprintf(gettext("Could not suspend " 1136 "rpc.mdcommd for set %s\n"), sp->setname); 1137 md_exit(sp, 1); 1138 } 1139 } 1140 /* 1141 * Resume all I/Os for this node for all MN sets in 1142 * case master node had suspended I/Os but panic'd 1143 * before resuming I/Os. In case of failure, exit 1144 * with a 1 since unable to resume I/Os on this node. 1145 */ 1146 if (clnt_mn_susp_res_io(mynode(), 0, MN_RES_IO, ep)) { 1147 mde_perror(ep, gettext( 1148 "Unable to resume I/O on node %s for all sets"), 1149 mynode()); 1150 md_exit(sp, 1); 1151 } 1152 1153 1154 /* 1155 * Can now unlock local set lock. New metaset/metadb 1156 * commands are now held off using drain all. 1157 */ 1158 (void) meta_unlock(local_sp, ep); 1159 1160 meta_mc_log(MC_LOG2, gettext("Return step completed: %s"), 1161 meta_print_hrtime(gethrtime() - start_time)); 1162 1163 break; 1164 1165 case MC_STEP1: 1166 /* 1167 * Step 1 1168 * 1169 * - Populate nodelist file if we are on clustering 1170 * and pick a master node for each MN diskset. 1171 */ 1172 1173 /* expect the nodelist to follow the step name */ 1174 if (argc < 1) 1175 usage(sp, 1); 1176 1177 meta_mc_log(MC_LOG2, gettext("Starting Step1: %s"), 1178 meta_print_hrtime(0)); 1179 1180 /* Always write nodelist file even if no local set exists */ 1181 if (clust == SDSSC_OKAY) { 1182 /* skip to the nodelist args */ 1183 if (meta_write_nodelist(argc, argv, ep) != 0) { 1184 mde_perror(ep, gettext( 1185 "Could not populate nodelist file")); 1186 md_exit(sp, 1); 1187 } 1188 } 1189 1190 /* 1191 * Does local set exist? If not, exit with 0 1192 * since there's no reason to have this node panic if 1193 * the local set cannot be started. 1194 */ 1195 if ((local_sp = load_local_set(ep)) == NULL) { 1196 md_exit(local_sp, 0); 1197 } 1198 1199 /* 1200 * At this point, all meta* commands are blocked across 1201 * all disksets since the master rpc.mdcommd has drained or 1202 * the master node has died. 1203 * If a metaset or metadb command had been in progress 1204 * at the start of the reconfig cycle, this command has 1205 * either completed or it has been terminated due to 1206 * the death of the master node. 1207 * 1208 * This means that that it is now ok to remove any 1209 * outstanding clnt_locks associated with multinode 1210 * disksets on this node due to a node panic during 1211 * a metaset operation. This allows the routines that 1212 * choose the master to use rpc.metad to determine the 1213 * master of the diskset. 1214 */ 1215 if (clnt_clr_mnsetlock(mynode(), ep) != 0) { 1216 meta_mc_log(MC_LOG2, gettext("Step1 aborted:" 1217 "clear locks failed %s"), 1218 meta_print_hrtime(gethrtime() - start_time)); 1219 md_exit(local_sp, 1); 1220 } 1221 1222 /* 1223 * Call reconfig_choose_master to choose a master for 1224 * each MN diskset, update the nodelist for each diskset 1225 * given the member information and send a reinit message 1226 * to rpc.mdcommd to reload the nodelist. 1227 */ 1228 rval = meta_reconfig_choose_master(ep); 1229 if (rval == 205) { 1230 /* 1231 * NOTE: Should issue call to reboot remote host that 1232 * is causing the RPC failure. Clustering to 1233 * provide interface in the future. This should 1234 * stop a never-ending set of 205 reconfig cycles. 1235 * Remote host causing failure is stored in 1236 * ep->host if ep is an RPC error. 1237 * if (mdanyrpcerror(ep)) 1238 * reboot (ep->host); 1239 */ 1240 meta_mc_log(MC_LOG2, gettext("Step1 aborted:" 1241 "choose master failure of 205 %s"), 1242 meta_print_hrtime(gethrtime() - start_time)); 1243 md_exit(local_sp, 205); 1244 } else if (rval != 0) { 1245 meta_mc_log(MC_LOG2, gettext("Step1 failure: " 1246 "choose master failure %s"), 1247 meta_print_hrtime(gethrtime() - start_time)); 1248 md_exit(local_sp, 1); 1249 } 1250 1251 meta_mc_log(MC_LOG2, gettext("Step1 completed: %s"), 1252 meta_print_hrtime(gethrtime() - start_time)); 1253 1254 md_exit(local_sp, rval); 1255 break; 1256 1257 case MC_STEP2: 1258 /* 1259 * Step 2 1260 * 1261 * In Step 2, each node walks the list of disksets. If a 1262 * node is a master of a MN diskset, it synchronizes 1263 * the local set USER records for that diskset. 1264 * 1265 * If disks exist in the diskset and there is a joined 1266 * (owner) node in the diskset, the master will also: 1267 * - synchronize the diskset mddbs to the master 1268 * - play the change log 1269 * 1270 * The master node will now attempt to join any unjoined 1271 * nodes that are currently members in the membership list. 1272 */ 1273 1274 /* expect the nodelist to follow the step name */ 1275 if (argc < 1) 1276 usage(sp, 1); 1277 1278 meta_mc_log(MC_LOG2, gettext("Starting Step2: %s"), 1279 meta_print_hrtime(0)); 1280 1281 /* 1282 * Does local set exist? If not, exit with 0 1283 * since there's no reason to have this node panic if 1284 * the local set cannot be started. 1285 */ 1286 if ((local_sp = load_local_set(ep)) == NULL) { 1287 md_exit(local_sp, 0); 1288 } 1289 1290 if ((max_sets = get_max_sets(ep)) == 0) { 1291 mde_perror(ep, ""); 1292 md_exit(local_sp, 1); 1293 } 1294 1295 /* start walking through all possible disksets */ 1296 for (setno = 1; setno < max_sets; setno++) { 1297 if ((sp = metasetnosetname(setno, ep)) == NULL) { 1298 if (mdiserror(ep, MDE_NO_SET)) { 1299 /* No set for this setno - continue */ 1300 mdclrerror(ep); 1301 continue; 1302 } else if (mdanyrpcerror(ep)) { 1303 /* Fail on RPC failure to self */ 1304 mde_perror(ep, gettext( 1305 "Unable to get information for " 1306 "set number %d"), setno); 1307 md_exit(local_sp, 1); 1308 } else { 1309 mde_perror(ep, gettext( 1310 "Unable to get information for " 1311 "set number %d"), setno); 1312 mdclrerror(ep); 1313 continue; 1314 } 1315 } 1316 1317 if ((sd = metaget_setdesc(sp, ep)) == NULL) { 1318 if (mdanyrpcerror(ep)) { 1319 /* Fail on RPC failure to self */ 1320 mde_perror(ep, gettext( 1321 "Unable to get information for " 1322 "set number %d"), setno); 1323 md_exit(local_sp, 1); 1324 } 1325 mde_perror(ep, gettext("Unable to get set " 1326 "%s desc information"), sp->setname); 1327 mdclrerror(ep); 1328 continue; 1329 } 1330 1331 /* Only check MN disksets */ 1332 if (!(MD_MNSET_DESC(sd))) { 1333 continue; 1334 } 1335 1336 /* All actions in step 2 are driven by master */ 1337 if (!(sd->sd_mn_am_i_master)) { 1338 continue; 1339 } 1340 1341 meta_mc_log(MC_LOG3, gettext("Step2 - begin record " 1342 "synchronization for set %s: %s"), sp->setname, 1343 meta_print_hrtime(gethrtime() - start_time)); 1344 1345 /* 1346 * Synchronize the USER records in the local mddbs 1347 * for hosts that are members. The USER records 1348 * contain set, drive and host information. 1349 */ 1350 rval = meta_mnsync_user_records(sp, ep); 1351 if (rval != 0) { 1352 mde_perror(ep, gettext( 1353 "Synchronization of user records " 1354 "in set %s failed\n"), sp->setname); 1355 if (rval == 205) { 1356 /* 1357 * NOTE: Should issue call to reboot 1358 * remote host that is causing the RPC 1359 * failure. Clustering to provide 1360 * interface in the future. This 1361 * should stop a never-ending set of 1362 * 205 reconfig cycles. 1363 * Remote host causing failure is 1364 * stored in ep->host if ep is an 1365 * RPC error. 1366 * if (mdanyrpcerror(ep)) 1367 * reboot (ep->host); 1368 */ 1369 md_exit(local_sp, 205); 1370 } else { 1371 md_exit(local_sp, 1); 1372 } 1373 } 1374 1375 /* Reget sd since sync_user_recs may have flushed it */ 1376 if ((sd = metaget_setdesc(sp, ep)) == NULL) { 1377 mde_perror(ep, gettext("Unable to get set " 1378 "%s desc information"), sp->setname); 1379 md_exit(local_sp, 1); 1380 } 1381 1382 dd = metaget_drivedesc(sp, 1383 (MD_BASICNAME_OK | PRINT_FAST), ep); 1384 if (! mdisok(ep)) { 1385 mde_perror(ep, gettext("Unable to get set " 1386 "%s drive information"), sp->setname); 1387 md_exit(local_sp, 1); 1388 } 1389 1390 /* 1391 * No drives in set, continue to next set. 1392 */ 1393 if (dd == NULL) { 1394 /* Done with this set */ 1395 continue; 1396 } 1397 1398 meta_mc_log(MC_LOG3, gettext("Step2 - local set user " 1399 "records completed for set %s: %s"), sp->setname, 1400 meta_print_hrtime(gethrtime() - start_time)); 1401 1402 /* 1403 * Synchronize the diskset mddbs for hosts 1404 * that are members. This may involve 1405 * playing the changelog and writing out 1406 * to the diskset mddbs. 1407 */ 1408 rval = meta_mnsync_diskset_mddbs(sp, ep); 1409 if (rval != 0) { 1410 mde_perror(ep, gettext( 1411 "Synchronization of diskset mddbs " 1412 "in set %s failed\n"), sp->setname); 1413 meta_mc_log(MC_LOG3, gettext("Step2 - diskset " 1414 "mddb synchronization failed for " 1415 "set %s: %s"), sp->setname, 1416 meta_print_hrtime(gethrtime() - 1417 start_time)); 1418 if (rval == 205) { 1419 /* 1420 * NOTE: Should issue call to reboot 1421 * remote host that is causing the RPC 1422 * failure. Clustering to provide 1423 * interface in the future. This 1424 * should stop a never-ending set of 1425 * 205 reconfig cycles. 1426 * Remote host causing failure is 1427 * stored in ep->host if ep is an 1428 * RPC error. 1429 * if (mdanyrpcerror(ep)) 1430 * reboot (ep->host); 1431 */ 1432 md_exit(local_sp, 205); 1433 } else if (rval == 1) { 1434 continue; 1435 } else { 1436 md_exit(local_sp, 1); 1437 } 1438 } 1439 1440 meta_mc_log(MC_LOG3, gettext("Step2 - diskset mddb " 1441 "synchronization completed for set %s: %s"), 1442 sp->setname, 1443 meta_print_hrtime(gethrtime() - start_time)); 1444 1445 /* Join the starting nodes to the diskset */ 1446 rval = meta_mnjoin_all(sp, ep); 1447 if (rval != 0) { 1448 mde_perror(ep, gettext( 1449 "Join of non-owner (starting) nodes " 1450 "in set %s failed\n"), sp->setname); 1451 meta_mc_log(MC_LOG3, gettext("Step2 - non owner" 1452 "nodes joined for set %s: %s"), 1453 sp->setname, 1454 meta_print_hrtime(gethrtime() - 1455 start_time)); 1456 if (rval == 205) { 1457 /* 1458 * NOTE: Should issue call to reboot 1459 * remote host that is causing the RPC 1460 * failure. Clustering to provide 1461 * interface in the future. This 1462 * should stop a never-ending set of 1463 * 205 reconfig cycles. 1464 * Remote host causing failure is 1465 * stored in ep->host if ep is an 1466 * RPC error. 1467 * if (mdanyrpcerror(ep)) 1468 * reboot (ep->host); 1469 */ 1470 md_exit(local_sp, 205); 1471 } else { 1472 md_exit(local_sp, 1); 1473 } 1474 } 1475 1476 meta_mc_log(MC_LOG3, gettext("Step2 - non owner nodes " 1477 "joined for set %s: %s"), sp->setname, 1478 meta_print_hrtime(gethrtime() - start_time)); 1479 1480 } 1481 1482 meta_mc_log(MC_LOG2, gettext("Step2 completed: %s"), 1483 meta_print_hrtime(gethrtime() - start_time)); 1484 1485 break; 1486 1487 case MC_STEP3: 1488 /* 1489 * Step 3 1490 * 1491 * For all multinode sets do, 1492 * - Reinitialise rpc.mdcommd 1493 * - Reset mirror owners to null if the current owner is 1494 * no longer in the membership list 1495 */ 1496 1497 /* expect the nodelist to follow the step name */ 1498 if (argc < 1) 1499 usage(sp, 1); 1500 1501 meta_mc_log(MC_LOG2, gettext("Starting Step3: %s"), 1502 meta_print_hrtime(0)); 1503 1504 /* 1505 * Does local set exist? If not, exit with 0 1506 * since there's no reason to have this node panic if 1507 * the local set cannot be started. 1508 */ 1509 if ((local_sp = load_local_set(ep)) == NULL) { 1510 md_exit(local_sp, 0); 1511 } 1512 1513 /* 1514 * walk through all sets on this node which could include: 1515 * - MN disksets 1516 * - traditional disksets 1517 * - non-existent disksets 1518 * start mirror resync for all MN sets 1519 */ 1520 if ((max_sets = get_max_sets(ep)) == 0) { 1521 mde_perror(ep, ""); 1522 md_exit(local_sp, 1); 1523 } 1524 1525 /* start walking through all possible disksets */ 1526 for (setno = 1; setno < max_sets; setno++) { 1527 if ((sp = metasetnosetname(setno, ep)) == NULL) { 1528 if (mdiserror(ep, MDE_NO_SET)) { 1529 /* No set for this setno - continue */ 1530 mdclrerror(ep); 1531 continue; 1532 } else { 1533 mde_perror(ep, gettext("Unable to " 1534 "get set %d information"), setno); 1535 md_exit(local_sp, 1); 1536 } 1537 } 1538 1539 /* only check multi-node disksets */ 1540 if (!meta_is_mn_set(sp, ep)) { 1541 mdclrerror(ep); 1542 continue; 1543 } 1544 1545 if (meta_lock(sp, TRUE, ep) != 0) { 1546 mde_perror(ep, ""); 1547 md_exit(local_sp, 1); 1548 } 1549 1550 /* If this node isn't joined to set, do nothing */ 1551 if (s_ownset(sp->setno, ep) != MD_SETOWNER_YES) { 1552 if (!mdisok(ep)) { 1553 mde_perror(ep, gettext("Could " 1554 "not get set %s ownership"), 1555 sp->setname); 1556 md_exit(sp, 1); 1557 } 1558 mdclrerror(ep); 1559 meta_unlock(sp, ep); 1560 continue; 1561 } 1562 1563 meta_mc_log(MC_LOG3, gettext("Step3 - begin " 1564 "re-initialising rpc.mdcommd and resetting mirror " 1565 "owners for set %s: %s"), sp->setname, 1566 meta_print_hrtime(gethrtime() - start_time)); 1567 1568 /* reinitialzse rpc.mdcommd with new nodelist */ 1569 if (mdmn_reinit_set(setno)) { 1570 md_eprintf(gettext( 1571 "Could not re-initialise rpc.mdcommd for " 1572 "set %s\n"), sp->setname); 1573 md_exit(sp, 1); 1574 } 1575 1576 (void) memset(&cfg, 0, sizeof (cfg)); 1577 cfg.c_id = 0; 1578 cfg.c_setno = sp->setno; 1579 if (metaioctl(MD_DB_GETDEV, &cfg, &cfg.c_mde, 1580 NULL) != 0) { 1581 mdstealerror(ep, &cfg.c_mde); 1582 mde_perror(ep, gettext("Could " 1583 "not get set %s information"), 1584 sp->setname); 1585 md_exit(sp, 1); 1586 } 1587 1588 /* Don't do anything else if set is stale */ 1589 if (cfg.c_flags & MDDB_C_STALE) { 1590 meta_unlock(sp, ep); 1591 mdclrerror(ep); 1592 continue; 1593 } 1594 1595 /* reset mirror owners */ 1596 if (reset_state(RESET_OWNER, sp, MD_MIRROR, ep) == -1) { 1597 md_exit(sp, 1); 1598 } 1599 1600 meta_unlock(sp, ep); 1601 1602 meta_mc_log(MC_LOG3, gettext("Step3 - rpc.mdcommd " 1603 "re-initialised and mirror owners reset for " 1604 "set %s: %s"), sp->setname, 1605 meta_print_hrtime(gethrtime() - start_time)); 1606 } 1607 1608 meta_mc_log(MC_LOG2, gettext("Step3 completed: %s"), 1609 meta_print_hrtime(gethrtime() - start_time)); 1610 1611 break; 1612 1613 case MC_STEP4: 1614 /* 1615 * Step 4 1616 * 1617 * For all multinode sets do: 1618 * - Resume the rpc.mdcommd messages. Must resume all 1619 * sets before issuing I/O to any set since an error 1620 * encountered in a commd suspended set could be 1621 * blocked waiting for commd in another set to resume. 1622 * (This happens since the daemon queues service 1623 * all sets). An open of a soft partition causes 1624 * a read of the watermarks during the open. 1625 * - If set is non-writable (not an owner or STALE), then 1626 * continue to next set. 1627 * 1628 * For all multinode sets do, 1629 * - Reset ABR states for all mirrors, ie clear ABR if not 1630 * open on any node. 1631 * - Reset ABR states for all soft partitions, ie clear ABR if 1632 * not open on any node. 1633 * - For all slave nodes that have entered through the start 1634 * step, update the ABR state to that of the master and 1635 * get the submirror state from the master 1636 * - meta_lock set 1637 * - Resync all mirrors 1638 * - unlock meta_lock for this set. 1639 * - Choose a new owner for any orphaned resyncs 1640 * 1641 * There is one potential issue here. when concurrently 1642 * resetting and updating the ABR state. If the master has ABR 1643 * set, but should no longer have because the only node that 1644 * had the metadevice open and had ABR set has paniced, the 1645 * master will send a message to all nodes to clear the ABR 1646 * state. Meanwhile any node that has come through the 1647 * start step will get tstate from the master and will update 1648 * ABR if it was set in tstate. So, we appear to have a problem 1649 * if the following sequence occurs:- 1650 * - The slave gets tstate with ABR set 1651 * - The master sends a message to clear ABR 1652 * - The slave updates ABR with the value it got from tstate. 1653 * We now have the master with ABR clear and the slave with ABR 1654 * set. Fortunately, having set ABR, the slave will close the 1655 * metadevice after setting ABR and as there are no nodes with 1656 * the device open, the close will send a message to clear ABR 1657 * on all nodes. So, the nodes will all have ABR unset. 1658 */ 1659 1660 /* expect the nodelist to follow the step name */ 1661 if (argc < 1) 1662 usage(sp, 1); 1663 1664 meta_mc_log(MC_LOG2, gettext("Starting Step4: %s"), 1665 meta_print_hrtime(0)); 1666 1667 /* 1668 * Does local set exist? If not, exit with 0 1669 * since there's no reason to have this node panic if 1670 * the local set cannot be started. 1671 */ 1672 if ((local_sp = load_local_set(ep)) == NULL) { 1673 md_exit(local_sp, 0); 1674 } 1675 1676 /* 1677 * walk through all sets on this node which could include: 1678 * - MN disksets 1679 * - traditional disksets 1680 * - non-existent disksets 1681 * start mirror resync for all MN sets 1682 */ 1683 if ((max_sets = get_max_sets(ep)) == 0) { 1684 mde_perror(ep, ""); 1685 md_exit(local_sp, 1); 1686 } 1687 1688 /* Clear set_info structure */ 1689 for (setno = 1; setno < max_sets; setno++) { 1690 set_info[setno] = 0; 1691 } 1692 1693 /* start walking through all possible disksets */ 1694 for (setno = 1; setno < max_sets; setno++) { 1695 if ((sp = metasetnosetname(setno, ep)) == NULL) { 1696 if (mdiserror(ep, MDE_NO_SET)) { 1697 /* No set for this setno - continue */ 1698 mdclrerror(ep); 1699 continue; 1700 } else { 1701 mde_perror(ep, gettext("Unable to " 1702 "get set %d information"), setno); 1703 md_exit(local_sp, 1); 1704 } 1705 } 1706 1707 if ((sd = metaget_setdesc(sp, ep)) == NULL) { 1708 mde_perror(ep, gettext("Unable to get set " 1709 "%s desc information"), sp->setname); 1710 mdclrerror(ep); 1711 continue; 1712 } 1713 1714 /* only check multi-node disksets */ 1715 if (!meta_is_mn_set(sp, ep)) { 1716 mdclrerror(ep); 1717 continue; 1718 } 1719 1720 set_info[setno] |= SET_INFO_MN; 1721 1722 /* 1723 * If not an owner (all mddbs failed) or stale 1724 * (< 50% mddbs operational), then set is 1725 * non-writable so just resume commd and 1726 * unblock mddb messages. 1727 */ 1728 mdclrerror(ep); 1729 if (s_ownset(sp->setno, ep) != MD_SETOWNER_YES) { 1730 set_info[setno] |= SET_INFO_NO_WR; 1731 } 1732 if (!mdisok(ep)) { 1733 mde_perror(ep, gettext("Could " 1734 "not get set %s ownership"), 1735 sp->setname); 1736 md_exit(local_sp, 1); 1737 } 1738 /* Set is owned - is it stale? */ 1739 if (!set_info[setno] & SET_INFO_NO_WR) { 1740 (void) memset(&cfg, 0, sizeof (cfg)); 1741 cfg.c_id = 0; 1742 cfg.c_setno = sp->setno; 1743 if (metaioctl(MD_DB_GETDEV, &cfg, &cfg.c_mde, 1744 NULL) != 0) { 1745 mdstealerror(ep, &cfg.c_mde); 1746 mde_perror(ep, gettext("Could " 1747 "not get set %s information"), 1748 sp->setname); 1749 md_exit(local_sp, 1); 1750 } 1751 if (cfg.c_flags & MDDB_C_STALE) { 1752 set_info[setno] |= SET_INFO_NO_WR; 1753 } 1754 } 1755 1756 /* resume rpc.mdcommd */ 1757 if (mdmn_resume(setno, MD_COMM_ALL_CLASSES, 0)) { 1758 md_eprintf(gettext("Unable to resume " 1759 "rpc.mdcommd for set %s\n"), sp->setname); 1760 md_exit(local_sp, 1); 1761 } 1762 meta_ping_mnset(setno); 1763 1764 /* Unblock mddb parse messages */ 1765 if (s_ownset(sp->setno, ep) == MD_SETOWNER_YES) { 1766 (void) memset(&mbp, 0, sizeof (mbp)); 1767 mbp.c_setno = setno; 1768 mbp.c_blk_flags = MDDB_UNBLOCK_PARSE; 1769 if (metaioctl(MD_MN_MDDB_BLOCK, &mbp, 1770 &mbp.c_mde, NULL)) { 1771 mdstealerror(ep, &mbp.c_mde); 1772 mde_perror(ep, gettext("Could not " 1773 "unblock set %s"), sp->setname); 1774 md_exit(local_sp, 1); 1775 } 1776 } 1777 meta_mc_log(MC_LOG3, gettext("Step4 - rpc.mdcommd " 1778 "resumed and messages unblocked for set %s: %s"), 1779 sp->setname, 1780 meta_print_hrtime(gethrtime() - start_time)); 1781 } 1782 1783 for (setno = 1; setno < max_sets; setno++) { 1784 int start_step; 1785 1786 /* Skip traditional disksets. */ 1787 if ((set_info[setno] & SET_INFO_MN) == 0) 1788 continue; 1789 1790 /* 1791 * If already determined that this set is 1792 * a non-writable set, then just continue 1793 * to next set since there's nothing else 1794 * to do for a non-writable set. 1795 */ 1796 if (set_info[setno] & SET_INFO_NO_WR) 1797 continue; 1798 1799 if ((sp = metasetnosetname(setno, ep)) == NULL) { 1800 if (mdiserror(ep, MDE_NO_SET)) { 1801 /* No set for this setno - continue */ 1802 mdclrerror(ep); 1803 continue; 1804 } else { 1805 mde_perror(ep, gettext("Unable to " 1806 "get set %d information"), setno); 1807 md_exit(local_sp, 1); 1808 } 1809 } 1810 1811 if ((sd = metaget_setdesc(sp, ep)) == NULL) { 1812 mde_perror(ep, gettext("Unable to get set " 1813 "%s desc information"), sp->setname); 1814 mdclrerror(ep); 1815 continue; 1816 } 1817 1818 /* See if this node came through the start step */ 1819 (void) memset(&sf, 0, sizeof (sf)); 1820 sf.sf_setno = sp->setno; 1821 sf.sf_flags = MDDB_NM_GET; 1822 /* Use magic to help protect ioctl against attack. */ 1823 sf.sf_magic = MDDB_SETFLAGS_MAGIC; 1824 if (metaioctl(MD_MN_SET_SETFLAGS, &sf, 1825 &sf.sf_mde, NULL)) { 1826 mdstealerror(ep, &sf.sf_mde); 1827 mde_perror(ep, gettext("Could not get " 1828 "start_step flag for set %s"), sp->setname); 1829 md_exit(local_sp, 1); 1830 } 1831 start_step = 1832 (sf.sf_setflags & MD_SET_MN_START_RC)? 1: 0; 1833 1834 /* 1835 * We can now reset the start_step flag for the set 1836 * if it was already set. 1837 */ 1838 if (start_step) { 1839 (void) memset(&sf, 0, sizeof (sf)); 1840 sf.sf_setno = sp->setno; 1841 sf.sf_setflags = MD_SET_MN_START_RC; 1842 sf.sf_flags = MDDB_NM_RESET; 1843 /* 1844 * Use magic to help protect ioctl 1845 * against attack. 1846 */ 1847 sf.sf_magic = MDDB_SETFLAGS_MAGIC; 1848 if (metaioctl(MD_MN_SET_SETFLAGS, &sf, 1849 &sf.sf_mde, NULL)) { 1850 mdstealerror(ep, &sf.sf_mde); 1851 mde_perror(ep, 1852 gettext("Could not reset " 1853 "start_step flag for set %s"), 1854 sp->setname); 1855 } 1856 } 1857 1858 meta_mc_log(MC_LOG3, gettext("Step4 - begin setting " 1859 "ABR state and restarting io's for " 1860 "set %s: %s"), sp->setname, 1861 meta_print_hrtime(gethrtime() - start_time)); 1862 1863 1864 /* 1865 * If we are not the master and we have come through 1866 * the start step, we must update the ABR states 1867 * for mirrors and soft partitions. Also the submirror 1868 * states need to be synchronised so that we see the 1869 * same status as other previously joined members. 1870 * This _must_ be done before starting the resync. 1871 */ 1872 if (!(sd->sd_mn_am_i_master) && start_step) { 1873 if (reset_state(GET_MIRROR_STATE, sp, MD_MIRROR, 1874 ep) == -1) { 1875 md_exit(local_sp, 1); 1876 } 1877 if (reset_state(UPDATE_ABR, sp, MD_SP, 1878 ep) == -1) { 1879 md_exit(local_sp, 1); 1880 } 1881 /* 1882 * Mark the fact that we've got the mirror 1883 * state. This allows the resync thread to 1884 * determine if _it_ needs to issue this. This 1885 * can happen if a node is added to a set after 1886 * a reconfig cycle has completed. 1887 */ 1888 (void) memset(&sf, 0, sizeof (sf)); 1889 sf.sf_setno = sp->setno; 1890 sf.sf_setflags = MD_SET_MN_MIR_STATE_RC; 1891 sf.sf_flags = MDDB_NM_SET; 1892 /* 1893 * Use magic to help protect ioctl 1894 * against attack. 1895 */ 1896 sf.sf_magic = MDDB_SETFLAGS_MAGIC; 1897 if (metaioctl(MD_MN_SET_SETFLAGS, &sf, 1898 &sf.sf_mde, NULL)) { 1899 mdstealerror(ep, &sf.sf_mde); 1900 mde_perror(ep, 1901 gettext("Could not set " 1902 "submirror state flag for set %s"), 1903 sp->setname); 1904 } 1905 } 1906 1907 /* 1908 * All remaining actions are only performed by the 1909 * master 1910 */ 1911 if (!(sd->sd_mn_am_i_master)) { 1912 if (meta_lock(sp, TRUE, ep) != 0) { 1913 mde_perror(ep, ""); 1914 md_exit(local_sp, 1); 1915 } 1916 meta_mirror_resync_unblock(sp); 1917 meta_unlock(sp, ep); 1918 continue; 1919 } 1920 1921 /* 1922 * If the master came through the start step, this 1923 * implies that all of the nodes must have done the 1924 * same and hence there can be no applications 1925 * running. Hence no need to reset ABR 1926 */ 1927 if (!start_step) { 1928 /* Reset ABR state for mirrors */ 1929 if (reset_state(RESET_ABR, sp, MD_MIRROR, 1930 ep) == -1) { 1931 md_exit(local_sp, 1); 1932 } 1933 /* ...and now the same for soft partitions */ 1934 if (reset_state(RESET_ABR, sp, MD_SP, 1935 ep) == -1) { 1936 md_exit(local_sp, 1); 1937 } 1938 } 1939 1940 /* 1941 * choose owners for orphaned resyncs and reset 1942 * non-orphaned resyncs so that an owner node that 1943 * reboots will restart the resync if needed. 1944 */ 1945 if (reset_state(CHOOSE_OWNER, sp, MD_MIRROR, ep) == -1) 1946 md_exit(local_sp, 1); 1947 1948 /* 1949 * Must unlock set lock before meta_mirror_resync_all 1950 * sends a message to run the metasync command 1951 * which also grabs the meta_lock. 1952 */ 1953 if (meta_lock(sp, TRUE, ep) != 0) { 1954 mde_perror(ep, ""); 1955 md_exit(local_sp, 1); 1956 } 1957 meta_mirror_resync_unblock(sp); 1958 meta_unlock(sp, ep); 1959 1960 /* resync all mirrors in set */ 1961 if (meta_mirror_resync_all(sp, 0, ep) != 0) { 1962 mde_perror(ep, gettext("Mirror resyncs " 1963 "failed for set %s"), sp->setname); 1964 md_exit(local_sp, 1); 1965 } 1966 1967 meta_mc_log(MC_LOG3, gettext("Step4 - io's restarted " 1968 "for set %s: %s"), sp->setname, 1969 meta_print_hrtime(gethrtime() - start_time)); 1970 } 1971 1972 meta_mc_log(MC_LOG2, gettext("Step4 completed: %s"), 1973 meta_print_hrtime(gethrtime() - start_time)); 1974 1975 break; 1976 1977 default: 1978 usage(sp, 1); 1979 break; 1980 } 1981 1982 md_exit(sp, 0); 1983 /* NOTREACHED */ 1984 return (0); 1985 } 1986