1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2010 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #include <meta.h> 28 #include <sdssc.h> 29 #include <signal.h> 30 #include <syslog.h> 31 #include <sys/types.h> 32 #include <sys/wait.h> 33 #include <sys/lvm/md_mirror.h> 34 #include <metad.h> 35 36 #define MY_VERSION "1.0" /* the highest supported version */ 37 #define MAX_DEBUG_LEVEL 5 /* maximum verbosity level */ 38 39 #define RESET_OWNER 0x0001 40 #define CHOOSE_OWNER 0x0002 41 #define RESET_ABR 0x0004 42 #define UPDATE_ABR 0x0008 43 #define GET_MIRROR_STATE 0x0010 44 45 #define SET_INFO_NO_WR 0x0002 46 #define SET_INFO_MN 0x0004 47 48 /* 49 * This table defines all the metaclust reconfig steps we understand 50 */ 51 typedef enum stpnum { 52 MC_UNK = 0, 53 MC_START, 54 MC_STOP, 55 MC_ABORT, 56 MC_RETURN, 57 MC_STEP1, 58 MC_STEP2, 59 MC_STEP3, 60 MC_STEP4 61 } stepnum_t; 62 63 /* 64 * Structure for step_name -> step_number mapping 65 */ 66 struct step_t { 67 char *step_nam; 68 stepnum_t step_num; 69 }; 70 71 /* 72 * Step name to step number mapping table 73 * This table MUST be sorted alphabetically in 
ascending order of step name 74 */ 75 static struct step_t step_table[] = { 76 { "abort", MC_ABORT }, 77 { "return", MC_RETURN }, 78 { "start", MC_START }, 79 { "step1", MC_STEP1 }, 80 { "step2", MC_STEP2 }, 81 { "step3", MC_STEP3 }, 82 { "step4", MC_STEP4 }, 83 { "stop", MC_STOP } 84 }; 85 86 /* 87 * If support for a different version is added, the new version number should 88 * be appended to the version_table below. This list will be searched to 89 * determine if a version requested via the -V option is supported or not. 90 */ 91 static char *version_table[] = { 92 MY_VERSION 93 }; 94 95 uint_t timeout = 0; /* disable timeout by default */ 96 char *version = MY_VERSION; /* use latest version by default */ 97 int stepnum = MC_UNK; /* reconfiguration step number */ 98 pid_t c_pid; /* child process id */ 99 100 /* 101 * Binary search comparison routine 102 */ 103 static int 104 mc_compare(const void *stp1, const void *stp2) 105 { 106 return (strcmp((const char *)stp1, 107 ((const struct step_t *)stp2)->step_nam)); 108 } 109 110 /* 111 * Timeout expiry alarm signal handler 112 */ 113 /*ARGSUSED*/ 114 static void 115 sigalarmhandler(int sig) 116 { 117 int i, n, ret, stat_loc = 0; 118 FILE *pgcore; 119 char corecmd[256]; 120 121 n = sizeof (step_table) / sizeof (step_table[0]); 122 for (i = 0; i < n; i++) { 123 if (stepnum == step_table[i].step_num) 124 break; 125 } 126 127 assert(i != n); 128 129 meta_mc_log(MC_LOG1, gettext("Timeout expired in %s: %s"), 130 step_table[i].step_nam, 131 meta_print_hrtime(gethrtime() - start_time)); 132 133 /* 134 * See what the child was actually doing when the timeout expired. 
135 * A core-dump of this would be _really_ good, so let's just 136 * try a 'gcore -g c_pid' and hope 137 */ 138 139 (void) memset(corecmd, 0, sizeof (corecmd)); 140 (void) snprintf(corecmd, sizeof (corecmd), 141 "/bin/gcore -g %d >/dev/null 2>&1", (int)c_pid); 142 143 pgcore = popen(corecmd, "r"); 144 145 if (pgcore == NULL) { 146 meta_mc_log(MC_LOG1, gettext("Could not grab core for pid %s"), 147 c_pid); 148 } else { 149 (void) pclose(pgcore); 150 } 151 152 if ((ret = kill(c_pid, SIGKILL)) == 0) { 153 /* 154 * The child will wait forever until the status is retrieved 155 * so get it now. Keep retrying if the call is interrupted. 156 * 157 * The possible results are, 158 * 159 * - child killed successfully 160 * - signal sent but child not killed 161 * - waitpid failed/interrupted 162 */ 163 (void) sleep(2); 164 while ((ret = waitpid(c_pid, &stat_loc, WNOHANG)) < 0) { 165 if (errno != EINTR) { 166 break; 167 } 168 } 169 if ((ret == c_pid) || (errno == ECHILD)) { 170 ret = 0; 171 } else { 172 ret = 1; 173 } 174 } else if (errno == ESRCH) { 175 /* 176 * If the kill did not catch the child then it means the child 177 * exited immediately after the timeout occured. 178 */ 179 ret = 0; 180 } 181 182 /* 183 * make sure not to exit with 205 for any steps other than step1-step4. 184 * Suncluster reconfiguration can't handle it otherwise. 185 */ 186 switch (stepnum) { 187 case MC_STEP1: 188 case MC_STEP2: 189 case MC_STEP3: 190 case MC_STEP4: 191 /* 192 * If the child was killed successfully return 205 for a 193 * new reconfig cycle otherwise send 1 to panic the node. 194 */ 195 if (ret != 0) { 196 md_eprintf(gettext("Could not kill child\n")); 197 exit(1); 198 } else { 199 exit(205); 200 } 201 break; 202 case MC_START: 203 case MC_STOP: 204 case MC_ABORT: 205 case MC_RETURN: 206 default: 207 exit(1); 208 break; 209 } 210 } 211 212 /* 213 * Attempt to load local set. 214 * Returns: 215 * pointer to mdsetname_t for local set (local_sp) is successful. 
216 * 0 if failure 217 * if there are no local set mddbs, no error message is printed. 218 * Otherwise, error message is printed so that user 219 * can determine why the local set didn't start. 220 */ 221 mdsetname_t * 222 load_local_set(md_error_t *ep) 223 { 224 mdsetname_t *local_sp = NULL; 225 226 /* Does local set exist? If not, give no error */ 227 if ((local_sp = metasetname(MD_LOCAL_NAME, ep)) == NULL) { 228 return (0); 229 } 230 231 /* 232 * snarf local set 233 * If fails with MDE_DB_NODB, then just return 1 printing 234 * no failure. 235 * Otherwise, print error message, and return 1. 236 */ 237 if (meta_setup_db_locations(ep) != 0) { 238 if (!(mdismddberror(ep, MDE_DB_NODB))) 239 mde_perror(ep, ""); 240 return (0); 241 } 242 243 /* local set loaded successfully */ 244 return (local_sp); 245 } 246 247 /* 248 * Purpose: Compose a full path name for a metadevice 249 * 250 * On entry: sp - setname pointer 251 * mnum - minor number of metadevice 252 * pathname - pointer to array to return path string 253 * pathlen - max length of pathname array 254 */ 255 static int 256 compose_path(mdsetname_t *sp, int mnum, char *pathname, int pathlen) 257 { 258 int rtn; 259 mdname_t *np; 260 md_error_t status = mdnullerror; 261 262 if (MD_MIN2SET(mnum) != sp->setno) { 263 md_eprintf(gettext("minor number 0x%x invalid for set %d\n"), 264 mnum, sp->setno); 265 return (-1); 266 } 267 268 if ((np = metamnumname(&sp, mnum, 0, &status)) == NULL) { 269 return (-1); 270 } 271 272 rtn = snprintf(pathname, pathlen, "%s", np->rname); 273 274 if ((pathname[0] == '\0') || (rtn >= pathlen)) { 275 md_eprintf(gettext( 276 "Could not create path for device %s\n"), 277 get_mdname(sp, mnum)); 278 return (-1); 279 } 280 return (0); 281 } 282 283 /* 284 * Purpose: Walk through all the devices specified for the given set 285 * and do the action specified in mode 286 */ 287 static int 288 reset_state(uint_t mode, mdsetname_t *sp, char *drivername, md_error_t *ep) 289 { 290 mdnamelist_t *devnlp = 
NULL; 291 mdnamelist_t *p; 292 mdname_t *devnp = NULL; 293 md_set_mmown_params_t ownpar_p; 294 md_set_mmown_params_t *ownpar = &ownpar_p; 295 md_unit_t *mm; 296 int mirror_dev = 0; 297 mndiskset_membershiplist_t *nl; 298 int cnt; 299 int has_parent; 300 md_mn_get_mir_state_t mir_state_p; 301 md_mn_get_mir_state_t *mir_state = &mir_state_p; 302 303 /* 304 * if we are choosing or resetting the owners then make sure 305 * we are only doing it for mirror devices 306 */ 307 mirror_dev = (strcmp(MD_MIRROR, drivername) == 0); 308 if ((mode & (RESET_OWNER | CHOOSE_OWNER)) && !mirror_dev) { 309 return (-1); 310 } 311 312 /* get a list of all the metadevices for current set */ 313 if (mirror_dev && meta_get_mirror_names(sp, &devnlp, 0, ep) < 0) { 314 mde_perror(ep, gettext("Could not get mirrors for set %s"), 315 sp->setname); 316 return (-1); 317 } else if (meta_get_sp_names(sp, &devnlp, 0, ep) < 0) { 318 mde_perror(ep, gettext( 319 "Could not get soft partitions for set %s"), sp->setname); 320 return (-1); 321 } 322 323 /* If resetting the owner, get the known membership list */ 324 if (mode & RESET_OWNER) { 325 if (meta_read_nodelist(&cnt, &nl, ep)) { 326 mde_perror(ep, "Could not get nodelist"); 327 return (-1); 328 } 329 } 330 331 /* for each metadevice */ 332 for (p = devnlp; (p != NULL); p = p->next) { 333 devnp = p->namep; 334 335 /* 336 * Get the current setting for mirror ABR state and all of the 337 * submirror state and flags from the master node. We only 338 * perform this when going through a 'start' cycle. 
339 */ 340 if ((mode & GET_MIRROR_STATE) && mirror_dev) { 341 char *miscname; 342 343 /* 344 * Ensure that we ignore soft-parts that are returned 345 * from the meta_get_mirror_names() call 346 */ 347 if ((miscname = metagetmiscname(devnp, ep)) == NULL) 348 goto out; 349 if (strcmp(miscname, MD_MIRROR) != 0) 350 continue; 351 352 mir_state->mnum = meta_getminor(devnp->dev); 353 MD_SETDRIVERNAME(mir_state, MD_MIRROR, sp->setno); 354 meta_mc_log(MC_LOG4, gettext("Getting mirror state" 355 " for %s: %s"), get_mdname(sp, mir_state->mnum), 356 meta_print_hrtime(gethrtime() - start_time)); 357 358 if (metaioctl(MD_MN_GET_MIRROR_STATE, mir_state, ep, 359 "MD_MN_GET_MIRROR_STATE") != 0) { 360 mde_perror(ep, gettext("Unable to get " 361 "mirror state for %s"), 362 get_mdname(sp, mir_state->mnum)); 363 goto out; 364 } else { 365 continue; 366 } 367 } 368 369 /* check if this is a top level metadevice */ 370 if ((mm = meta_get_mdunit(sp, devnp, ep)) == NULL) 371 goto out; 372 if (MD_HAS_PARENT(MD_PARENT(mm))) { 373 has_parent = 1; 374 } else { 375 has_parent = 0; 376 } 377 Free(mm); 378 379 if (mode & (RESET_OWNER | CHOOSE_OWNER)) { 380 char *miscname; 381 382 /* 383 * we can only do these for mirrors so make sure we 384 * really have a mirror device and not a softpartition 385 * imitating one. meta_get_mirror_names seems to think 386 * softparts on top of a mirror are mirrors! 
387 */ 388 if ((miscname = metagetmiscname(devnp, ep)) == NULL) 389 goto out; 390 if (strcmp(miscname, MD_MIRROR) != 0) 391 continue; 392 393 (void) memset(ownpar, 0, sizeof (*ownpar)); 394 ownpar->d.mnum = meta_getminor(devnp->dev); 395 MD_SETDRIVERNAME(ownpar, MD_MIRROR, sp->setno); 396 397 meta_mc_log(MC_LOG4, gettext("Setting owner " 398 "for %s: %s"), get_mdname(sp, ownpar->d.mnum), 399 meta_print_hrtime(gethrtime() - start_time)); 400 401 /* get the current owner id */ 402 if (metaioctl(MD_MN_GET_MM_OWNER, ownpar, ep, 403 "MD_MN_GET_MM_OWNER") != 0) { 404 mde_perror(ep, gettext("Unable to get " 405 "mirror owner for %s"), 406 get_mdname(sp, ownpar->d.mnum)); 407 goto out; 408 } 409 } 410 411 if (mode & RESET_OWNER) { 412 if (ownpar->d.owner == MD_MN_MIRROR_UNOWNED) { 413 mdclrerror(ep); 414 continue; 415 } 416 417 /* 418 * reset owner only if the current owner is 419 * not in the membership list 420 * Also kill the resync thread so that when the resync 421 * is started, it will perform an optimized resync 422 * for any resync regions that were dirty when the 423 * current owner left the membership. 424 */ 425 if (meta_is_member(NULL, ownpar->d.owner, nl) != 1) { 426 if (meta_mn_change_owner(&ownpar, 427 sp->setno, ownpar->d.mnum, 428 MD_MN_MIRROR_UNOWNED, 429 MD_MN_MM_ALLOW_CHANGE) == -1) { 430 md_eprintf(gettext( 431 "Unable to reset mirror owner " 432 "for %s\n"), 433 get_mdname(sp, ownpar->d.mnum)); 434 goto out; 435 } 436 if (meta_mirror_resync(sp, devnp, 0, ep, 437 MD_RESYNC_KILL_NO_WAIT) != 0) { 438 md_eprintf(gettext( 439 "Unable to kill resync for" 440 " %s\n"), 441 get_mdname(sp, ownpar->d.mnum)); 442 goto out; 443 } 444 } 445 } 446 447 if (mode & CHOOSE_OWNER) { 448 /* 449 * only orphaned resyncs will have no owner. 450 * if that is the case choose a new owner. Otherwise 451 * re-establish the existing owner. 
This covers the 452 * case where a node that owned the mirror 453 * reboots/panics and comes back into the cluster before 454 * the reconfig cycle has completed. In this case the 455 * other cluster nodes will have the mirror owner marked 456 * as the rebooted node while it has the owner marked 457 * as 'None'. We have to reestablish the ownership so 458 * that the subsequent resync can continue. 459 */ 460 if (meta_mn_change_owner(&ownpar, sp->setno, 461 ownpar->d.mnum, ownpar->d.owner, 462 MD_MN_MM_CHOOSE_OWNER) == -1) { 463 md_eprintf(gettext("Unable to choose " 464 "mirror owner for %s\n"), 465 get_mdname(sp, ownpar->d.mnum)); 466 goto out; 467 } 468 } 469 470 /* 471 * For RESET_ABR and UPDATE_ABR - only handle top 472 * level metadevices. 473 */ 474 if (has_parent) 475 continue; 476 477 if (mode & RESET_ABR) { 478 /* 479 * Reset the ABR (application based recovery) 480 * value on all nodes. We are dealing with 481 * the possibility that we have ABR set but the 482 * only node that had the device open with ABR has 483 * left the cluster. We simply open and close the 484 * device and if this is the last close in the 485 * cluster, ABR will be cleared on all nodes. 486 */ 487 char *miscname; 488 char name[MAXPATHLEN]; 489 int mnum, fd; 490 491 name[0] = '\0'; 492 mnum = meta_getminor(devnp->dev); 493 494 /* 495 * Ensure that we don't include soft-parts in the 496 * mirror-only call to RESET_ABR. meta_get_mirror_names 497 * returns a bogus list that includes all soft-parts 498 * built on mirrors. 
499 */ 500 if ((miscname = metagetmiscname(devnp, ep)) == NULL) 501 goto out; 502 if (mirror_dev && (strcmp(miscname, MD_MIRROR) != 0)) 503 continue; 504 505 meta_mc_log(MC_LOG4, gettext("Re-setting ABR state " 506 "for %s: %s"), get_mdname(sp, mnum), 507 meta_print_hrtime(gethrtime() - start_time)); 508 509 /* compose the absolute device path and open it */ 510 if (compose_path(sp, mnum, &name[0], 511 sizeof (name)) != 0) 512 goto out; 513 if ((fd = open(name, O_RDWR, 0)) < 0) { 514 md_perror(gettext("Could not open device %s"), 515 name); 516 continue; 517 } 518 519 (void) close(fd); 520 } 521 522 if (mode & UPDATE_ABR) { 523 /* 524 * Update the ABR value on this node. We obtain the 525 * current ABR state from the master node. 526 */ 527 528 char *miscname; 529 char name[MAXPATHLEN]; 530 int mnum, fd; 531 volcap_t vc; 532 uint_t tstate; 533 534 name[0] = '\0'; 535 mnum = meta_getminor(devnp->dev); 536 537 /* 538 * Ensure that we don't include soft-parts in the 539 * mirror-only call to UPDATE_ABR. meta_get_mirror_names 540 * returns a bogus list that includes all soft-parts 541 * built on mirrors. 
542 */ 543 if ((miscname = metagetmiscname(devnp, ep)) == NULL) 544 goto out; 545 if (mirror_dev && (strcmp(miscname, MD_MIRROR) != 0)) 546 continue; 547 548 /* Get tstate from Master */ 549 if (meta_mn_send_get_tstate(devnp->dev, &tstate, ep) 550 != 0) 551 continue; 552 /* If not set on the master, nothing to do */ 553 if (!(tstate & MD_ABR_CAP)) 554 continue; 555 556 meta_mc_log(MC_LOG4, gettext("Updating ABR state " 557 "for %s: %s"), get_mdname(sp, mnum), 558 meta_print_hrtime(gethrtime() - start_time)); 559 560 /* compose the absolute device path and open it */ 561 if (compose_path(sp, mnum, &name[0], 562 sizeof (name)) != 0) 563 goto out; 564 if ((fd = open(name, O_RDWR, 0)) < 0) { 565 md_perror(gettext("Could not open device %s"), 566 name); 567 continue; 568 } 569 570 /* set ABR state */ 571 vc.vc_info = 0; 572 vc.vc_set = 0; 573 if (ioctl(fd, DKIOCGETVOLCAP, &vc) < 0) { 574 /* 575 * Ignore if device does not support this 576 * ioctl 577 */ 578 if ((errno != ENOTTY) && (errno != ENOTSUP)) { 579 md_perror(gettext("Could not get " 580 "ABR/DMR state for device %s"), 581 name); 582 } 583 (void) close(fd); 584 continue; 585 } 586 if (!(vc.vc_info & (DKV_ABR_CAP | DKV_DMR_CAP))) { 587 (void) close(fd); 588 continue; 589 } 590 591 vc.vc_set = DKV_ABR_CAP; 592 if (ioctl(fd, DKIOCSETVOLCAP, &vc) < 0) { 593 md_perror(gettext( 594 "Could not set ABR state for " 595 "device %s"), name); 596 (void) close(fd); 597 goto out; 598 } else { 599 md_eprintf(gettext( 600 "Setting ABR state on device %s\n"), name); 601 } 602 603 (void) close(fd); 604 } 605 } 606 607 /* cleanup */ 608 if (mode & RESET_OWNER) { 609 meta_free_nodelist(nl); 610 } 611 metafreenamelist(devnlp); 612 return (0); 613 614 out: 615 /* cleanup */ 616 if (mode & RESET_OWNER) { 617 meta_free_nodelist(nl); 618 } 619 metafreenamelist(devnlp); 620 return (-1); 621 } 622 623 /* 624 * Print usage message 625 */ 626 static void 627 usage(mdsetname_t *sp, int eval) 628 { 629 (void) fprintf(stderr, gettext("usage:" 
630 "\t%s [-V version] [-t timeout] [-d level] start localnodeid\n" 631 "\t%s [-V version] [-t timeout] [-d level] step nodelist...\n" 632 "\t%s [-V version] [-t timeout] [-d level] abort | stop\n" 633 "\t%s [-V | -? | -h]\n"), 634 myname, myname, myname, myname); 635 if (!eval) { 636 (void) fprintf(stderr, gettext("\n" 637 "\tValid debug (-d) levels are 1-%d for increasing " 638 "verbosity.\n\tDefault is -d 3.\n\n" 639 "\tValid step values are: return | step1 | step2 | " 640 "step3 | step4\n\n" 641 "\tNodelist is a space-separated list of node id's\n\n"), 642 MAX_DEBUG_LEVEL); 643 } 644 md_exit(sp, eval); 645 } 646 647 /* 648 * Input: Input takes a config step name followed by a list of 649 * possible node id's. 650 * 651 * Returns: 0 - Success 652 * 1 - Fail 653 * Node will be removed from cluster membership 654 * by forcing node to panic. 655 * 205 - Unsuccessful. Start another reconfig cycle. 656 * Problem was encountered that could be fixed by 657 * running another reconfig cycle. 658 * Problem could be a result of a failure to read 659 * the nodelist file or that all work could not be 660 * accomplished in a reconfig step in the amount of 661 * time given so another reconfig cycle is needed in 662 * order to finish the current step. 663 */ 664 int 665 main(int argc, char **argv) 666 { 667 mdsetname_t *sp = NULL; 668 md_error_t status = mdnullerror; 669 md_error_t *ep = &status; 670 set_t max_sets, setno; 671 int c, clust = 0; 672 struct sigaction nsa, osa; 673 struct step_t *step_ptr; 674 mdsetname_t *local_sp = NULL; 675 md_drive_desc *dd; 676 int rval = 0; 677 md_set_desc *sd; 678 mddb_block_parm_t mbp; 679 uint_t debug = 3; /* log up to MC_LOG3 by default */ 680 int version_table_size; 681 mddb_setflags_config_t sf; 682 int ret_val; 683 mddb_config_t cfg; 684 int set_info[MD_MAXSETS]; 685 long commd_timeout = 0; 686 687 /* 688 * Get the locale set up before calling any other routines 689 * with messages to output. 
Just in case we're not in a build 690 * environment, make sure that TEXT_DOMAIN gets set to 691 * something. 692 */ 693 #if !defined(TEXT_DOMAIN) 694 #define TEXT_DOMAIN "SYS_TEST" 695 #endif 696 (void) setlocale(LC_ALL, ""); 697 (void) textdomain(TEXT_DOMAIN); 698 699 if ((clust = sdssc_bind_library()) == SDSSC_ERROR) { 700 md_eprintf(gettext("Interface error with libsds_sc.so\n")); 701 exit(1); 702 } 703 704 if (md_init(argc, argv, 1, 1, ep) != 0 || meta_check_root(ep) != 0) { 705 mde_perror(ep, ""); 706 md_exit(sp, 1); 707 } 708 709 /* 710 * open log and enable libmeta logging. Do it here explicitly 711 * rather than letting md_init() do it because we are not really 712 * a daemon and that is what md_init() opens the log as. 713 */ 714 openlog("metaclust", LOG_CONS, LOG_USER); 715 716 version_table_size = sizeof (version_table) / sizeof (version_table[0]); 717 718 optind = 1; 719 opterr = 0; 720 while ((c = getopt(argc, argv, "hd:V:t:?")) != -1) { 721 switch (c) { 722 case 'h': 723 usage(sp, 0); 724 break; 725 726 case 'd': 727 if (sscanf(optarg, "%u", &debug) != 1) { 728 md_eprintf(gettext("Invalid debug level\n")); 729 md_exit(sp, 1); 730 } else if ((debug < 1) || (debug > MAX_DEBUG_LEVEL)) { 731 debug = min(max(debug, 1), MAX_DEBUG_LEVEL); 732 md_eprintf(gettext("Debug level must be " 733 "between 1 and %d inclusive.\n"), 734 MAX_DEBUG_LEVEL); 735 md_eprintf(gettext("Debug level set to %d.\n"), 736 debug); 737 } 738 break; 739 740 case 'V': 741 version = Strdup(optarg); 742 break; 743 744 case 't': 745 if (sscanf(optarg, "%u", &timeout) != 1) { 746 md_eprintf(gettext("Invalid timeout value\n")); 747 md_exit(sp, 1); 748 } 749 break; 750 751 case '?': 752 if (optopt == '?') { 753 usage(sp, 0); 754 } else if (optopt == 'V') { 755 int i; 756 757 (void) fprintf(stdout, gettext( 758 "%s: Versions Supported:"), myname); 759 for (i = 0; i < version_table_size; i++) { 760 (void) fprintf(stdout, " %s", 761 version_table[i]); 762 } 763 (void) fprintf(stdout, "\n"); 764 
md_exit(sp, 0); 765 } 766 /*FALLTHROUGH*/ 767 768 default: 769 usage(sp, 1); 770 break; 771 } 772 } 773 774 /* initialise the debug level and start time */ 775 setup_mc_log(debug); 776 777 /* 778 * check that the version specified (if any) is supported. 779 */ 780 if (version != NULL) { 781 int i, found = 0; 782 783 for (i = 0; i < version_table_size; i++) { 784 if (strcmp(version, version_table[i]) == 0) { 785 found = 1; 786 break; 787 } 788 } 789 if (!found) { 790 md_eprintf(gettext("Version %s not supported\n"), 791 version); 792 md_exit(sp, 1); 793 } 794 } 795 796 argc -= optind; 797 argv += optind; 798 799 /* parse arguments */ 800 if (argc <= 0) { 801 usage(sp, 1); 802 } 803 804 /* convert the step name to the corresponding number */ 805 step_ptr = bsearch(argv[0], step_table, (sizeof (step_table) / 806 sizeof (step_table[0])), sizeof (step_table[0]), mc_compare); 807 if (step_ptr != NULL) { 808 stepnum = step_ptr->step_num; 809 } 810 811 --argc; 812 ++argv; 813 814 /* set timeout alarm signal, a value of 0 will disable timeout */ 815 if (timeout > 0) { 816 int stat_loc = 0; 817 commd_timeout = (long)(timeout * .75); 818 819 c_pid = fork(); 820 821 if (c_pid == (pid_t)-1) { 822 md_perror(gettext("Unable to fork")); 823 md_exit(sp, 1); 824 } else if (c_pid) { 825 /* parent */ 826 nsa.sa_flags = 0; 827 if (sigfillset(&nsa.sa_mask) < 0) { 828 md_perror(gettext("Unable to set signal mask")); 829 md_exit(sp, 1); 830 } 831 832 nsa.sa_handler = sigalarmhandler; 833 if (sigaction(SIGALRM, &nsa, &osa) == -1) { 834 md_perror(gettext("Unable to set alarm " 835 "handler")); 836 md_exit(sp, 1); 837 } 838 839 (void) alarm(timeout); 840 841 /* 842 * wait for child to exit or timeout to expire. 
843 * keep retrying if the call is interrupted 844 */ 845 while ((ret_val = waitpid(c_pid, &stat_loc, 0)) < 0) { 846 if (errno != EINTR) { 847 break; 848 } 849 } 850 if (ret_val == c_pid) { 851 /* exit with the childs exit value */ 852 exit(WEXITSTATUS(stat_loc)); 853 } else if (errno == ECHILD) { 854 md_exit(sp, 0); 855 } else { 856 perror(myname); 857 md_exit(sp, 1); 858 } 859 } 860 } 861 862 /* 863 * If a timeout value is given, everything from this point onwards is 864 * executed in the child process. 865 */ 866 867 switch (stepnum) { 868 case MC_START: 869 /* 870 * Start Step 871 * 872 * - Suspend all rpc.mdcommd messages 873 */ 874 875 /* expect the local node id to be given only */ 876 if (argc != 1) 877 usage(sp, 1); 878 879 meta_mc_log(MC_LOG2, gettext("Starting Start step: %s"), 880 meta_print_hrtime(0)); 881 882 /* 883 * With multinode disksets configured we need to 884 * update all replicas on all cluster nodes to have 885 * the same status. If local replicas on a cluster 886 * node are not accessible we need to panic this 887 * node, otherwise we abort in the reconfig cycle 888 * and failfast/reboot the "good" cluster node too. 889 * To avoid a total cluster outage in the above case 890 * we panic only the failing node via md_exit(.., 1). 
891 */ 892 if ((local_sp = load_local_set(ep)) == NULL) { 893 /* panic the node */ 894 md_exit(local_sp, 1); 895 } 896 897 if ((max_sets = get_max_sets(ep)) == 0) { 898 mde_perror(ep, ""); 899 md_exit(sp, 1); 900 } 901 902 /* start walking through all possible disksets */ 903 for (setno = 1; setno < max_sets; setno++) { 904 if ((sp = metasetnosetname(setno, ep)) == NULL) { 905 if (mdiserror(ep, MDE_NO_SET)) { 906 /* No set for this setno - continue */ 907 mdclrerror(ep); 908 continue; 909 } else { 910 mde_perror(ep, gettext("Unable to " 911 "get set %d information"), setno); 912 md_exit(sp, 1); 913 } 914 } 915 916 /* only check multi-node disksets */ 917 if (!meta_is_mn_set(sp, ep)) { 918 mdclrerror(ep); 919 continue; 920 } 921 922 meta_mc_log(MC_LOG3, gettext("Start - block parse " 923 "messages for set %s: %s"), sp->setname, 924 meta_print_hrtime(gethrtime() - start_time)); 925 926 /* 927 * Mddb parse messages are sent amongst the nodes 928 * in a diskset whenever the locator block or 929 * locator names structure has been changed. 930 * A locator block change could occur as a result 931 * of a disk failure during the reconfig cycle, 932 * so block the mddb parse messages while the 933 * rpc.mdcommd is suspended during the reconfig cycle. 
934 */ 935 if (s_ownset(sp->setno, ep) == MD_SETOWNER_YES) { 936 (void) memset(&mbp, 0, sizeof (mbp)); 937 mbp.c_setno = setno; 938 mbp.c_blk_flags = MDDB_BLOCK_PARSE; 939 if (metaioctl(MD_MN_MDDB_BLOCK, &mbp, 940 &mbp.c_mde, NULL)) { 941 (void) mdstealerror(ep, &mbp.c_mde); 942 mde_perror(ep, gettext("Could not " 943 "block set %s"), sp->setname); 944 md_exit(sp, 1); 945 } 946 } 947 948 /* suspend commd and spin waiting for drain */ 949 while ((ret_val = mdmn_suspend(setno, 950 MD_COMM_ALL_CLASSES, commd_timeout)) == 951 MDE_DS_COMMDCTL_SUSPEND_NYD) { 952 (void) sleep(1); 953 } 954 955 if (ret_val) { 956 md_eprintf(gettext("Could not suspend " 957 "rpc.mdcommd for set %s\n"), sp->setname); 958 md_exit(sp, 1); 959 } 960 961 /* 962 * Set start step flag for set. This is set to indicate 963 * that this node entered the reconfig cycle through 964 * the start step. This is used during the reconfig 965 * cycle to determine whether the node had entered 966 * through the start step or the return step. 967 */ 968 (void) memset(&sf, 0, sizeof (sf)); 969 sf.sf_setno = sp->setno; 970 sf.sf_setflags = MD_SET_MN_START_RC; 971 sf.sf_flags = MDDB_NM_SET; 972 /* Use magic to help protect ioctl against attack. */ 973 sf.sf_magic = MDDB_SETFLAGS_MAGIC; 974 if (metaioctl(MD_MN_SET_SETFLAGS, &sf, 975 &sf.sf_mde, NULL)) { 976 (void) mdstealerror(ep, &sf.sf_mde); 977 mde_perror(ep, gettext("Could not set " 978 "start_step flag for set %s"), sp->setname); 979 md_exit(sp, 1); 980 } 981 982 } 983 984 meta_mc_log(MC_LOG2, gettext("Start step completed: %s"), 985 meta_print_hrtime(gethrtime() - start_time)); 986 987 break; 988 989 case MC_STOP: 990 /* 991 * Stop Step 992 * 993 * - ??? 
994 */ 995 996 /* don't expect any more arguments to follow the step name */ 997 if (argc != 0) 998 usage(sp, 1); 999 1000 break; 1001 1002 case MC_ABORT: 1003 /* 1004 * Abort Step 1005 * 1006 * - Abort rpc.mdcommd 1007 */ 1008 1009 /* don't expect any more arguments to follow the step name */ 1010 if (argc != 0) 1011 usage(sp, 1); 1012 1013 meta_mc_log(MC_LOG2, gettext("Starting Abort step: %s"), 1014 meta_print_hrtime(0)); 1015 1016 /* 1017 * Does local set exist? If not, exit with 0 1018 * since there's no reason to have this node panic if 1019 * the local set cannot be started. 1020 */ 1021 if ((local_sp = load_local_set(ep)) == NULL) { 1022 md_exit(local_sp, 0); 1023 } 1024 1025 /* 1026 * abort the rpc.mdcommd. The abort is only issued on this node 1027 * meaning that the abort reconfig step is called on this 1028 * node before a panic while the rest of the cluster will 1029 * undergo a reconfig cycle. 1030 * There is no time relation between this node running a 1031 * reconfig abort and the rest of the cluster 1032 * running a reconfig cycle meaning that this node may 1033 * panic before, during or after the cluster has run 1034 * a reconfig cycle. 1035 */ 1036 mdmn_abort(); 1037 1038 meta_mc_log(MC_LOG2, gettext("Abort step completed: %s"), 1039 meta_print_hrtime(gethrtime() - start_time)); 1040 1041 break; 1042 1043 case MC_RETURN: 1044 /* 1045 * Return Step 1046 * 1047 * - Grab local set lock, issue rpc.mdcommd DRAIN ALL 1048 * and release local set lock. Grabbing the local set 1049 * lock allows any active metaset/metadb commands to 1050 * terminate gracefully and will keep a metaset/metadb 1051 * command from starting until the DRAIN ALL is issued. 
1052 * The metaset/metadb commands can issue 1053 * DRAIN ALL/RESUME ALL commands to rpc.mdcommd, 1054 * so the return step must not issue the DRAIN ALL command 1055 * until metaset/metadb have finished or metaset may issue 1056 * a RESUME ALL after this return reconfig step has issued 1057 * the DRAIN ALL command. 1058 * After this reconfig step has issued the DRAIN_ALL and 1059 * released the local set lock, metaset/metadb will fail 1060 * when attempting to contact the rpc.mdcommd and will 1061 * terminate without making any configuration changes. 1062 * The DRAIN ALL command will keep all other meta* commands 1063 * from running during the reconfig cycle (these commands 1064 * will wait until the rpc.mdcommd is resumed) since the 1065 * reconfig cycle may be changing the diskset configuration. 1066 */ 1067 1068 /* expect the nodelist to follow the step name */ 1069 if (argc < 1) 1070 usage(sp, 1); 1071 1072 meta_mc_log(MC_LOG2, gettext("Starting Return step: %s"), 1073 meta_print_hrtime(0)); 1074 1075 /* 1076 * Does local set exist? If not, exit with 0 1077 * since there's no reason to have this node panic if 1078 * the local set cannot be started. 1079 */ 1080 if ((local_sp = load_local_set(ep)) == NULL) { 1081 md_exit(local_sp, 0); 1082 } 1083 1084 /* 1085 * Suspend any mirror resyncs that are in progress. This 1086 * stops unnecessary timeouts. 1087 */ 1088 meta_mirror_resync_block_all(); 1089 1090 if (meta_lock(local_sp, TRUE, ep) != 0) { 1091 mde_perror(ep, ""); 1092 md_exit(local_sp, 1); 1093 } 1094 1095 /* 1096 * All metaset and metadb commands on this node have now 1097 * terminated gracefully. Now, issue a drain all to 1098 * the rpc.mdcommd. Any meta command issued after the 1099 * drain all will either spin sending the command to the 1100 * master until after the reconfig cycle has finished OR 1101 * will terminate gracefully (metaset/metadb). 
1102 */ 1103 if ((max_sets = get_max_sets(ep)) == 0) { 1104 mde_perror(ep, ""); 1105 md_exit(sp, 1); 1106 } 1107 1108 /* start walking through all possible disksets */ 1109 for (setno = 1; setno < max_sets; setno++) { 1110 if ((sp = metasetnosetname(setno, ep)) == NULL) { 1111 if (mdiserror(ep, MDE_NO_SET)) { 1112 /* No set for this setno - continue */ 1113 mdclrerror(ep); 1114 continue; 1115 } else { 1116 mde_perror(ep, gettext("Unable to " 1117 "get set %d information"), setno); 1118 md_exit(sp, 1); 1119 } 1120 } 1121 1122 /* only check multi-node disksets */ 1123 if (!meta_is_mn_set(sp, ep)) { 1124 mdclrerror(ep); 1125 continue; 1126 } 1127 1128 meta_mc_log(MC_LOG3, gettext("Return - block parse " 1129 "messages for set %s: %s"), sp->setname, 1130 meta_print_hrtime(gethrtime() - start_time)); 1131 1132 /* 1133 * Mddb parse messages are sent amongst the nodes 1134 * in a diskset whenever the locator block or 1135 * locator names structure has been changed. 1136 * A locator block change could occur as a result 1137 * of a disk failure during the reconfig cycle, 1138 * so block the mddb parse messages while the 1139 * rpc.commd is suspended during the reconfig cycle. 
1140 */ 1141 if (s_ownset(sp->setno, ep) == MD_SETOWNER_YES) { 1142 (void) memset(&mbp, 0, sizeof (mbp)); 1143 mbp.c_setno = setno; 1144 mbp.c_blk_flags = MDDB_BLOCK_PARSE; 1145 if (metaioctl(MD_MN_MDDB_BLOCK, &mbp, 1146 &mbp.c_mde, NULL)) { 1147 (void) mdstealerror(ep, &mbp.c_mde); 1148 mde_perror(ep, gettext("Could not " 1149 "block set %s"), sp->setname); 1150 md_exit(sp, 1); 1151 } 1152 } 1153 1154 /* suspend commd and spin waiting for drain */ 1155 while ((ret_val = mdmn_suspend(setno, 1156 MD_COMM_ALL_CLASSES, commd_timeout)) == 1157 MDE_DS_COMMDCTL_SUSPEND_NYD) { 1158 (void) sleep(1); 1159 } 1160 1161 if (ret_val) { 1162 md_eprintf(gettext("Could not suspend " 1163 "rpc.mdcommd for set %s\n"), sp->setname); 1164 md_exit(sp, 1); 1165 } 1166 } 1167 /* 1168 * Resume all I/Os for this node for all MN sets in 1169 * case master node had suspended I/Os but panic'd 1170 * before resuming I/Os. In case of failure, exit 1171 * with a 1 since unable to resume I/Os on this node. 1172 */ 1173 if (clnt_mn_susp_res_io(mynode(), 0, MN_RES_IO, ep)) { 1174 mde_perror(ep, gettext( 1175 "Unable to resume I/O on node %s for all sets"), 1176 mynode()); 1177 md_exit(sp, 1); 1178 } 1179 1180 1181 /* 1182 * Can now unlock local set lock. New metaset/metadb 1183 * commands are now held off using drain all. 1184 */ 1185 (void) meta_unlock(local_sp, ep); 1186 1187 meta_mc_log(MC_LOG2, gettext("Return step completed: %s"), 1188 meta_print_hrtime(gethrtime() - start_time)); 1189 1190 break; 1191 1192 case MC_STEP1: 1193 /* 1194 * Step 1 1195 * 1196 * - Populate nodelist file if we are on clustering 1197 * and pick a master node for each MN diskset. 
1198 */ 1199 1200 /* expect the nodelist to follow the step name */ 1201 if (argc < 1) 1202 usage(sp, 1); 1203 1204 meta_mc_log(MC_LOG2, gettext("Starting Step1: %s"), 1205 meta_print_hrtime(0)); 1206 1207 /* Always write nodelist file even if no local set exists */ 1208 if (clust == SDSSC_OKAY) { 1209 /* skip to the nodelist args */ 1210 if (meta_write_nodelist(argc, argv, ep) != 0) { 1211 mde_perror(ep, gettext( 1212 "Could not populate nodelist file")); 1213 md_exit(sp, 1); 1214 } 1215 } 1216 1217 /* 1218 * Does local set exist? If not, exit with 0 1219 * since there's no reason to have this node panic if 1220 * the local set cannot be started. 1221 */ 1222 if ((local_sp = load_local_set(ep)) == NULL) { 1223 md_exit(local_sp, 0); 1224 } 1225 1226 /* 1227 * At this point, all meta* commands are blocked across 1228 * all disksets since the master rpc.mdcommd has drained or 1229 * the master node has died. 1230 * If a metaset or metadb command had been in progress 1231 * at the start of the reconfig cycle, this command has 1232 * either completed or it has been terminated due to 1233 * the death of the master node. 1234 * 1235 * This means that that it is now ok to remove any 1236 * outstanding clnt_locks associated with multinode 1237 * disksets on this node due to a node panic during 1238 * a metaset operation. This allows the routines that 1239 * choose the master to use rpc.metad to determine the 1240 * master of the diskset. 1241 */ 1242 if (clnt_clr_mnsetlock(mynode(), ep) != 0) { 1243 meta_mc_log(MC_LOG2, gettext("Step1 aborted:" 1244 "clear locks failed %s"), 1245 meta_print_hrtime(gethrtime() - start_time)); 1246 md_exit(local_sp, 1); 1247 } 1248 1249 /* 1250 * Call reconfig_choose_master to choose a master for 1251 * each MN diskset, update the nodelist for each diskset 1252 * given the member information and send a reinit message 1253 * to rpc.mdcommd to reload the nodelist. 
1254 */ 1255 rval = meta_reconfig_choose_master(commd_timeout, ep); 1256 if (rval == 205) { 1257 /* 1258 * NOTE: Should issue call to reboot remote host that 1259 * is causing the RPC failure. Clustering to 1260 * provide interface in the future. This should 1261 * stop a never-ending set of 205 reconfig cycles. 1262 * Remote host causing failure is stored in 1263 * ep->host if ep is an RPC error. 1264 * if (mdanyrpcerror(ep)) 1265 * reboot (ep->host); 1266 */ 1267 meta_mc_log(MC_LOG2, gettext("Step1 aborted:" 1268 "choose master failure of 205 %s"), 1269 meta_print_hrtime(gethrtime() - start_time)); 1270 md_exit(local_sp, 205); 1271 } else if (rval != 0) { 1272 meta_mc_log(MC_LOG2, gettext("Step1 failure: " 1273 "choose master failure %s"), 1274 meta_print_hrtime(gethrtime() - start_time)); 1275 md_exit(local_sp, 1); 1276 } 1277 1278 meta_mc_log(MC_LOG2, gettext("Step1 completed: %s"), 1279 meta_print_hrtime(gethrtime() - start_time)); 1280 1281 md_exit(local_sp, rval); 1282 break; 1283 1284 case MC_STEP2: 1285 /* 1286 * Step 2 1287 * 1288 * In Step 2, each node walks the list of disksets. If a 1289 * node is a master of a MN diskset, it synchronizes 1290 * the local set USER records for that diskset. 1291 * 1292 * If disks exist in the diskset and there is a joined 1293 * (owner) node in the diskset, the master will also: 1294 * - synchronize the diskset mddbs to the master 1295 * - play the change log 1296 * 1297 * The master node will now attempt to join any unjoined 1298 * nodes that are currently members in the membership list. 1299 */ 1300 1301 /* expect the nodelist to follow the step name */ 1302 if (argc < 1) 1303 usage(sp, 1); 1304 1305 meta_mc_log(MC_LOG2, gettext("Starting Step2: %s"), 1306 meta_print_hrtime(0)); 1307 1308 /* 1309 * Does local set exist? If not, exit with 0 1310 * since there's no reason to have this node panic if 1311 * the local set cannot be started. 
1312 */ 1313 if ((local_sp = load_local_set(ep)) == NULL) { 1314 md_exit(local_sp, 0); 1315 } 1316 1317 if ((max_sets = get_max_sets(ep)) == 0) { 1318 mde_perror(ep, ""); 1319 md_exit(local_sp, 1); 1320 } 1321 1322 /* start walking through all possible disksets */ 1323 for (setno = 1; setno < max_sets; setno++) { 1324 if ((sp = metasetnosetname(setno, ep)) == NULL) { 1325 if (mdiserror(ep, MDE_NO_SET)) { 1326 /* No set for this setno - continue */ 1327 mdclrerror(ep); 1328 continue; 1329 } else if (mdanyrpcerror(ep)) { 1330 /* Fail on RPC failure to self */ 1331 mde_perror(ep, gettext( 1332 "Unable to get information for " 1333 "set number %d"), setno); 1334 md_exit(local_sp, 1); 1335 } else { 1336 mde_perror(ep, gettext( 1337 "Unable to get information for " 1338 "set number %d"), setno); 1339 mdclrerror(ep); 1340 continue; 1341 } 1342 } 1343 1344 if ((sd = metaget_setdesc(sp, ep)) == NULL) { 1345 if (mdanyrpcerror(ep)) { 1346 /* Fail on RPC failure to self */ 1347 mde_perror(ep, gettext( 1348 "Unable to get information for " 1349 "set number %d"), setno); 1350 md_exit(local_sp, 1); 1351 } 1352 mde_perror(ep, gettext("Unable to get set " 1353 "%s desc information"), sp->setname); 1354 mdclrerror(ep); 1355 continue; 1356 } 1357 1358 /* Only check MN disksets */ 1359 if (!(MD_MNSET_DESC(sd))) { 1360 continue; 1361 } 1362 1363 /* All actions in step 2 are driven by master */ 1364 if (!(sd->sd_mn_am_i_master)) { 1365 continue; 1366 } 1367 1368 meta_mc_log(MC_LOG3, gettext("Step2 - begin record " 1369 "synchronization for set %s: %s"), sp->setname, 1370 meta_print_hrtime(gethrtime() - start_time)); 1371 1372 /* 1373 * Synchronize the USER records in the local mddbs 1374 * for hosts that are members. The USER records 1375 * contain set, drive and host information. 
1376 */ 1377 rval = meta_mnsync_user_records(sp, ep); 1378 if (rval != 0) { 1379 mde_perror(ep, gettext( 1380 "Synchronization of user records " 1381 "in set %s failed\n"), sp->setname); 1382 if (rval == 205) { 1383 /* 1384 * NOTE: Should issue call to reboot 1385 * remote host that is causing the RPC 1386 * failure. Clustering to provide 1387 * interface in the future. This 1388 * should stop a never-ending set of 1389 * 205 reconfig cycles. 1390 * Remote host causing failure is 1391 * stored in ep->host if ep is an 1392 * RPC error. 1393 * if (mdanyrpcerror(ep)) 1394 * reboot (ep->host); 1395 */ 1396 md_exit(local_sp, 205); 1397 } else { 1398 md_exit(local_sp, 1); 1399 } 1400 } 1401 1402 /* Reget sd since sync_user_recs may have flushed it */ 1403 if ((sd = metaget_setdesc(sp, ep)) == NULL) { 1404 mde_perror(ep, gettext("Unable to get set " 1405 "%s desc information"), sp->setname); 1406 md_exit(local_sp, 1); 1407 } 1408 1409 dd = metaget_drivedesc(sp, 1410 (MD_BASICNAME_OK | PRINT_FAST), ep); 1411 if (! mdisok(ep)) { 1412 mde_perror(ep, gettext("Unable to get set " 1413 "%s drive information"), sp->setname); 1414 md_exit(local_sp, 1); 1415 } 1416 1417 /* 1418 * No drives in set, continue to next set. 1419 */ 1420 if (dd == NULL) { 1421 /* Done with this set */ 1422 continue; 1423 } 1424 1425 meta_mc_log(MC_LOG3, gettext("Step2 - local set user " 1426 "records completed for set %s: %s"), sp->setname, 1427 meta_print_hrtime(gethrtime() - start_time)); 1428 1429 /* 1430 * Synchronize the diskset mddbs for hosts 1431 * that are members. This may involve 1432 * playing the changelog and writing out 1433 * to the diskset mddbs. 
1434 */ 1435 rval = meta_mnsync_diskset_mddbs(sp, ep); 1436 if (rval != 0) { 1437 mde_perror(ep, gettext( 1438 "Synchronization of diskset mddbs " 1439 "in set %s failed\n"), sp->setname); 1440 meta_mc_log(MC_LOG3, gettext("Step2 - diskset " 1441 "mddb synchronization failed for " 1442 "set %s: %s"), sp->setname, 1443 meta_print_hrtime(gethrtime() - 1444 start_time)); 1445 if (rval == 205) { 1446 /* 1447 * NOTE: Should issue call to reboot 1448 * remote host that is causing the RPC 1449 * failure. Clustering to provide 1450 * interface in the future. This 1451 * should stop a never-ending set of 1452 * 205 reconfig cycles. 1453 * Remote host causing failure is 1454 * stored in ep->host if ep is an 1455 * RPC error. 1456 * if (mdanyrpcerror(ep)) 1457 * reboot (ep->host); 1458 */ 1459 md_exit(local_sp, 205); 1460 } else if (rval == 1) { 1461 continue; 1462 } else { 1463 md_exit(local_sp, 1); 1464 } 1465 } 1466 1467 meta_mc_log(MC_LOG3, gettext("Step2 - diskset mddb " 1468 "synchronization completed for set %s: %s"), 1469 sp->setname, 1470 meta_print_hrtime(gethrtime() - start_time)); 1471 1472 /* Join the starting nodes to the diskset */ 1473 rval = meta_mnjoin_all(sp, ep); 1474 if (rval != 0) { 1475 mde_perror(ep, gettext( 1476 "Join of non-owner (starting) nodes " 1477 "in set %s failed\n"), sp->setname); 1478 meta_mc_log(MC_LOG3, gettext("Step2 - non owner" 1479 "nodes joined for set %s: %s"), 1480 sp->setname, 1481 meta_print_hrtime(gethrtime() - 1482 start_time)); 1483 if (rval == 205) { 1484 /* 1485 * NOTE: Should issue call to reboot 1486 * remote host that is causing the RPC 1487 * failure. Clustering to provide 1488 * interface in the future. This 1489 * should stop a never-ending set of 1490 * 205 reconfig cycles. 1491 * Remote host causing failure is 1492 * stored in ep->host if ep is an 1493 * RPC error. 
1494 * if (mdanyrpcerror(ep)) 1495 * reboot (ep->host); 1496 */ 1497 md_exit(local_sp, 205); 1498 } else { 1499 md_exit(local_sp, 1); 1500 } 1501 } 1502 1503 meta_mc_log(MC_LOG3, gettext("Step2 - non owner nodes " 1504 "joined for set %s: %s"), sp->setname, 1505 meta_print_hrtime(gethrtime() - start_time)); 1506 1507 } 1508 1509 meta_mc_log(MC_LOG2, gettext("Step2 completed: %s"), 1510 meta_print_hrtime(gethrtime() - start_time)); 1511 1512 break; 1513 1514 case MC_STEP3: 1515 /* 1516 * Step 3 1517 * 1518 * For all multinode sets do, 1519 * - Reinitialise rpc.mdcommd 1520 * - Reset mirror owners to null if the current owner is 1521 * no longer in the membership list 1522 */ 1523 1524 /* expect the nodelist to follow the step name */ 1525 if (argc < 1) 1526 usage(sp, 1); 1527 1528 meta_mc_log(MC_LOG2, gettext("Starting Step3: %s"), 1529 meta_print_hrtime(0)); 1530 1531 /* 1532 * Does local set exist? If not, exit with 0 1533 * since there's no reason to have this node panic if 1534 * the local set cannot be started. 
1535 */ 1536 if ((local_sp = load_local_set(ep)) == NULL) { 1537 md_exit(local_sp, 0); 1538 } 1539 1540 /* 1541 * walk through all sets on this node which could include: 1542 * - MN disksets 1543 * - traditional disksets 1544 * - non-existent disksets 1545 * start mirror resync for all MN sets 1546 */ 1547 if ((max_sets = get_max_sets(ep)) == 0) { 1548 mde_perror(ep, ""); 1549 md_exit(local_sp, 1); 1550 } 1551 1552 /* start walking through all possible disksets */ 1553 for (setno = 1; setno < max_sets; setno++) { 1554 if ((sp = metasetnosetname(setno, ep)) == NULL) { 1555 if (mdiserror(ep, MDE_NO_SET)) { 1556 /* No set for this setno - continue */ 1557 mdclrerror(ep); 1558 continue; 1559 } else { 1560 mde_perror(ep, gettext("Unable to " 1561 "get set %d information"), setno); 1562 md_exit(local_sp, 1); 1563 } 1564 } 1565 1566 /* only check multi-node disksets */ 1567 if (!meta_is_mn_set(sp, ep)) { 1568 mdclrerror(ep); 1569 continue; 1570 } 1571 1572 if (meta_lock(sp, TRUE, ep) != 0) { 1573 mde_perror(ep, ""); 1574 md_exit(local_sp, 1); 1575 } 1576 1577 /* If this node isn't joined to set, do nothing */ 1578 if (s_ownset(sp->setno, ep) != MD_SETOWNER_YES) { 1579 if (!mdisok(ep)) { 1580 mde_perror(ep, gettext("Could " 1581 "not get set %s ownership"), 1582 sp->setname); 1583 md_exit(sp, 1); 1584 } 1585 mdclrerror(ep); 1586 (void) meta_unlock(sp, ep); 1587 continue; 1588 } 1589 1590 meta_mc_log(MC_LOG3, gettext("Step3 - begin " 1591 "re-initialising rpc.mdcommd and resetting mirror " 1592 "owners for set %s: %s"), sp->setname, 1593 meta_print_hrtime(gethrtime() - start_time)); 1594 1595 /* reinitialzse rpc.mdcommd with new nodelist */ 1596 if (mdmn_reinit_set(setno, commd_timeout)) { 1597 md_eprintf(gettext( 1598 "Could not re-initialise rpc.mdcommd for " 1599 "set %s\n"), sp->setname); 1600 md_exit(sp, 1); 1601 } 1602 1603 (void) memset(&cfg, 0, sizeof (cfg)); 1604 cfg.c_id = 0; 1605 cfg.c_setno = sp->setno; 1606 if (metaioctl(MD_DB_GETDEV, &cfg, &cfg.c_mde, 1607 
NULL) != 0) { 1608 (void) mdstealerror(ep, &cfg.c_mde); 1609 mde_perror(ep, gettext("Could " 1610 "not get set %s information"), 1611 sp->setname); 1612 md_exit(sp, 1); 1613 } 1614 1615 /* Don't do anything else if set is stale */ 1616 if (cfg.c_flags & MDDB_C_STALE) { 1617 (void) meta_unlock(sp, ep); 1618 mdclrerror(ep); 1619 continue; 1620 } 1621 1622 /* reset mirror owners */ 1623 if (reset_state(RESET_OWNER, sp, MD_MIRROR, ep) == -1) { 1624 md_exit(sp, 1); 1625 } 1626 1627 (void) meta_unlock(sp, ep); 1628 1629 meta_mc_log(MC_LOG3, gettext("Step3 - rpc.mdcommd " 1630 "re-initialised and mirror owners reset for " 1631 "set %s: %s"), sp->setname, 1632 meta_print_hrtime(gethrtime() - start_time)); 1633 } 1634 1635 meta_mc_log(MC_LOG2, gettext("Step3 completed: %s"), 1636 meta_print_hrtime(gethrtime() - start_time)); 1637 1638 break; 1639 1640 case MC_STEP4: 1641 /* 1642 * Step 4 1643 * 1644 * For all multinode sets do: 1645 * - Resume the rpc.mdcommd messages. Must resume all 1646 * sets before issuing I/O to any set since an error 1647 * encountered in a commd suspended set could be 1648 * blocked waiting for commd in another set to resume. 1649 * (This happens since the daemon queues service 1650 * all sets). An open of a soft partition causes 1651 * a read of the watermarks during the open. 1652 * - If set is non-writable (not an owner or STALE), then 1653 * continue to next set. 1654 * 1655 * For all multinode sets do, 1656 * - Reset ABR states for all mirrors, ie clear ABR if not 1657 * open on any node. 1658 * - Reset ABR states for all soft partitions, ie clear ABR if 1659 * not open on any node. 1660 * - For all slave nodes that have entered through the start 1661 * step, update the ABR state to that of the master and 1662 * get the submirror state from the master 1663 * - meta_lock set 1664 * - Resync all mirrors 1665 * - unlock meta_lock for this set. 1666 * - Choose a new owner for any orphaned resyncs 1667 * 1668 * There is one potential issue here. 
when concurrently 1669 * resetting and updating the ABR state. If the master has ABR 1670 * set, but should no longer have because the only node that 1671 * had the metadevice open and had ABR set has paniced, the 1672 * master will send a message to all nodes to clear the ABR 1673 * state. Meanwhile any node that has come through the 1674 * start step will get tstate from the master and will update 1675 * ABR if it was set in tstate. So, we appear to have a problem 1676 * if the following sequence occurs:- 1677 * - The slave gets tstate with ABR set 1678 * - The master sends a message to clear ABR 1679 * - The slave updates ABR with the value it got from tstate. 1680 * We now have the master with ABR clear and the slave with ABR 1681 * set. Fortunately, having set ABR, the slave will close the 1682 * metadevice after setting ABR and as there are no nodes with 1683 * the device open, the close will send a message to clear ABR 1684 * on all nodes. So, the nodes will all have ABR unset. 1685 */ 1686 1687 /* expect the nodelist to follow the step name */ 1688 if (argc < 1) 1689 usage(sp, 1); 1690 1691 meta_mc_log(MC_LOG2, gettext("Starting Step4: %s"), 1692 meta_print_hrtime(0)); 1693 1694 /* 1695 * Does local set exist? If not, exit with 0 1696 * since there's no reason to have this node panic if 1697 * the local set cannot be started. 
1698 */ 1699 if ((local_sp = load_local_set(ep)) == NULL) { 1700 md_exit(local_sp, 0); 1701 } 1702 1703 /* 1704 * walk through all sets on this node which could include: 1705 * - MN disksets 1706 * - traditional disksets 1707 * - non-existent disksets 1708 * start mirror resync for all MN sets 1709 */ 1710 if ((max_sets = get_max_sets(ep)) == 0) { 1711 mde_perror(ep, ""); 1712 md_exit(local_sp, 1); 1713 } 1714 1715 /* Clear set_info structure */ 1716 for (setno = 1; setno < max_sets; setno++) { 1717 set_info[setno] = 0; 1718 } 1719 1720 /* start walking through all possible disksets */ 1721 for (setno = 1; setno < max_sets; setno++) { 1722 if ((sp = metasetnosetname(setno, ep)) == NULL) { 1723 if (mdiserror(ep, MDE_NO_SET)) { 1724 /* No set for this setno - continue */ 1725 mdclrerror(ep); 1726 continue; 1727 } else { 1728 mde_perror(ep, gettext("Unable to " 1729 "get set %d information"), setno); 1730 md_exit(local_sp, 1); 1731 } 1732 } 1733 1734 if ((sd = metaget_setdesc(sp, ep)) == NULL) { 1735 mde_perror(ep, gettext("Unable to get set " 1736 "%s desc information"), sp->setname); 1737 mdclrerror(ep); 1738 continue; 1739 } 1740 1741 /* only check multi-node disksets */ 1742 if (!meta_is_mn_set(sp, ep)) { 1743 mdclrerror(ep); 1744 continue; 1745 } 1746 1747 set_info[setno] |= SET_INFO_MN; 1748 1749 /* 1750 * If not an owner (all mddbs failed) or stale 1751 * (< 50% mddbs operational), then set is 1752 * non-writable so just resume commd and 1753 * unblock mddb messages. 1754 */ 1755 mdclrerror(ep); 1756 if (s_ownset(sp->setno, ep) != MD_SETOWNER_YES) { 1757 set_info[setno] |= SET_INFO_NO_WR; 1758 } 1759 if (!mdisok(ep)) { 1760 mde_perror(ep, gettext("Could " 1761 "not get set %s ownership"), 1762 sp->setname); 1763 md_exit(local_sp, 1); 1764 } 1765 /* Set is owned - is it stale? 
*/ 1766 if (!set_info[setno] & SET_INFO_NO_WR) { 1767 (void) memset(&cfg, 0, sizeof (cfg)); 1768 cfg.c_id = 0; 1769 cfg.c_setno = sp->setno; 1770 if (metaioctl(MD_DB_GETDEV, &cfg, &cfg.c_mde, 1771 NULL) != 0) { 1772 (void) mdstealerror(ep, &cfg.c_mde); 1773 mde_perror(ep, gettext("Could " 1774 "not get set %s information"), 1775 sp->setname); 1776 md_exit(local_sp, 1); 1777 } 1778 if (cfg.c_flags & MDDB_C_STALE) { 1779 set_info[setno] |= SET_INFO_NO_WR; 1780 } 1781 } 1782 1783 /* resume rpc.mdcommd */ 1784 if (mdmn_resume(setno, MD_COMM_ALL_CLASSES, 0, 1785 commd_timeout)) { 1786 md_eprintf(gettext("Unable to resume " 1787 "rpc.mdcommd for set %s\n"), sp->setname); 1788 md_exit(local_sp, 1); 1789 } 1790 1791 /* Unblock mddb parse messages */ 1792 if (s_ownset(sp->setno, ep) == MD_SETOWNER_YES) { 1793 (void) memset(&mbp, 0, sizeof (mbp)); 1794 mbp.c_setno = setno; 1795 mbp.c_blk_flags = MDDB_UNBLOCK_PARSE; 1796 if (metaioctl(MD_MN_MDDB_BLOCK, &mbp, 1797 &mbp.c_mde, NULL)) { 1798 (void) mdstealerror(ep, &mbp.c_mde); 1799 mde_perror(ep, gettext("Could not " 1800 "unblock set %s"), sp->setname); 1801 md_exit(local_sp, 1); 1802 } 1803 } 1804 meta_mc_log(MC_LOG3, gettext("Step4 - rpc.mdcommd " 1805 "resumed and messages unblocked for set %s: %s"), 1806 sp->setname, 1807 meta_print_hrtime(gethrtime() - start_time)); 1808 } 1809 1810 for (setno = 1; setno < max_sets; setno++) { 1811 int start_step; 1812 1813 /* Skip traditional disksets. */ 1814 if ((set_info[setno] & SET_INFO_MN) == 0) 1815 continue; 1816 1817 /* 1818 * If already determined that this set is 1819 * a non-writable set, then just continue 1820 * to next set since there's nothing else 1821 * to do for a non-writable set. 
1822 */ 1823 if (set_info[setno] & SET_INFO_NO_WR) 1824 continue; 1825 1826 if ((sp = metasetnosetname(setno, ep)) == NULL) { 1827 if (mdiserror(ep, MDE_NO_SET)) { 1828 /* No set for this setno - continue */ 1829 mdclrerror(ep); 1830 continue; 1831 } else { 1832 mde_perror(ep, gettext("Unable to " 1833 "get set %d information"), setno); 1834 md_exit(local_sp, 1); 1835 } 1836 } 1837 1838 if ((sd = metaget_setdesc(sp, ep)) == NULL) { 1839 mde_perror(ep, gettext("Unable to get set " 1840 "%s desc information"), sp->setname); 1841 mdclrerror(ep); 1842 continue; 1843 } 1844 1845 /* See if this node came through the start step */ 1846 (void) memset(&sf, 0, sizeof (sf)); 1847 sf.sf_setno = sp->setno; 1848 sf.sf_flags = MDDB_NM_GET; 1849 /* Use magic to help protect ioctl against attack. */ 1850 sf.sf_magic = MDDB_SETFLAGS_MAGIC; 1851 if (metaioctl(MD_MN_GET_SETFLAGS, &sf, 1852 &sf.sf_mde, NULL)) { 1853 (void) mdstealerror(ep, &sf.sf_mde); 1854 mde_perror(ep, gettext("Could not get " 1855 "start_step flag for set %s"), sp->setname); 1856 md_exit(local_sp, 1); 1857 } 1858 start_step = 1859 (sf.sf_setflags & MD_SET_MN_START_RC)? 1: 0; 1860 1861 /* 1862 * We can now reset the start_step flag for the set 1863 * if it was already set. 1864 */ 1865 if (start_step) { 1866 (void) memset(&sf, 0, sizeof (sf)); 1867 sf.sf_setno = sp->setno; 1868 sf.sf_setflags = MD_SET_MN_START_RC; 1869 sf.sf_flags = MDDB_NM_RESET; 1870 /* 1871 * Use magic to help protect ioctl 1872 * against attack. 
1873 */ 1874 sf.sf_magic = MDDB_SETFLAGS_MAGIC; 1875 if (metaioctl(MD_MN_SET_SETFLAGS, &sf, 1876 &sf.sf_mde, NULL)) { 1877 (void) mdstealerror(ep, &sf.sf_mde); 1878 mde_perror(ep, 1879 gettext("Could not reset " 1880 "start_step flag for set %s"), 1881 sp->setname); 1882 } 1883 } 1884 1885 meta_mc_log(MC_LOG3, gettext("Step4 - begin setting " 1886 "ABR state and restarting io's for " 1887 "set %s: %s"), sp->setname, 1888 meta_print_hrtime(gethrtime() - start_time)); 1889 1890 1891 /* 1892 * If we are not the master and we have come through 1893 * the start step, we must update the ABR states 1894 * for mirrors and soft partitions. Also the submirror 1895 * states need to be synchronised so that we see the 1896 * same status as other previously joined members. 1897 * This _must_ be done before starting the resync. 1898 */ 1899 if (!(sd->sd_mn_am_i_master) && start_step) { 1900 if (reset_state(GET_MIRROR_STATE, sp, MD_MIRROR, 1901 ep) == -1) { 1902 md_exit(local_sp, 1); 1903 } 1904 if (reset_state(UPDATE_ABR, sp, MD_SP, 1905 ep) == -1) { 1906 md_exit(local_sp, 1); 1907 } 1908 /* 1909 * Mark the fact that we've got the mirror 1910 * state. This allows the resync thread to 1911 * determine if _it_ needs to issue this. This 1912 * can happen if a node is added to a set after 1913 * a reconfig cycle has completed. 1914 */ 1915 (void) memset(&sf, 0, sizeof (sf)); 1916 sf.sf_setno = sp->setno; 1917 sf.sf_setflags = MD_SET_MN_MIR_STATE_RC; 1918 sf.sf_flags = MDDB_NM_SET; 1919 /* 1920 * Use magic to help protect ioctl 1921 * against attack. 
1922 */ 1923 sf.sf_magic = MDDB_SETFLAGS_MAGIC; 1924 if (metaioctl(MD_MN_SET_SETFLAGS, &sf, 1925 &sf.sf_mde, NULL)) { 1926 (void) mdstealerror(ep, &sf.sf_mde); 1927 mde_perror(ep, 1928 gettext("Could not set " 1929 "submirror state flag for set %s"), 1930 sp->setname); 1931 } 1932 } 1933 1934 /* 1935 * All remaining actions are only performed by the 1936 * master 1937 */ 1938 if (!(sd->sd_mn_am_i_master)) { 1939 if (meta_lock(sp, TRUE, ep) != 0) { 1940 mde_perror(ep, ""); 1941 md_exit(local_sp, 1); 1942 } 1943 meta_mirror_resync_unblock(sp); 1944 (void) meta_unlock(sp, ep); 1945 continue; 1946 } 1947 1948 /* 1949 * If the master came through the start step, this 1950 * implies that all of the nodes must have done the 1951 * same and hence there can be no applications 1952 * running. Hence no need to reset ABR 1953 */ 1954 if (!start_step) { 1955 /* Reset ABR state for mirrors */ 1956 if (reset_state(RESET_ABR, sp, MD_MIRROR, 1957 ep) == -1) { 1958 md_exit(local_sp, 1); 1959 } 1960 /* ...and now the same for soft partitions */ 1961 if (reset_state(RESET_ABR, sp, MD_SP, 1962 ep) == -1) { 1963 md_exit(local_sp, 1); 1964 } 1965 } 1966 1967 /* 1968 * choose owners for orphaned resyncs and reset 1969 * non-orphaned resyncs so that an owner node that 1970 * reboots will restart the resync if needed. 1971 */ 1972 if (reset_state(CHOOSE_OWNER, sp, MD_MIRROR, ep) == -1) 1973 md_exit(local_sp, 1); 1974 1975 /* 1976 * Must unlock set lock before meta_mirror_resync_all 1977 * sends a message to run the metasync command 1978 * which also grabs the meta_lock. 
1979 */ 1980 if (meta_lock(sp, TRUE, ep) != 0) { 1981 mde_perror(ep, ""); 1982 md_exit(local_sp, 1); 1983 } 1984 meta_mirror_resync_unblock(sp); 1985 (void) meta_unlock(sp, ep); 1986 1987 /* resync all mirrors in set */ 1988 if (meta_mirror_resync_all(sp, 0, ep) != 0) { 1989 mde_perror(ep, gettext("Mirror resyncs " 1990 "failed for set %s"), sp->setname); 1991 md_exit(local_sp, 1); 1992 } 1993 1994 meta_mc_log(MC_LOG3, gettext("Step4 - io's restarted " 1995 "for set %s: %s"), sp->setname, 1996 meta_print_hrtime(gethrtime() - start_time)); 1997 } 1998 1999 meta_mc_log(MC_LOG2, gettext("Step4 completed: %s"), 2000 meta_print_hrtime(gethrtime() - start_time)); 2001 2002 break; 2003 2004 default: 2005 usage(sp, 1); 2006 break; 2007 } 2008 2009 md_exit(sp, 0); 2010 /* NOTREACHED */ 2011 return (0); 2012 } 2013