1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #include <meta.h> 28 #include <sdssc.h> 29 #include <signal.h> 30 #include <syslog.h> 31 #include <sys/types.h> 32 #include <sys/wait.h> 33 #include <sys/lvm/md_mirror.h> 34 #include <metad.h> 35 36 #define MY_VERSION "1.0" /* the highest supported version */ 37 #define MAX_DEBUG_LEVEL 5 /* maximum verbosity level */ 38 39 #define RESET_OWNER 0x0001 40 #define CHOOSE_OWNER 0x0002 41 #define RESET_ABR 0x0004 42 #define UPDATE_ABR 0x0008 43 #define GET_MIRROR_STATE 0x0010 44 45 #define SET_INFO_NO_WR 0x0002 46 #define SET_INFO_MN 0x0004 47 48 /* 49 * This table defines all the metaclust reconfig steps we understand 50 */ 51 typedef enum stpnum { 52 MC_UNK = 0, 53 MC_START, 54 MC_STOP, 55 MC_ABORT, 56 MC_RETURN, 57 MC_STEP1, 58 MC_STEP2, 59 MC_STEP3, 60 MC_STEP4 61 } stepnum_t; 62 63 /* 64 * Structure for step_name -> step_number mapping 65 */ 66 struct step_t { 67 char *step_nam; 68 stepnum_t step_num; 69 }; 70 71 /* 72 * Step name to step number mapping table 73 * This table MUST be sorted alphabetically in ascending order of step name 74 */ 75 static struct step_t step_table[] = { 76 { "abort", MC_ABORT }, 77 { "return", MC_RETURN }, 78 { "start", MC_START }, 79 { "step1", MC_STEP1 }, 80 { "step2", MC_STEP2 }, 81 { "step3", MC_STEP3 }, 82 { "step4", MC_STEP4 }, 83 { "stop", MC_STOP } 84 }; 85 86 /* 87 * If support for a different version is added, the new version number should 88 * be appended to the version_table below. This list will be searched to 89 * determine if a version requested via the -V option is supported or not. 90 */ 91 static char *version_table[] = { 92 MY_VERSION 93 }; 94 95 uint_t timeout = 0; /* disable timeout by default */ 96 char *version = MY_VERSION; /* use latest version by default */ 97 int stepnum = MC_UNK; /* reconfiguration step number */ 98 pid_t c_pid; /* child process id */ 99 100 /* 101 * Binary search comparison routine 102 */ 103 static int 104 mc_compare(const void *stp1, const void *stp2) 105 { 106 return (strcmp((const char *)stp1, 107 ((const struct step_t *)stp2)->step_nam)); 108 } 109 110 /* 111 * Timeout expiry alarm signal handler 112 */ 113 /*ARGSUSED*/ 114 static void 115 sigalarmhandler(int sig) 116 { 117 int i, n, ret, stat_loc = 0; 118 FILE *pgcore; 119 char corecmd[256]; 120 121 n = sizeof (step_table) / sizeof (step_table[0]); 122 for (i = 0; i < n; i++) { 123 if (stepnum == step_table[i].step_num) 124 break; 125 } 126 127 assert(i != n); 128 129 meta_mc_log(MC_LOG1, gettext("Timeout expired in %s: %s"), 130 step_table[i].step_nam, 131 meta_print_hrtime(gethrtime() - start_time)); 132 133 /* 134 * See what the child was actually doing when the timeout expired. 135 * A core-dump of this would be _really_ good, so let's just 136 * try a 'gcore -g c_pid' and hope 137 */ 138 139 (void) memset(corecmd, 0, sizeof (corecmd)); 140 (void) snprintf(corecmd, sizeof (corecmd), 141 "/bin/gcore -g %d >/dev/null 2>&1", (int)c_pid); 142 143 pgcore = popen(corecmd, "r"); 144 145 if (pgcore == NULL) { 146 meta_mc_log(MC_LOG1, gettext("Could not grab core for pid %s"), 147 c_pid); 148 } else { 149 (void) pclose(pgcore); 150 } 151 152 if ((ret = kill(c_pid, SIGKILL)) == 0) { 153 /* 154 * The child will wait forever until the status is retrieved 155 * so get it now. Keep retrying if the call is interrupted. 156 * 157 * The possible results are, 158 * 159 * - child killed successfully 160 * - signal sent but child not killed 161 * - waitpid failed/interrupted 162 */ 163 sleep(2); 164 while ((ret = waitpid(c_pid, &stat_loc, WNOHANG)) < 0) { 165 if (errno != EINTR) { 166 break; 167 } 168 } 169 if ((ret == c_pid) || (errno == ECHILD)) { 170 ret = 0; 171 } else { 172 ret = 1; 173 } 174 } else if (errno == ESRCH) { 175 /* 176 * If the kill did not catch the child then it means the child 177 * exited immediately after the timeout occured. 178 */ 179 ret = 0; 180 } 181 182 /* 183 * make sure not to exit with 205 for any steps other than step1-step4. 184 * Suncluster reconfiguration can't handle it otherwise. 185 */ 186 switch (stepnum) { 187 case MC_STEP1: 188 case MC_STEP2: 189 case MC_STEP3: 190 case MC_STEP4: 191 /* 192 * If the child was killed successfully return 205 for a 193 * new reconfig cycle otherwise send 1 to panic the node. 194 */ 195 if (ret != 0) { 196 md_eprintf(gettext("Could not kill child\n")); 197 exit(1); 198 } else { 199 exit(205); 200 } 201 break; 202 case MC_START: 203 case MC_STOP: 204 case MC_ABORT: 205 case MC_RETURN: 206 default: 207 exit(1); 208 break; 209 } 210 } 211 212 /* 213 * Attempt to load local set. 214 * Returns: 215 * pointer to mdsetname_t for local set (local_sp) is successful. 216 * 0 if failure 217 * if there are no local set mddbs, no error message is printed. 218 * Otherwise, error message is printed so that user 219 * can determine why the local set didn't start. 220 */ 221 mdsetname_t * 222 load_local_set(md_error_t *ep) 223 { 224 mdsetname_t *local_sp = NULL; 225 226 /* Does local set exist? If not, give no error */ 227 if ((local_sp = metasetname(MD_LOCAL_NAME, ep)) == NULL) { 228 return (0); 229 } 230 231 /* 232 * snarf local set 233 * If fails with MDE_DB_NODB, then just return 1 printing 234 * no failure. 235 * Otherwise, print error message, and return 1. 236 */ 237 if (meta_setup_db_locations(ep) != 0) { 238 if (!(mdismddberror(ep, MDE_DB_NODB))) 239 mde_perror(ep, ""); 240 return (0); 241 } 242 243 /* local set loaded successfully */ 244 return (local_sp); 245 } 246 247 /* 248 * Purpose: Compose a full path name for a metadevice 249 * 250 * On entry: sp - setname pointer 251 * mnum - minor number of metadevice 252 * pathname - pointer to array to return path string 253 * pathlen - max length of pathname array 254 */ 255 static int 256 compose_path(mdsetname_t *sp, int mnum, char *pathname, int pathlen) 257 { 258 int rtn; 259 mdname_t *np; 260 md_error_t status = mdnullerror; 261 262 if (MD_MIN2SET(mnum) != sp->setno) { 263 md_eprintf(gettext("minor number 0x%x invalid for set %d\n"), 264 mnum, sp->setno); 265 return (-1); 266 } 267 268 if ((np = metamnumname(&sp, mnum, 0, &status)) == NULL) { 269 return (-1); 270 } 271 272 rtn = snprintf(pathname, pathlen, "%s", np->rname); 273 274 if ((pathname[0] == '\0') || (rtn >= pathlen)) { 275 md_eprintf(gettext( 276 "Could not create path for device %s\n"), 277 get_mdname(sp, mnum)); 278 return (-1); 279 } 280 return (0); 281 } 282 283 /* 284 * Purpose: Walk through all the devices specified for the given set 285 * and do the action specified in mode 286 */ 287 static int 288 reset_state(uint_t mode, mdsetname_t *sp, char *drivername, md_error_t *ep) 289 { 290 mdnamelist_t *devnlp = NULL; 291 mdnamelist_t *p; 292 mdname_t *devnp = NULL; 293 md_set_mmown_params_t ownpar_p; 294 md_set_mmown_params_t *ownpar = &ownpar_p; 295 md_unit_t *mm; 296 int mirror_dev = 0; 297 mndiskset_membershiplist_t *nl; 298 int cnt; 299 int has_parent; 300 md_mn_get_mir_state_t mir_state_p; 301 md_mn_get_mir_state_t *mir_state = &mir_state_p; 302 303 /* 304 * if we are choosing or resetting the owners then make sure 305 * we are only doing it for mirror devices 306 */ 307 mirror_dev = (strcmp(MD_MIRROR, drivername) == 0); 308 if ((mode & (RESET_OWNER | CHOOSE_OWNER)) && !mirror_dev) { 309 return (-1); 310 } 311 312 /* get a list of all the metadevices for current set */ 313 if (mirror_dev && meta_get_mirror_names(sp, &devnlp, 0, ep) < 0) { 314 mde_perror(ep, gettext("Could not get mirrors for set %s"), 315 sp->setname); 316 return (-1); 317 } else if (meta_get_sp_names(sp, &devnlp, 0, ep) < 0) { 318 mde_perror(ep, gettext( 319 "Could not get soft partitions for set %s"), sp->setname); 320 return (-1); 321 } 322 323 /* If resetting the owner, get the known membership list */ 324 if (mode & RESET_OWNER) { 325 if (meta_read_nodelist(&cnt, &nl, ep)) { 326 mde_perror(ep, "Could not get nodelist"); 327 return (-1); 328 } 329 } 330 331 /* for each metadevice */ 332 for (p = devnlp; (p != NULL); p = p->next) { 333 devnp = p->namep; 334 335 /* 336 * Get the current setting for mirror ABR state and all of the 337 * submirror state and flags from the master node. We only 338 * perform this when going through a 'start' cycle. 339 */ 340 if ((mode & GET_MIRROR_STATE) && mirror_dev) { 341 char *miscname; 342 343 /* 344 * Ensure that we ignore soft-parts that are returned 345 * from the meta_get_mirror_names() call 346 */ 347 if ((miscname = metagetmiscname(devnp, ep)) == NULL) 348 goto out; 349 if (strcmp(miscname, MD_MIRROR) != 0) 350 continue; 351 352 mir_state->mnum = meta_getminor(devnp->dev); 353 MD_SETDRIVERNAME(mir_state, MD_MIRROR, sp->setno); 354 meta_mc_log(MC_LOG4, gettext("Getting mirror state" 355 " for %s: %s"), get_mdname(sp, mir_state->mnum), 356 meta_print_hrtime(gethrtime() - start_time)); 357 358 if (metaioctl(MD_MN_GET_MIRROR_STATE, mir_state, ep, 359 "MD_MN_GET_MIRROR_STATE") != 0) { 360 mde_perror(ep, gettext("Unable to get " 361 "mirror state for %s"), 362 get_mdname(sp, mir_state->mnum)); 363 goto out; 364 } else { 365 continue; 366 } 367 } 368 369 /* check if this is a top level metadevice */ 370 if ((mm = meta_get_mdunit(sp, devnp, ep)) == NULL) 371 goto out; 372 if (MD_HAS_PARENT(MD_PARENT(mm))) { 373 has_parent = 1; 374 } else { 375 has_parent = 0; 376 } 377 Free(mm); 378 379 if (mode & (RESET_OWNER | CHOOSE_OWNER)) { 380 char *miscname; 381 382 /* 383 * we can only do these for mirrors so make sure we 384 * really have a mirror device and not a softpartition 385 * imitating one. meta_get_mirror_names seems to think 386 * softparts on top of a mirror are mirrors! 387 */ 388 if ((miscname = metagetmiscname(devnp, ep)) == NULL) 389 goto out; 390 if (strcmp(miscname, MD_MIRROR) != 0) 391 continue; 392 393 (void) memset(ownpar, 0, sizeof (*ownpar)); 394 ownpar->d.mnum = meta_getminor(devnp->dev); 395 MD_SETDRIVERNAME(ownpar, MD_MIRROR, sp->setno); 396 397 meta_mc_log(MC_LOG4, gettext("Setting owner " 398 "for %s: %s"), get_mdname(sp, ownpar->d.mnum), 399 meta_print_hrtime(gethrtime() - start_time)); 400 401 /* get the current owner id */ 402 if (metaioctl(MD_MN_GET_MM_OWNER, ownpar, ep, 403 "MD_MN_GET_MM_OWNER") != 0) { 404 mde_perror(ep, gettext("Unable to get " 405 "mirror owner for %s"), 406 get_mdname(sp, ownpar->d.mnum)); 407 goto out; 408 } 409 } 410 411 if (mode & RESET_OWNER) { 412 if (ownpar->d.owner == MD_MN_MIRROR_UNOWNED) { 413 mdclrerror(ep); 414 continue; 415 } 416 417 /* 418 * reset owner only if the current owner is 419 * not in the membership list 420 * Also kill the resync thread so that when the resync 421 * is started, it will perform an optimized resync 422 * for any resync regions that were dirty when the 423 * current owner left the membership. 424 */ 425 if (meta_is_member(NULL, ownpar->d.owner, nl) != 1) { 426 if (meta_mn_change_owner(&ownpar, 427 sp->setno, ownpar->d.mnum, 428 MD_MN_MIRROR_UNOWNED, 429 MD_MN_MM_ALLOW_CHANGE) == -1) { 430 md_eprintf(gettext( 431 "Unable to reset mirror owner " 432 "for %s\n"), 433 get_mdname(sp, ownpar->d.mnum)); 434 goto out; 435 } 436 if (meta_mirror_resync(sp, devnp, 0, ep, 437 MD_RESYNC_KILL_NO_WAIT) != 0) { 438 md_eprintf(gettext( 439 "Unable to kill resync for" 440 " %s\n"), 441 get_mdname(sp, ownpar->d.mnum)); 442 goto out; 443 } 444 } 445 } 446 447 if (mode & CHOOSE_OWNER) { 448 /* 449 * only orphaned resyncs will have no owner. 450 * if that is the case choose a new owner. Otherwise 451 * re-establish the existing owner. This covers the 452 * case where a node that owned the mirror 453 * reboots/panics and comes back into the cluster before 454 * the reconfig cycle has completed. In this case the 455 * other cluster nodes will have the mirror owner marked 456 * as the rebooted node while it has the owner marked 457 * as 'None'. We have to reestablish the ownership so 458 * that the subsequent resync can continue. 459 */ 460 if (meta_mn_change_owner(&ownpar, sp->setno, 461 ownpar->d.mnum, ownpar->d.owner, 462 MD_MN_MM_CHOOSE_OWNER) == -1) { 463 md_eprintf(gettext("Unable to choose " 464 "mirror owner for %s\n"), 465 get_mdname(sp, ownpar->d.mnum)); 466 goto out; 467 } 468 } 469 470 /* 471 * For RESET_ABR and UPDATE_ABR - only handle top 472 * level metadevices. 473 */ 474 if (has_parent) 475 continue; 476 477 if (mode & RESET_ABR) { 478 /* 479 * Reset the ABR (application based recovery) 480 * value on all nodes. We are dealing with 481 * the possibility that we have ABR set but the 482 * only node that had the device open with ABR has 483 * left the cluster. We simply open and close the 484 * device and if this is the last close in the 485 * cluster, ABR will be cleared on all nodes. 486 */ 487 char *miscname; 488 char name[MAXPATHLEN]; 489 int mnum, fd; 490 491 name[0] = '\0'; 492 mnum = meta_getminor(devnp->dev); 493 494 /* 495 * Ensure that we don't include soft-parts in the 496 * mirror-only call to RESET_ABR. meta_get_mirror_names 497 * returns a bogus list that includes all soft-parts 498 * built on mirrors. 499 */ 500 if ((miscname = metagetmiscname(devnp, ep)) == NULL) 501 goto out; 502 if (mirror_dev && (strcmp(miscname, MD_MIRROR) != 0)) 503 continue; 504 505 meta_mc_log(MC_LOG4, gettext("Re-setting ABR state " 506 "for %s: %s"), get_mdname(sp, mnum), 507 meta_print_hrtime(gethrtime() - start_time)); 508 509 /* compose the absolute device path and open it */ 510 if (compose_path(sp, mnum, &name[0], 511 sizeof (name)) != 0) 512 goto out; 513 if ((fd = open(name, O_RDWR, 0)) < 0) { 514 md_perror(gettext("Could not open device %s"), 515 name); 516 continue; 517 } 518 519 (void) close(fd); 520 } 521 522 if (mode & UPDATE_ABR) { 523 /* 524 * Update the ABR value on this node. We obtain the 525 * current ABR state from the master node. 526 */ 527 528 char *miscname; 529 char name[MAXPATHLEN]; 530 int mnum, fd; 531 volcap_t vc; 532 uint_t tstate; 533 534 name[0] = '\0'; 535 mnum = meta_getminor(devnp->dev); 536 537 /* 538 * Ensure that we don't include soft-parts in the 539 * mirror-only call to UPDATE_ABR. meta_get_mirror_names 540 * returns a bogus list that includes all soft-parts 541 * built on mirrors. 542 */ 543 if ((miscname = metagetmiscname(devnp, ep)) == NULL) 544 goto out; 545 if (mirror_dev && (strcmp(miscname, MD_MIRROR) != 0)) 546 continue; 547 548 /* Get tstate from Master */ 549 if (meta_mn_send_get_tstate(devnp->dev, &tstate, ep) 550 != 0) 551 continue; 552 /* If not set on the master, nothing to do */ 553 if (!(tstate & MD_ABR_CAP)) 554 continue; 555 556 meta_mc_log(MC_LOG4, gettext("Updating ABR state " 557 "for %s: %s"), get_mdname(sp, mnum), 558 meta_print_hrtime(gethrtime() - start_time)); 559 560 /* compose the absolute device path and open it */ 561 if (compose_path(sp, mnum, &name[0], 562 sizeof (name)) != 0) 563 goto out; 564 if ((fd = open(name, O_RDWR, 0)) < 0) { 565 md_perror(gettext("Could not open device %s"), 566 name); 567 continue; 568 } 569 570 /* set ABR state */ 571 vc.vc_info = 0; 572 vc.vc_set = 0; 573 if (ioctl(fd, DKIOCGETVOLCAP, &vc) < 0) { 574 /* 575 * Ignore if device does not support this 576 * ioctl 577 */ 578 if ((errno != ENOTTY) && (errno != ENOTSUP)) { 579 md_perror(gettext("Could not get " 580 "ABR/DMR state for device %s"), 581 name); 582 } 583 (void) close(fd); 584 continue; 585 } 586 if (!(vc.vc_info & (DKV_ABR_CAP | DKV_DMR_CAP))) { 587 (void) close(fd); 588 continue; 589 } 590 591 vc.vc_set = DKV_ABR_CAP; 592 if (ioctl(fd, DKIOCSETVOLCAP, &vc) < 0) { 593 md_perror(gettext( 594 "Could not set ABR state for " 595 "device %s"), name); 596 (void) close(fd); 597 goto out; 598 } else { 599 md_eprintf(gettext( 600 "Setting ABR state on device %s\n"), name); 601 } 602 603 (void) close(fd); 604 } 605 } 606 607 /* cleanup */ 608 if (mode & RESET_OWNER) { 609 meta_free_nodelist(nl); 610 } 611 metafreenamelist(devnlp); 612 return (0); 613 614 out: 615 /* cleanup */ 616 if (mode & RESET_OWNER) { 617 meta_free_nodelist(nl); 618 } 619 metafreenamelist(devnlp); 620 return (-1); 621 } 622 623 /* 624 * Print usage message 625 */ 626 static void 627 usage(mdsetname_t *sp, int eval) 628 { 629 (void) fprintf(stderr, gettext("usage:" 630 "\t%s [-V version] [-t timeout] [-d level] start localnodeid\n" 631 "\t%s [-V version] [-t timeout] [-d level] step nodelist...\n" 632 "\t%s [-V version] [-t timeout] [-d level] abort | stop\n" 633 "\t%s [-V | -? | -h]\n"), 634 myname, myname, myname, myname); 635 if (!eval) { 636 fprintf(stderr, gettext("\n" 637 "\tValid debug (-d) levels are 1-%d for increasing " 638 "verbosity.\n\tDefault is -d 3.\n\n" 639 "\tValid step values are: return | step1 | step2 | " 640 "step3 | step4\n\n" 641 "\tNodelist is a space-separated list of node id's\n\n"), 642 MAX_DEBUG_LEVEL); 643 } 644 md_exit(sp, eval); 645 } 646 647 /* 648 * Input: Input takes a config step name followed by a list of 649 * possible node id's. 650 * 651 * Returns: 0 - Success 652 * 1 - Fail 653 * Node will be removed from cluster membership 654 * by forcing node to panic. 655 * 205 - Unsuccessful. Start another reconfig cycle. 656 * Problem was encountered that could be fixed by 657 * running another reconfig cycle. 658 * Problem could be a result of a failure to read 659 * the nodelist file or that all work could not be 660 * accomplished in a reconfig step in the amount of 661 * time given so another reconfig cycle is needed in 662 * order to finish the current step. 663 */ 664 int 665 main(int argc, char **argv) 666 { 667 mdsetname_t *sp = NULL; 668 md_error_t status = mdnullerror; 669 md_error_t *ep = &status; 670 set_t max_sets, setno; 671 int c, clust = 0; 672 struct sigaction nsa, osa; 673 struct step_t *step_ptr; 674 mdsetname_t *local_sp = NULL; 675 md_drive_desc *dd; 676 int rval = 0; 677 md_set_desc *sd; 678 mddb_block_parm_t mbp; 679 uint_t debug = 3; /* log upto MC_LOG3 by default */ 680 int version_table_size; 681 mddb_setflags_config_t sf; 682 int ret_val; 683 mddb_config_t cfg; 684 int set_info[MD_MAXSETS]; 685 long commd_timeout = 0; 686 687 /* 688 * Get the locale set up before calling any other routines 689 * with messages to ouput. Just in case we're not in a build 690 * environment, make sure that TEXT_DOMAIN gets set to 691 * something. 692 */ 693 #if !defined(TEXT_DOMAIN) 694 #define TEXT_DOMAIN "SYS_TEST" 695 #endif 696 (void) setlocale(LC_ALL, ""); 697 (void) textdomain(TEXT_DOMAIN); 698 699 if ((clust = sdssc_bind_library()) == SDSSC_ERROR) { 700 md_eprintf(gettext("Interface error with libsds_sc.so\n")); 701 exit(1); 702 } 703 704 if (md_init(argc, argv, 1, 1, ep) != 0 || meta_check_root(ep) != 0) { 705 mde_perror(ep, ""); 706 md_exit(sp, 1); 707 } 708 709 /* 710 * open log and enable libmeta logging. Do it here explicitly 711 * rather than letting md_init() do it because we are not really 712 * a daemon and that is what md_init() opens the log as. 713 */ 714 openlog("metaclust", LOG_CONS, LOG_USER); 715 716 version_table_size = sizeof (version_table) / sizeof (version_table[0]); 717 718 optind = 1; 719 opterr = 0; 720 while ((c = getopt(argc, argv, "hd:V:t:?")) != -1) { 721 switch (c) { 722 case 'h': 723 usage(sp, 0); 724 break; 725 726 case 'd': 727 if (sscanf(optarg, "%u", &debug) != 1) { 728 md_eprintf(gettext("Invalid debug level\n")); 729 md_exit(sp, 1); 730 } else if ((debug < 1) || (debug > MAX_DEBUG_LEVEL)) { 731 debug = min(max(debug, 1), MAX_DEBUG_LEVEL); 732 md_eprintf(gettext("Debug level must be " 733 "between 1 and %d inclusive.\n"), 734 MAX_DEBUG_LEVEL); 735 md_eprintf(gettext("Debug level set to %d.\n"), 736 debug); 737 } 738 break; 739 740 case 'V': 741 version = Strdup(optarg); 742 break; 743 744 case 't': 745 if (sscanf(optarg, "%u", &timeout) != 1) { 746 md_eprintf(gettext("Invalid timeout value\n")); 747 md_exit(sp, 1); 748 } 749 break; 750 751 case '?': 752 if (optopt == '?') { 753 usage(sp, 0); 754 } else if (optopt == 'V') { 755 int i; 756 757 fprintf(stdout, gettext( 758 "%s: Versions Supported:"), myname); 759 for (i = 0; i < version_table_size; i++) { 760 fprintf(stdout, " %s", 761 version_table[i]); 762 } 763 fprintf(stdout, "\n"); 764 md_exit(sp, 0); 765 } 766 /*FALLTHROUGH*/ 767 768 default: 769 usage(sp, 1); 770 break; 771 } 772 } 773 774 /* initialise the debug level and start time */ 775 setup_mc_log(debug); 776 777 /* 778 * check that the version specified (if any) is supported. 779 */ 780 if (version != NULL) { 781 int i, found = 0; 782 783 for (i = 0; i < version_table_size; i++) { 784 if (strcmp(version, version_table[i]) == 0) { 785 found = 1; 786 break; 787 } 788 } 789 if (!found) { 790 md_eprintf(gettext("Version %s not supported\n"), 791 version); 792 md_exit(sp, 1); 793 } 794 } 795 796 argc -= optind; 797 argv += optind; 798 799 /* parse arguments */ 800 if (argc <= 0) { 801 usage(sp, 1); 802 } 803 804 /* convert the step name to the corresponding number */ 805 step_ptr = bsearch(argv[0], step_table, (sizeof (step_table) / 806 sizeof (step_table[0])), sizeof (step_table[0]), mc_compare); 807 if (step_ptr != NULL) { 808 stepnum = step_ptr->step_num; 809 } 810 811 --argc; 812 ++argv; 813 814 /* set timeout alarm signal, a value of 0 will disable timeout */ 815 if (timeout > 0) { 816 int stat_loc = 0; 817 commd_timeout = (long)(timeout * .75); 818 819 c_pid = fork(); 820 821 if (c_pid == (pid_t)-1) { 822 md_perror(gettext("Unable to fork")); 823 md_exit(sp, 1); 824 } else if (c_pid) { 825 /* parent */ 826 nsa.sa_flags = 0; 827 if (sigfillset(&nsa.sa_mask) < 0) { 828 md_perror(gettext("Unable to set signal mask")); 829 md_exit(sp, 1); 830 } 831 832 nsa.sa_handler = sigalarmhandler; 833 if (sigaction(SIGALRM, &nsa, &osa) == -1) { 834 md_perror(gettext("Unable to set alarm " 835 "handler")); 836 md_exit(sp, 1); 837 } 838 839 (void) alarm(timeout); 840 841 /* 842 * wait for child to exit or timeout to expire. 843 * keep retrying if the call is interrupted 844 */ 845 while ((ret_val = waitpid(c_pid, &stat_loc, 0)) < 0) { 846 if (errno != EINTR) { 847 break; 848 } 849 } 850 if (ret_val == c_pid) { 851 /* exit with the childs exit value */ 852 exit(WEXITSTATUS(stat_loc)); 853 } else if (errno == ECHILD) { 854 md_exit(sp, 0); 855 } else { 856 perror(myname); 857 md_exit(sp, 1); 858 } 859 } 860 } 861 862 /* 863 * If a timeout value is given, everything from this point onwards is 864 * executed in the child process. 865 */ 866 867 switch (stepnum) { 868 case MC_START: 869 /* 870 * Start Step 871 * 872 * - Suspend all rpc.mdcommd messages 873 */ 874 875 /* expect the local node id to be given only */ 876 if (argc != 1) 877 usage(sp, 1); 878 879 meta_mc_log(MC_LOG2, gettext("Starting Start step: %s"), 880 meta_print_hrtime(0)); 881 882 /* 883 * Does local set exist? If not, exit with 0 884 * since there's no reason to have this node panic if 885 * the local set cannot be started. 886 */ 887 if ((local_sp = load_local_set(ep)) == NULL) { 888 md_exit(local_sp, 0); 889 } 890 891 if ((max_sets = get_max_sets(ep)) == 0) { 892 mde_perror(ep, ""); 893 md_exit(sp, 1); 894 } 895 896 /* start walking through all possible disksets */ 897 for (setno = 1; setno < max_sets; setno++) { 898 if ((sp = metasetnosetname(setno, ep)) == NULL) { 899 if (mdiserror(ep, MDE_NO_SET)) { 900 /* No set for this setno - continue */ 901 mdclrerror(ep); 902 continue; 903 } else { 904 mde_perror(ep, gettext("Unable to " 905 "get set %d information"), setno); 906 md_exit(sp, 1); 907 } 908 } 909 910 /* only check multi-node disksets */ 911 if (!meta_is_mn_set(sp, ep)) { 912 mdclrerror(ep); 913 continue; 914 } 915 916 meta_mc_log(MC_LOG3, gettext("Start - block parse " 917 "messages for set %s: %s"), sp->setname, 918 meta_print_hrtime(gethrtime() - start_time)); 919 920 /* 921 * Mddb parse messages are sent amongst the nodes 922 * in a diskset whenever the locator block or 923 * locator names structure has been changed. 924 * A locator block change could occur as a result 925 * of a disk failure during the reconfig cycle, 926 * so block the mddb parse messages while the 927 * rpc.mdcommd is suspended during the reconfig cycle. 928 */ 929 if (s_ownset(sp->setno, ep) == MD_SETOWNER_YES) { 930 (void) memset(&mbp, 0, sizeof (mbp)); 931 mbp.c_setno = setno; 932 mbp.c_blk_flags = MDDB_BLOCK_PARSE; 933 if (metaioctl(MD_MN_MDDB_BLOCK, &mbp, 934 &mbp.c_mde, NULL)) { 935 mdstealerror(ep, &mbp.c_mde); 936 mde_perror(ep, gettext("Could not " 937 "block set %s"), sp->setname); 938 md_exit(sp, 1); 939 } 940 } 941 942 /* suspend commd and spin waiting for drain */ 943 while ((ret_val = mdmn_suspend(setno, 944 MD_COMM_ALL_CLASSES, commd_timeout)) == 945 MDE_DS_COMMDCTL_SUSPEND_NYD) { 946 sleep(1); 947 } 948 949 if (ret_val) { 950 md_eprintf(gettext("Could not suspend " 951 "rpc.mdcommd for set %s\n"), sp->setname); 952 md_exit(sp, 1); 953 } 954 955 /* 956 * Set start step flag for set. This is set to indicate 957 * that this node entered the reconfig cycle through 958 * the start step. This is used during the reconfig 959 * cycle to determine whether the node had entered 960 * through the start step or the return step. 961 */ 962 (void) memset(&sf, 0, sizeof (sf)); 963 sf.sf_setno = sp->setno; 964 sf.sf_setflags = MD_SET_MN_START_RC; 965 sf.sf_flags = MDDB_NM_SET; 966 /* Use magic to help protect ioctl against attack. */ 967 sf.sf_magic = MDDB_SETFLAGS_MAGIC; 968 if (metaioctl(MD_MN_SET_SETFLAGS, &sf, 969 &sf.sf_mde, NULL)) { 970 mdstealerror(ep, &sf.sf_mde); 971 mde_perror(ep, gettext("Could not set " 972 "start_step flag for set %s"), sp->setname); 973 md_exit(sp, 1); 974 } 975 976 } 977 978 meta_mc_log(MC_LOG2, gettext("Start step completed: %s"), 979 meta_print_hrtime(gethrtime() - start_time)); 980 981 break; 982 983 case MC_STOP: 984 /* 985 * Stop Step 986 * 987 * - ??? 988 */ 989 990 /* don't expect any more arguments to follow the step name */ 991 if (argc != 0) 992 usage(sp, 1); 993 994 break; 995 996 case MC_ABORT: 997 /* 998 * Abort Step 999 * 1000 * - Abort rpc.mdcommd 1001 */ 1002 1003 /* don't expect any more arguments to follow the step name */ 1004 if (argc != 0) 1005 usage(sp, 1); 1006 1007 meta_mc_log(MC_LOG2, gettext("Starting Abort step: %s"), 1008 meta_print_hrtime(0)); 1009 1010 /* 1011 * Does local set exist? If not, exit with 0 1012 * since there's no reason to have this node panic if 1013 * the local set cannot be started. 1014 */ 1015 if ((local_sp = load_local_set(ep)) == NULL) { 1016 md_exit(local_sp, 0); 1017 } 1018 1019 /* 1020 * abort the rpc.mdcommd. The abort is only issued on this node 1021 * meaning that the abort reconfig step is called on this 1022 * node before a panic while the rest of the cluster will 1023 * undergo a reconfig cycle. 1024 * There is no time relation between this node running a 1025 * reconfig abort and the the rest of the cluster 1026 * running a reconfig cycle meaning that this node may 1027 * panic before, during or after the cluster has run 1028 * a reconfig cycle. 1029 */ 1030 mdmn_abort(); 1031 1032 meta_mc_log(MC_LOG2, gettext("Abort step completed: %s"), 1033 meta_print_hrtime(gethrtime() - start_time)); 1034 1035 break; 1036 1037 case MC_RETURN: 1038 /* 1039 * Return Step 1040 * 1041 * - Grab local set lock, issue rpc.mdcommd DRAIN ALL 1042 * and release local set lock. Grabbing the local set 1043 * lock allows any active metaset/metadb commands to 1044 * terminate gracefully and will keep a metaset/metadb 1045 * command from starting until the DRAIN ALL is issued. 1046 * The metaset/metadb commands can issue 1047 * DRAIN ALL/RESUME ALL commands to rpc.mdcommd, 1048 * so the return step must not issue the DRAIN ALL command 1049 * until metaset/metadb have finished or metaset may issue 1050 * a RESUME ALL after this return reconfig step has issued 1051 * the DRAIN ALL command. 1052 * After this reconfig step has issued the DRAIN_ALL and 1053 * released the local set lock, metaset/metadb will fail 1054 * when attempting to contact the rpc.mdcommd and will 1055 * terminate without making any configuration changes. 1056 * The DRAIN ALL command will keep all other meta* commands 1057 * from running during the reconfig cycle (these commands 1058 * will wait until the rpc.mdcommd is resumed) since the 1059 * reconfig cycle may be changing the diskset configuration. 1060 */ 1061 1062 /* expect the nodelist to follow the step name */ 1063 if (argc < 1) 1064 usage(sp, 1); 1065 1066 meta_mc_log(MC_LOG2, gettext("Starting Return step: %s"), 1067 meta_print_hrtime(0)); 1068 1069 /* 1070 * Does local set exist? If not, exit with 0 1071 * since there's no reason to have this node panic if 1072 * the local set cannot be started. 1073 */ 1074 if ((local_sp = load_local_set(ep)) == NULL) { 1075 md_exit(local_sp, 0); 1076 } 1077 1078 /* 1079 * Suspend any mirror resyncs that are in progress. This 1080 * stops unnecessary timeouts. 1081 */ 1082 meta_mirror_resync_block_all(); 1083 1084 if (meta_lock(local_sp, TRUE, ep) != 0) { 1085 mde_perror(ep, ""); 1086 md_exit(local_sp, 1); 1087 } 1088 1089 /* 1090 * All metaset and metadb commands on this node have now 1091 * terminated gracefully. Now, issue a drain all to 1092 * the rpc.mdcommd. Any meta command issued after the 1093 * drain all will either spin sending the command to the 1094 * master until after the reconfig cycle has finished OR 1095 * will terminate gracefully (metaset/metadb). 1096 */ 1097 if ((max_sets = get_max_sets(ep)) == 0) { 1098 mde_perror(ep, ""); 1099 md_exit(sp, 1); 1100 } 1101 1102 /* start walking through all possible disksets */ 1103 for (setno = 1; setno < max_sets; setno++) { 1104 if ((sp = metasetnosetname(setno, ep)) == NULL) { 1105 if (mdiserror(ep, MDE_NO_SET)) { 1106 /* No set for this setno - continue */ 1107 mdclrerror(ep); 1108 continue; 1109 } else { 1110 mde_perror(ep, gettext("Unable to " 1111 "get set %d information"), setno); 1112 md_exit(sp, 1); 1113 } 1114 } 1115 1116 /* only check multi-node disksets */ 1117 if (!meta_is_mn_set(sp, ep)) { 1118 mdclrerror(ep); 1119 continue; 1120 } 1121 1122 meta_mc_log(MC_LOG3, gettext("Return - block parse " 1123 "messages for set %s: %s"), sp->setname, 1124 meta_print_hrtime(gethrtime() - start_time)); 1125 1126 /* 1127 * Mddb parse messages are sent amongst the nodes 1128 * in a diskset whenever the locator block or 1129 * locator names structure has been changed. 1130 * A locator block change could occur as a result 1131 * of a disk failure during the reconfig cycle, 1132 * so block the mddb parse messages while the 1133 * rpc.commd is suspended during the reconfig cycle. 1134 */ 1135 if (s_ownset(sp->setno, ep) == MD_SETOWNER_YES) { 1136 (void) memset(&mbp, 0, sizeof (mbp)); 1137 mbp.c_setno = setno; 1138 mbp.c_blk_flags = MDDB_BLOCK_PARSE; 1139 if (metaioctl(MD_MN_MDDB_BLOCK, &mbp, 1140 &mbp.c_mde, NULL)) { 1141 mdstealerror(ep, &mbp.c_mde); 1142 mde_perror(ep, gettext("Could not " 1143 "block set %s"), sp->setname); 1144 md_exit(sp, 1); 1145 } 1146 } 1147 1148 /* suspend commd and spin waiting for drain */ 1149 while ((ret_val = mdmn_suspend(setno, 1150 MD_COMM_ALL_CLASSES, commd_timeout)) == 1151 MDE_DS_COMMDCTL_SUSPEND_NYD) { 1152 sleep(1); 1153 } 1154 1155 if (ret_val) { 1156 md_eprintf(gettext("Could not suspend " 1157 "rpc.mdcommd for set %s\n"), sp->setname); 1158 md_exit(sp, 1); 1159 } 1160 } 1161 /* 1162 * Resume all I/Os for this node for all MN sets in 1163 * case master node had suspended I/Os but panic'd 1164 * before resuming I/Os. In case of failure, exit 1165 * with a 1 since unable to resume I/Os on this node. 1166 */ 1167 if (clnt_mn_susp_res_io(mynode(), 0, MN_RES_IO, ep)) { 1168 mde_perror(ep, gettext( 1169 "Unable to resume I/O on node %s for all sets"), 1170 mynode()); 1171 md_exit(sp, 1); 1172 } 1173 1174 1175 /* 1176 * Can now unlock local set lock. New metaset/metadb 1177 * commands are now held off using drain all. 1178 */ 1179 (void) meta_unlock(local_sp, ep); 1180 1181 meta_mc_log(MC_LOG2, gettext("Return step completed: %s"), 1182 meta_print_hrtime(gethrtime() - start_time)); 1183 1184 break; 1185 1186 case MC_STEP1: 1187 /* 1188 * Step 1 1189 * 1190 * - Populate nodelist file if we are on clustering 1191 * and pick a master node for each MN diskset. 1192 */ 1193 1194 /* expect the nodelist to follow the step name */ 1195 if (argc < 1) 1196 usage(sp, 1); 1197 1198 meta_mc_log(MC_LOG2, gettext("Starting Step1: %s"), 1199 meta_print_hrtime(0)); 1200 1201 /* Always write nodelist file even if no local set exists */ 1202 if (clust == SDSSC_OKAY) { 1203 /* skip to the nodelist args */ 1204 if (meta_write_nodelist(argc, argv, ep) != 0) { 1205 mde_perror(ep, gettext( 1206 "Could not populate nodelist file")); 1207 md_exit(sp, 1); 1208 } 1209 } 1210 1211 /* 1212 * Does local set exist? If not, exit with 0 1213 * since there's no reason to have this node panic if 1214 * the local set cannot be started. 1215 */ 1216 if ((local_sp = load_local_set(ep)) == NULL) { 1217 md_exit(local_sp, 0); 1218 } 1219 1220 /* 1221 * At this point, all meta* commands are blocked across 1222 * all disksets since the master rpc.mdcommd has drained or 1223 * the master node has died. 1224 * If a metaset or metadb command had been in progress 1225 * at the start of the reconfig cycle, this command has 1226 * either completed or it has been terminated due to 1227 * the death of the master node. 1228 * 1229 * This means that that it is now ok to remove any 1230 * outstanding clnt_locks associated with multinode 1231 * disksets on this node due to a node panic during 1232 * a metaset operation. This allows the routines that 1233 * choose the master to use rpc.metad to determine the 1234 * master of the diskset. 1235 */ 1236 if (clnt_clr_mnsetlock(mynode(), ep) != 0) { 1237 meta_mc_log(MC_LOG2, gettext("Step1 aborted:" 1238 "clear locks failed %s"), 1239 meta_print_hrtime(gethrtime() - start_time)); 1240 md_exit(local_sp, 1); 1241 } 1242 1243 /* 1244 * Call reconfig_choose_master to choose a master for 1245 * each MN diskset, update the nodelist for each diskset 1246 * given the member information and send a reinit message 1247 * to rpc.mdcommd to reload the nodelist. 1248 */ 1249 rval = meta_reconfig_choose_master(commd_timeout, ep); 1250 if (rval == 205) { 1251 /* 1252 * NOTE: Should issue call to reboot remote host that 1253 * is causing the RPC failure. Clustering to 1254 * provide interface in the future. This should 1255 * stop a never-ending set of 205 reconfig cycles. 1256 * Remote host causing failure is stored in 1257 * ep->host if ep is an RPC error. 1258 * if (mdanyrpcerror(ep)) 1259 * reboot (ep->host); 1260 */ 1261 meta_mc_log(MC_LOG2, gettext("Step1 aborted:" 1262 "choose master failure of 205 %s"), 1263 meta_print_hrtime(gethrtime() - start_time)); 1264 md_exit(local_sp, 205); 1265 } else if (rval != 0) { 1266 meta_mc_log(MC_LOG2, gettext("Step1 failure: " 1267 "choose master failure %s"), 1268 meta_print_hrtime(gethrtime() - start_time)); 1269 md_exit(local_sp, 1); 1270 } 1271 1272 meta_mc_log(MC_LOG2, gettext("Step1 completed: %s"), 1273 meta_print_hrtime(gethrtime() - start_time)); 1274 1275 md_exit(local_sp, rval); 1276 break; 1277 1278 case MC_STEP2: 1279 /* 1280 * Step 2 1281 * 1282 * In Step 2, each node walks the list of disksets. If a 1283 * node is a master of a MN diskset, it synchronizes 1284 * the local set USER records for that diskset. 1285 * 1286 * If disks exist in the diskset and there is a joined 1287 * (owner) node in the diskset, the master will also: 1288 * - synchronize the diskset mddbs to the master 1289 * - play the change log 1290 * 1291 * The master node will now attempt to join any unjoined 1292 * nodes that are currently members in the membership list. 1293 */ 1294 1295 /* expect the nodelist to follow the step name */ 1296 if (argc < 1) 1297 usage(sp, 1); 1298 1299 meta_mc_log(MC_LOG2, gettext("Starting Step2: %s"), 1300 meta_print_hrtime(0)); 1301 1302 /* 1303 * Does local set exist? If not, exit with 0 1304 * since there's no reason to have this node panic if 1305 * the local set cannot be started. 1306 */ 1307 if ((local_sp = load_local_set(ep)) == NULL) { 1308 md_exit(local_sp, 0); 1309 } 1310 1311 if ((max_sets = get_max_sets(ep)) == 0) { 1312 mde_perror(ep, ""); 1313 md_exit(local_sp, 1); 1314 } 1315 1316 /* start walking through all possible disksets */ 1317 for (setno = 1; setno < max_sets; setno++) { 1318 if ((sp = metasetnosetname(setno, ep)) == NULL) { 1319 if (mdiserror(ep, MDE_NO_SET)) { 1320 /* No set for this setno - continue */ 1321 mdclrerror(ep); 1322 continue; 1323 } else if (mdanyrpcerror(ep)) { 1324 /* Fail on RPC failure to self */ 1325 mde_perror(ep, gettext( 1326 "Unable to get information for " 1327 "set number %d"), setno); 1328 md_exit(local_sp, 1); 1329 } else { 1330 mde_perror(ep, gettext( 1331 "Unable to get information for " 1332 "set number %d"), setno); 1333 mdclrerror(ep); 1334 continue; 1335 } 1336 } 1337 1338 if ((sd = metaget_setdesc(sp, ep)) == NULL) { 1339 if (mdanyrpcerror(ep)) { 1340 /* Fail on RPC failure to self */ 1341 mde_perror(ep, gettext( 1342 "Unable to get information for " 1343 "set number %d"), setno); 1344 md_exit(local_sp, 1); 1345 } 1346 mde_perror(ep, gettext("Unable to get set " 1347 "%s desc information"), sp->setname); 1348 mdclrerror(ep); 1349 continue; 1350 } 1351 1352 /* Only check MN disksets */ 1353 if (!(MD_MNSET_DESC(sd))) { 1354 continue; 1355 } 1356 1357 /* All actions in step 2 are driven by master */ 1358 if (!(sd->sd_mn_am_i_master)) { 1359 continue; 1360 } 1361 1362 meta_mc_log(MC_LOG3, gettext("Step2 - begin record " 1363 "synchronization for set %s: %s"), sp->setname, 1364 meta_print_hrtime(gethrtime() - start_time)); 1365 1366 /* 1367 * Synchronize the USER records in the local mddbs 1368 * for hosts that are members. The USER records 1369 * contain set, drive and host information. 1370 */ 1371 rval = meta_mnsync_user_records(sp, ep); 1372 if (rval != 0) { 1373 mde_perror(ep, gettext( 1374 "Synchronization of user records " 1375 "in set %s failed\n"), sp->setname); 1376 if (rval == 205) { 1377 /* 1378 * NOTE: Should issue call to reboot 1379 * remote host that is causing the RPC 1380 * failure. Clustering to provide 1381 * interface in the future. This 1382 * should stop a never-ending set of 1383 * 205 reconfig cycles. 1384 * Remote host causing failure is 1385 * stored in ep->host if ep is an 1386 * RPC error. 1387 * if (mdanyrpcerror(ep)) 1388 * reboot (ep->host); 1389 */ 1390 md_exit(local_sp, 205); 1391 } else { 1392 md_exit(local_sp, 1); 1393 } 1394 } 1395 1396 /* Reget sd since sync_user_recs may have flushed it */ 1397 if ((sd = metaget_setdesc(sp, ep)) == NULL) { 1398 mde_perror(ep, gettext("Unable to get set " 1399 "%s desc information"), sp->setname); 1400 md_exit(local_sp, 1); 1401 } 1402 1403 dd = metaget_drivedesc(sp, 1404 (MD_BASICNAME_OK | PRINT_FAST), ep); 1405 if (! mdisok(ep)) { 1406 mde_perror(ep, gettext("Unable to get set " 1407 "%s drive information"), sp->setname); 1408 md_exit(local_sp, 1); 1409 } 1410 1411 /* 1412 * No drives in set, continue to next set. 1413 */ 1414 if (dd == NULL) { 1415 /* Done with this set */ 1416 continue; 1417 } 1418 1419 meta_mc_log(MC_LOG3, gettext("Step2 - local set user " 1420 "records completed for set %s: %s"), sp->setname, 1421 meta_print_hrtime(gethrtime() - start_time)); 1422 1423 /* 1424 * Synchronize the diskset mddbs for hosts 1425 * that are members. This may involve 1426 * playing the changelog and writing out 1427 * to the diskset mddbs. 1428 */ 1429 rval = meta_mnsync_diskset_mddbs(sp, ep); 1430 if (rval != 0) { 1431 mde_perror(ep, gettext( 1432 "Synchronization of diskset mddbs " 1433 "in set %s failed\n"), sp->setname); 1434 meta_mc_log(MC_LOG3, gettext("Step2 - diskset " 1435 "mddb synchronization failed for " 1436 "set %s: %s"), sp->setname, 1437 meta_print_hrtime(gethrtime() - 1438 start_time)); 1439 if (rval == 205) { 1440 /* 1441 * NOTE: Should issue call to reboot 1442 * remote host that is causing the RPC 1443 * failure. Clustering to provide 1444 * interface in the future. This 1445 * should stop a never-ending set of 1446 * 205 reconfig cycles. 1447 * Remote host causing failure is 1448 * stored in ep->host if ep is an 1449 * RPC error. 1450 * if (mdanyrpcerror(ep)) 1451 * reboot (ep->host); 1452 */ 1453 md_exit(local_sp, 205); 1454 } else if (rval == 1) { 1455 continue; 1456 } else { 1457 md_exit(local_sp, 1); 1458 } 1459 } 1460 1461 meta_mc_log(MC_LOG3, gettext("Step2 - diskset mddb " 1462 "synchronization completed for set %s: %s"), 1463 sp->setname, 1464 meta_print_hrtime(gethrtime() - start_time)); 1465 1466 /* Join the starting nodes to the diskset */ 1467 rval = meta_mnjoin_all(sp, ep); 1468 if (rval != 0) { 1469 mde_perror(ep, gettext( 1470 "Join of non-owner (starting) nodes " 1471 "in set %s failed\n"), sp->setname); 1472 meta_mc_log(MC_LOG3, gettext("Step2 - non owner" 1473 "nodes joined for set %s: %s"), 1474 sp->setname, 1475 meta_print_hrtime(gethrtime() - 1476 start_time)); 1477 if (rval == 205) { 1478 /* 1479 * NOTE: Should issue call to reboot 1480 * remote host that is causing the RPC 1481 * failure. Clustering to provide 1482 * interface in the future. This 1483 * should stop a never-ending set of 1484 * 205 reconfig cycles. 1485 * Remote host causing failure is 1486 * stored in ep->host if ep is an 1487 * RPC error. 1488 * if (mdanyrpcerror(ep)) 1489 * reboot (ep->host); 1490 */ 1491 md_exit(local_sp, 205); 1492 } else { 1493 md_exit(local_sp, 1); 1494 } 1495 } 1496 1497 meta_mc_log(MC_LOG3, gettext("Step2 - non owner nodes " 1498 "joined for set %s: %s"), sp->setname, 1499 meta_print_hrtime(gethrtime() - start_time)); 1500 1501 } 1502 1503 meta_mc_log(MC_LOG2, gettext("Step2 completed: %s"), 1504 meta_print_hrtime(gethrtime() - start_time)); 1505 1506 break; 1507 1508 case MC_STEP3: 1509 /* 1510 * Step 3 1511 * 1512 * For all multinode sets do, 1513 * - Reinitialise rpc.mdcommd 1514 * - Reset mirror owners to null if the current owner is 1515 * no longer in the membership list 1516 */ 1517 1518 /* expect the nodelist to follow the step name */ 1519 if (argc < 1) 1520 usage(sp, 1); 1521 1522 meta_mc_log(MC_LOG2, gettext("Starting Step3: %s"), 1523 meta_print_hrtime(0)); 1524 1525 /* 1526 * Does local set exist? If not, exit with 0 1527 * since there's no reason to have this node panic if 1528 * the local set cannot be started. 1529 */ 1530 if ((local_sp = load_local_set(ep)) == NULL) { 1531 md_exit(local_sp, 0); 1532 } 1533 1534 /* 1535 * walk through all sets on this node which could include: 1536 * - MN disksets 1537 * - traditional disksets 1538 * - non-existent disksets 1539 * start mirror resync for all MN sets 1540 */ 1541 if ((max_sets = get_max_sets(ep)) == 0) { 1542 mde_perror(ep, ""); 1543 md_exit(local_sp, 1); 1544 } 1545 1546 /* start walking through all possible disksets */ 1547 for (setno = 1; setno < max_sets; setno++) { 1548 if ((sp = metasetnosetname(setno, ep)) == NULL) { 1549 if (mdiserror(ep, MDE_NO_SET)) { 1550 /* No set for this setno - continue */ 1551 mdclrerror(ep); 1552 continue; 1553 } else { 1554 mde_perror(ep, gettext("Unable to " 1555 "get set %d information"), setno); 1556 md_exit(local_sp, 1); 1557 } 1558 } 1559 1560 /* only check multi-node disksets */ 1561 if (!meta_is_mn_set(sp, ep)) { 1562 mdclrerror(ep); 1563 continue; 1564 } 1565 1566 if (meta_lock(sp, TRUE, ep) != 0) { 1567 mde_perror(ep, ""); 1568 md_exit(local_sp, 1); 1569 } 1570 1571 /* If this node isn't joined to set, do nothing */ 1572 if (s_ownset(sp->setno, ep) != MD_SETOWNER_YES) { 1573 if (!mdisok(ep)) { 1574 mde_perror(ep, gettext("Could " 1575 "not get set %s ownership"), 1576 sp->setname); 1577 md_exit(sp, 1); 1578 } 1579 mdclrerror(ep); 1580 meta_unlock(sp, ep); 1581 continue; 1582 } 1583 1584 meta_mc_log(MC_LOG3, gettext("Step3 - begin " 1585 "re-initialising rpc.mdcommd and resetting mirror " 1586 "owners for set %s: %s"), sp->setname, 1587 meta_print_hrtime(gethrtime() - start_time)); 1588 1589 /* reinitialzse rpc.mdcommd with new nodelist */ 1590 if (mdmn_reinit_set(setno, commd_timeout)) { 1591 md_eprintf(gettext( 1592 "Could not re-initialise rpc.mdcommd for " 1593 "set %s\n"), sp->setname); 1594 md_exit(sp, 1); 1595 } 1596 1597 (void) memset(&cfg, 0, sizeof (cfg)); 1598 cfg.c_id = 0; 1599 cfg.c_setno = sp->setno; 1600 if (metaioctl(MD_DB_GETDEV, &cfg, &cfg.c_mde, 1601 NULL) != 0) { 1602 mdstealerror(ep, &cfg.c_mde); 1603 mde_perror(ep, gettext("Could " 1604 "not get set %s information"), 1605 sp->setname); 1606 md_exit(sp, 1); 1607 } 1608 1609 /* Don't do anything else if set is stale */ 1610 if (cfg.c_flags & MDDB_C_STALE) { 1611 meta_unlock(sp, ep); 1612 mdclrerror(ep); 1613 continue; 1614 } 1615 1616 /* reset mirror owners */ 1617 if (reset_state(RESET_OWNER, sp, MD_MIRROR, ep) == -1) { 1618 md_exit(sp, 1); 1619 } 1620 1621 meta_unlock(sp, ep); 1622 1623 meta_mc_log(MC_LOG3, gettext("Step3 - rpc.mdcommd " 1624 "re-initialised and mirror owners reset for " 1625 "set %s: %s"), sp->setname, 1626 meta_print_hrtime(gethrtime() - start_time)); 1627 } 1628 1629 meta_mc_log(MC_LOG2, gettext("Step3 completed: %s"), 1630 meta_print_hrtime(gethrtime() - start_time)); 1631 1632 break; 1633 1634 case MC_STEP4: 1635 /* 1636 * Step 4 1637 * 1638 * For all multinode sets do: 1639 * - Resume the rpc.mdcommd messages. Must resume all 1640 * sets before issuing I/O to any set since an error 1641 * encountered in a commd suspended set could be 1642 * blocked waiting for commd in another set to resume. 1643 * (This happens since the daemon queues service 1644 * all sets). An open of a soft partition causes 1645 * a read of the watermarks during the open. 1646 * - If set is non-writable (not an owner or STALE), then 1647 * continue to next set. 1648 * 1649 * For all multinode sets do, 1650 * - Reset ABR states for all mirrors, ie clear ABR if not 1651 * open on any node. 1652 * - Reset ABR states for all soft partitions, ie clear ABR if 1653 * not open on any node. 1654 * - For all slave nodes that have entered through the start 1655 * step, update the ABR state to that of the master and 1656 * get the submirror state from the master 1657 * - meta_lock set 1658 * - Resync all mirrors 1659 * - unlock meta_lock for this set. 1660 * - Choose a new owner for any orphaned resyncs 1661 * 1662 * There is one potential issue here. when concurrently 1663 * resetting and updating the ABR state. If the master has ABR 1664 * set, but should no longer have because the only node that 1665 * had the metadevice open and had ABR set has paniced, the 1666 * master will send a message to all nodes to clear the ABR 1667 * state. Meanwhile any node that has come through the 1668 * start step will get tstate from the master and will update 1669 * ABR if it was set in tstate. So, we appear to have a problem 1670 * if the following sequence occurs:- 1671 * - The slave gets tstate with ABR set 1672 * - The master sends a message to clear ABR 1673 * - The slave updates ABR with the value it got from tstate. 1674 * We now have the master with ABR clear and the slave with ABR 1675 * set. Fortunately, having set ABR, the slave will close the 1676 * metadevice after setting ABR and as there are no nodes with 1677 * the device open, the close will send a message to clear ABR 1678 * on all nodes. So, the nodes will all have ABR unset. 1679 */ 1680 1681 /* expect the nodelist to follow the step name */ 1682 if (argc < 1) 1683 usage(sp, 1); 1684 1685 meta_mc_log(MC_LOG2, gettext("Starting Step4: %s"), 1686 meta_print_hrtime(0)); 1687 1688 /* 1689 * Does local set exist? If not, exit with 0 1690 * since there's no reason to have this node panic if 1691 * the local set cannot be started. 1692 */ 1693 if ((local_sp = load_local_set(ep)) == NULL) { 1694 md_exit(local_sp, 0); 1695 } 1696 1697 /* 1698 * walk through all sets on this node which could include: 1699 * - MN disksets 1700 * - traditional disksets 1701 * - non-existent disksets 1702 * start mirror resync for all MN sets 1703 */ 1704 if ((max_sets = get_max_sets(ep)) == 0) { 1705 mde_perror(ep, ""); 1706 md_exit(local_sp, 1); 1707 } 1708 1709 /* Clear set_info structure */ 1710 for (setno = 1; setno < max_sets; setno++) { 1711 set_info[setno] = 0; 1712 } 1713 1714 /* start walking through all possible disksets */ 1715 for (setno = 1; setno < max_sets; setno++) { 1716 if ((sp = metasetnosetname(setno, ep)) == NULL) { 1717 if (mdiserror(ep, MDE_NO_SET)) { 1718 /* No set for this setno - continue */ 1719 mdclrerror(ep); 1720 continue; 1721 } else { 1722 mde_perror(ep, gettext("Unable to " 1723 "get set %d information"), setno); 1724 md_exit(local_sp, 1); 1725 } 1726 } 1727 1728 if ((sd = metaget_setdesc(sp, ep)) == NULL) { 1729 mde_perror(ep, gettext("Unable to get set " 1730 "%s desc information"), sp->setname); 1731 mdclrerror(ep); 1732 continue; 1733 } 1734 1735 /* only check multi-node disksets */ 1736 if (!meta_is_mn_set(sp, ep)) { 1737 mdclrerror(ep); 1738 continue; 1739 } 1740 1741 set_info[setno] |= SET_INFO_MN; 1742 1743 /* 1744 * If not an owner (all mddbs failed) or stale 1745 * (< 50% mddbs operational), then set is 1746 * non-writable so just resume commd and 1747 * unblock mddb messages. 1748 */ 1749 mdclrerror(ep); 1750 if (s_ownset(sp->setno, ep) != MD_SETOWNER_YES) { 1751 set_info[setno] |= SET_INFO_NO_WR; 1752 } 1753 if (!mdisok(ep)) { 1754 mde_perror(ep, gettext("Could " 1755 "not get set %s ownership"), 1756 sp->setname); 1757 md_exit(local_sp, 1); 1758 } 1759 /* Set is owned - is it stale? */ 1760 if (!set_info[setno] & SET_INFO_NO_WR) { 1761 (void) memset(&cfg, 0, sizeof (cfg)); 1762 cfg.c_id = 0; 1763 cfg.c_setno = sp->setno; 1764 if (metaioctl(MD_DB_GETDEV, &cfg, &cfg.c_mde, 1765 NULL) != 0) { 1766 mdstealerror(ep, &cfg.c_mde); 1767 mde_perror(ep, gettext("Could " 1768 "not get set %s information"), 1769 sp->setname); 1770 md_exit(local_sp, 1); 1771 } 1772 if (cfg.c_flags & MDDB_C_STALE) { 1773 set_info[setno] |= SET_INFO_NO_WR; 1774 } 1775 } 1776 1777 /* resume rpc.mdcommd */ 1778 if (mdmn_resume(setno, MD_COMM_ALL_CLASSES, 0, 1779 commd_timeout)) { 1780 md_eprintf(gettext("Unable to resume " 1781 "rpc.mdcommd for set %s\n"), sp->setname); 1782 md_exit(local_sp, 1); 1783 } 1784 1785 /* Unblock mddb parse messages */ 1786 if (s_ownset(sp->setno, ep) == MD_SETOWNER_YES) { 1787 (void) memset(&mbp, 0, sizeof (mbp)); 1788 mbp.c_setno = setno; 1789 mbp.c_blk_flags = MDDB_UNBLOCK_PARSE; 1790 if (metaioctl(MD_MN_MDDB_BLOCK, &mbp, 1791 &mbp.c_mde, NULL)) { 1792 mdstealerror(ep, &mbp.c_mde); 1793 mde_perror(ep, gettext("Could not " 1794 "unblock set %s"), sp->setname); 1795 md_exit(local_sp, 1); 1796 } 1797 } 1798 meta_mc_log(MC_LOG3, gettext("Step4 - rpc.mdcommd " 1799 "resumed and messages unblocked for set %s: %s"), 1800 sp->setname, 1801 meta_print_hrtime(gethrtime() - start_time)); 1802 } 1803 1804 for (setno = 1; setno < max_sets; setno++) { 1805 int start_step; 1806 1807 /* Skip traditional disksets. */ 1808 if ((set_info[setno] & SET_INFO_MN) == 0) 1809 continue; 1810 1811 /* 1812 * If already determined that this set is 1813 * a non-writable set, then just continue 1814 * to next set since there's nothing else 1815 * to do for a non-writable set. 1816 */ 1817 if (set_info[setno] & SET_INFO_NO_WR) 1818 continue; 1819 1820 if ((sp = metasetnosetname(setno, ep)) == NULL) { 1821 if (mdiserror(ep, MDE_NO_SET)) { 1822 /* No set for this setno - continue */ 1823 mdclrerror(ep); 1824 continue; 1825 } else { 1826 mde_perror(ep, gettext("Unable to " 1827 "get set %d information"), setno); 1828 md_exit(local_sp, 1); 1829 } 1830 } 1831 1832 if ((sd = metaget_setdesc(sp, ep)) == NULL) { 1833 mde_perror(ep, gettext("Unable to get set " 1834 "%s desc information"), sp->setname); 1835 mdclrerror(ep); 1836 continue; 1837 } 1838 1839 /* See if this node came through the start step */ 1840 (void) memset(&sf, 0, sizeof (sf)); 1841 sf.sf_setno = sp->setno; 1842 sf.sf_flags = MDDB_NM_GET; 1843 /* Use magic to help protect ioctl against attack. */ 1844 sf.sf_magic = MDDB_SETFLAGS_MAGIC; 1845 if (metaioctl(MD_MN_GET_SETFLAGS, &sf, 1846 &sf.sf_mde, NULL)) { 1847 mdstealerror(ep, &sf.sf_mde); 1848 mde_perror(ep, gettext("Could not get " 1849 "start_step flag for set %s"), sp->setname); 1850 md_exit(local_sp, 1); 1851 } 1852 start_step = 1853 (sf.sf_setflags & MD_SET_MN_START_RC)? 1: 0; 1854 1855 /* 1856 * We can now reset the start_step flag for the set 1857 * if it was already set. 1858 */ 1859 if (start_step) { 1860 (void) memset(&sf, 0, sizeof (sf)); 1861 sf.sf_setno = sp->setno; 1862 sf.sf_setflags = MD_SET_MN_START_RC; 1863 sf.sf_flags = MDDB_NM_RESET; 1864 /* 1865 * Use magic to help protect ioctl 1866 * against attack. 1867 */ 1868 sf.sf_magic = MDDB_SETFLAGS_MAGIC; 1869 if (metaioctl(MD_MN_SET_SETFLAGS, &sf, 1870 &sf.sf_mde, NULL)) { 1871 mdstealerror(ep, &sf.sf_mde); 1872 mde_perror(ep, 1873 gettext("Could not reset " 1874 "start_step flag for set %s"), 1875 sp->setname); 1876 } 1877 } 1878 1879 meta_mc_log(MC_LOG3, gettext("Step4 - begin setting " 1880 "ABR state and restarting io's for " 1881 "set %s: %s"), sp->setname, 1882 meta_print_hrtime(gethrtime() - start_time)); 1883 1884 1885 /* 1886 * If we are not the master and we have come through 1887 * the start step, we must update the ABR states 1888 * for mirrors and soft partitions. Also the submirror 1889 * states need to be synchronised so that we see the 1890 * same status as other previously joined members. 1891 * This _must_ be done before starting the resync. 1892 */ 1893 if (!(sd->sd_mn_am_i_master) && start_step) { 1894 if (reset_state(GET_MIRROR_STATE, sp, MD_MIRROR, 1895 ep) == -1) { 1896 md_exit(local_sp, 1); 1897 } 1898 if (reset_state(UPDATE_ABR, sp, MD_SP, 1899 ep) == -1) { 1900 md_exit(local_sp, 1); 1901 } 1902 /* 1903 * Mark the fact that we've got the mirror 1904 * state. This allows the resync thread to 1905 * determine if _it_ needs to issue this. This 1906 * can happen if a node is added to a set after 1907 * a reconfig cycle has completed. 1908 */ 1909 (void) memset(&sf, 0, sizeof (sf)); 1910 sf.sf_setno = sp->setno; 1911 sf.sf_setflags = MD_SET_MN_MIR_STATE_RC; 1912 sf.sf_flags = MDDB_NM_SET; 1913 /* 1914 * Use magic to help protect ioctl 1915 * against attack. 1916 */ 1917 sf.sf_magic = MDDB_SETFLAGS_MAGIC; 1918 if (metaioctl(MD_MN_SET_SETFLAGS, &sf, 1919 &sf.sf_mde, NULL)) { 1920 mdstealerror(ep, &sf.sf_mde); 1921 mde_perror(ep, 1922 gettext("Could not set " 1923 "submirror state flag for set %s"), 1924 sp->setname); 1925 } 1926 } 1927 1928 /* 1929 * All remaining actions are only performed by the 1930 * master 1931 */ 1932 if (!(sd->sd_mn_am_i_master)) { 1933 if (meta_lock(sp, TRUE, ep) != 0) { 1934 mde_perror(ep, ""); 1935 md_exit(local_sp, 1); 1936 } 1937 meta_mirror_resync_unblock(sp); 1938 meta_unlock(sp, ep); 1939 continue; 1940 } 1941 1942 /* 1943 * If the master came through the start step, this 1944 * implies that all of the nodes must have done the 1945 * same and hence there can be no applications 1946 * running. Hence no need to reset ABR 1947 */ 1948 if (!start_step) { 1949 /* Reset ABR state for mirrors */ 1950 if (reset_state(RESET_ABR, sp, MD_MIRROR, 1951 ep) == -1) { 1952 md_exit(local_sp, 1); 1953 } 1954 /* ...and now the same for soft partitions */ 1955 if (reset_state(RESET_ABR, sp, MD_SP, 1956 ep) == -1) { 1957 md_exit(local_sp, 1); 1958 } 1959 } 1960 1961 /* 1962 * choose owners for orphaned resyncs and reset 1963 * non-orphaned resyncs so that an owner node that 1964 * reboots will restart the resync if needed. 1965 */ 1966 if (reset_state(CHOOSE_OWNER, sp, MD_MIRROR, ep) == -1) 1967 md_exit(local_sp, 1); 1968 1969 /* 1970 * Must unlock set lock before meta_mirror_resync_all 1971 * sends a message to run the metasync command 1972 * which also grabs the meta_lock. 1973 */ 1974 if (meta_lock(sp, TRUE, ep) != 0) { 1975 mde_perror(ep, ""); 1976 md_exit(local_sp, 1); 1977 } 1978 meta_mirror_resync_unblock(sp); 1979 meta_unlock(sp, ep); 1980 1981 /* resync all mirrors in set */ 1982 if (meta_mirror_resync_all(sp, 0, ep) != 0) { 1983 mde_perror(ep, gettext("Mirror resyncs " 1984 "failed for set %s"), sp->setname); 1985 md_exit(local_sp, 1); 1986 } 1987 1988 meta_mc_log(MC_LOG3, gettext("Step4 - io's restarted " 1989 "for set %s: %s"), sp->setname, 1990 meta_print_hrtime(gethrtime() - start_time)); 1991 } 1992 1993 meta_mc_log(MC_LOG2, gettext("Step4 completed: %s"), 1994 meta_print_hrtime(gethrtime() - start_time)); 1995 1996 break; 1997 1998 default: 1999 usage(sp, 1); 2000 break; 2001 } 2002 2003 md_exit(sp, 0); 2004 /* NOTREACHED */ 2005 return (0); 2006 } 2007