/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License"). You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident "%Z%%M% %I% %E% SMI"

#include <meta.h>
#include <sdssc.h>
#include <signal.h>
#include <syslog.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <sys/lvm/md_mirror.h>
#include <metad.h>

#define	MY_VERSION		"1.0"	/* the highest supported version */
#define	MAX_DEBUG_LEVEL		5	/* maximum verbosity level */

/*
 * Action bits for the `mode' argument of reset_state().  The bits are
 * tested individually, so several actions may be requested in one call.
 */
#define	RESET_OWNER		0x0001	/* drop owners that left membership */
#define	CHOOSE_OWNER		0x0002	/* choose/re-establish mirror owner */
#define	RESET_ABR		0x0004	/* open+close device to reset ABR */
#define	UPDATE_ABR		0x0008	/* copy ABR state from the master */
#define	GET_MIRROR_STATE	0x0010	/* fetch mirror state via ioctl */

/*
 * NOTE(review): presumably per-set flag bits kept in main()'s set_info[]
 * array; the code that uses them is not in this part of the file -
 * confirm before relying on this description.
 */
#define	SET_INFO_NO_WR		0x0002
#define	SET_INFO_MN		0x0004

/*
 * This table defines all the metaclust reconfig steps we understand.
 * MC_UNK (0) is the initial value of the global `stepnum' and means that
 * no valid step name has been recognised.
 */
typedef enum stpnum {
	MC_UNK = 0,
	MC_START,
	MC_STOP,
	MC_ABORT,
	MC_RETURN,
	MC_STEP1,
	MC_STEP2,
	MC_STEP3,
	MC_STEP4
} stepnum_t;

/*
 * Structure for step_name -> step_number mapping
 */
struct step_t {
	char		*step_nam;	/* step name as given on command line */
	stepnum_t	step_num;	/* corresponding step number */
};
mapping table 76 * This table MUST be sorted alphabetically in ascending order of step name 77 */ 78 static struct step_t step_table[] = { 79 { "abort", MC_ABORT }, 80 { "return", MC_RETURN }, 81 { "start", MC_START }, 82 { "step1", MC_STEP1 }, 83 { "step2", MC_STEP2 }, 84 { "step3", MC_STEP3 }, 85 { "step4", MC_STEP4 }, 86 { "stop", MC_STOP } 87 }; 88 89 /* 90 * If support for a different version is added, the new version number should 91 * be appended to the version_table below. This list will be searched to 92 * determine if a version requested via the -V option is supported or not. 93 */ 94 static char *version_table[] = { 95 MY_VERSION 96 }; 97 98 uint_t timeout = 0; /* disable timeout by default */ 99 char *version = MY_VERSION; /* use latest version by default */ 100 int stepnum = MC_UNK; /* reconfiguration step number */ 101 pid_t c_pid; /* child process id */ 102 103 /* 104 * Binary search comparison routine 105 */ 106 static int 107 mc_compare(const void *stp1, const void *stp2) 108 { 109 return (strcmp((const char *)stp1, 110 ((const struct step_t *)stp2)->step_nam)); 111 } 112 113 /* 114 * Timeout expiry alarm signal handler 115 */ 116 /*ARGSUSED*/ 117 static void 118 sigalarmhandler(int sig) 119 { 120 int i, n, ret, stat_loc = 0; 121 122 n = sizeof (step_table) / sizeof (step_table[0]); 123 for (i = 0; i < n; i++) { 124 if (stepnum == step_table[i].step_num) 125 break; 126 } 127 128 assert(i != n); 129 130 meta_mc_log(MC_LOG1, gettext("Timeout expired in %s: %s"), 131 step_table[i].step_nam, 132 meta_print_hrtime(gethrtime() - start_time)); 133 134 if ((ret = kill(c_pid, SIGKILL)) == 0) { 135 /* 136 * The child will wait forever until the status is retrieved 137 * so get it now. Keep retrying if the call is interrupted. 
138 * 139 * The possible results are, 140 * 141 * - child killed successfully 142 * - signal sent but child not killed 143 * - waitpid failed/interrupted 144 */ 145 sleep(2); 146 while ((ret = waitpid(c_pid, &stat_loc, WNOHANG)) < 0) { 147 if (errno != EINTR) { 148 break; 149 } 150 } 151 if ((ret == c_pid) || (errno == ECHILD)) { 152 ret = 0; 153 } else { 154 ret = 1; 155 } 156 } else if (errno == ESRCH) { 157 /* 158 * If the kill did not catch the child then it means the child 159 * exited immediately after the timeout occured. 160 */ 161 ret = 0; 162 } 163 164 /* 165 * make sure not to exit with 205 for any steps other than step1-step4. 166 * Suncluster reconfiguration can't handle it otherwise. 167 */ 168 switch (stepnum) { 169 case MC_STEP1: 170 case MC_STEP2: 171 case MC_STEP3: 172 case MC_STEP4: 173 /* 174 * If the child was killed successfully return 205 for a 175 * new reconfig cycle otherwise send 1 to panic the node. 176 */ 177 if (ret != 0) { 178 md_eprintf(gettext("Could not kill child\n")); 179 exit(1); 180 } else { 181 exit(205); 182 } 183 break; 184 case MC_START: 185 case MC_STOP: 186 case MC_ABORT: 187 case MC_RETURN: 188 default: 189 exit(1); 190 break; 191 } 192 } 193 194 /* 195 * Attempt to load local set. 196 * Returns: 197 * pointer to mdsetname_t for local set (local_sp) is successful. 198 * 0 if failure 199 * if there are no local set mddbs, no error message is printed. 200 * Otherwise, error message is printed so that user 201 * can determine why the local set didn't start. 202 */ 203 mdsetname_t * 204 load_local_set(md_error_t *ep) 205 { 206 mdsetname_t *local_sp = NULL; 207 208 /* Does local set exist? If not, give no error */ 209 if ((local_sp = metasetname(MD_LOCAL_NAME, ep)) == NULL) { 210 return (0); 211 } 212 213 /* 214 * snarf local set 215 * If fails with MDE_DB_NODB, then just return 1 printing 216 * no failure. 217 * Otherwise, print error message, and return 1. 
218 */ 219 if (meta_setup_db_locations(ep) != 0) { 220 if (!(mdismddberror(ep, MDE_DB_NODB))) 221 mde_perror(ep, ""); 222 return (0); 223 } 224 225 /* local set loaded successfully */ 226 return (local_sp); 227 } 228 229 /* 230 * Purpose: Compose a full path name for a metadevice 231 * 232 * On entry: sp - setname pointer 233 * mnum - minor number of metadevice 234 * pathname - pointer to array to return path string 235 * pathlen - max length of pathname array 236 */ 237 static int 238 compose_path(mdsetname_t *sp, int mnum, char *pathname, int pathlen) 239 { 240 int rtn; 241 242 if (MD_MIN2SET(mnum) != sp->setno) { 243 md_eprintf(gettext("minor number 0x%x invalid for set %d\n"), 244 mnum, sp->setno); 245 return (-1); 246 } 247 rtn = snprintf(pathname, pathlen, "/dev/md/%s/rdsk/d%u", 248 sp->setname, (unsigned)MD_MIN2UNIT(mnum)); 249 250 if ((pathname[0] == '\0') || (rtn >= pathlen)) { 251 md_eprintf(gettext( 252 "Could not create path for device %s/d%u\n"), 253 sp->setname, (unsigned)MD_MIN2UNIT(mnum)); 254 return (-1); 255 } 256 return (0); 257 } 258 259 /* 260 * Purpose: Walk through all the devices specified for the given set 261 * and do the action specified in mode 262 */ 263 static int 264 reset_state(uint_t mode, mdsetname_t *sp, char *drivername, md_error_t *ep) 265 { 266 mdnamelist_t *devnlp = NULL; 267 mdnamelist_t *p; 268 mdname_t *devnp = NULL; 269 md_set_mmown_params_t ownpar_p; 270 md_set_mmown_params_t *ownpar = &ownpar_p; 271 md_unit_t *mm; 272 int mirror_dev = 0; 273 mndiskset_membershiplist_t *nl; 274 int cnt; 275 int has_parent; 276 md_mn_get_mir_state_t mir_state_p; 277 md_mn_get_mir_state_t *mir_state = &mir_state_p; 278 279 /* 280 * if we are choosing or resetting the owners then make sure 281 * we are only doing it for mirror devices 282 */ 283 mirror_dev = (strcmp(MD_MIRROR, drivername) == 0); 284 if ((mode & (RESET_OWNER | CHOOSE_OWNER)) && !mirror_dev) { 285 return (-1); 286 } 287 288 /* get a list of all the metadevices for current set 
*/ 289 if (mirror_dev && meta_get_mirror_names(sp, &devnlp, 0, ep) < 0) { 290 mde_perror(ep, gettext("Could not get mirrors for set %s"), 291 sp->setname); 292 return (-1); 293 } else if (meta_get_sp_names(sp, &devnlp, 0, ep) < 0) { 294 mde_perror(ep, gettext( 295 "Could not get soft partitions for set %s"), sp->setname); 296 return (-1); 297 } 298 299 /* If resetting the owner, get the known membership list */ 300 if (mode & RESET_OWNER) { 301 if (meta_read_nodelist(&cnt, &nl, ep)) { 302 mde_perror(ep, "Could not get nodelist"); 303 return (-1); 304 } 305 } 306 307 /* for each metadevice */ 308 for (p = devnlp; (p != NULL); p = p->next) { 309 devnp = p->namep; 310 311 /* 312 * Get the current setting for mirror ABR state and all of the 313 * submirror state and flags from the master node. We only 314 * perform this when going through a 'start' cycle. 315 */ 316 if ((mode & GET_MIRROR_STATE) && mirror_dev) { 317 char *miscname; 318 319 /* 320 * Ensure that we ignore soft-parts that are returned 321 * from the meta_get_mirror_names() call 322 */ 323 if ((miscname = metagetmiscname(devnp, ep)) == NULL) 324 goto out; 325 if (strcmp(miscname, MD_MIRROR) != 0) 326 continue; 327 328 mir_state->mnum = meta_getminor(devnp->dev); 329 MD_SETDRIVERNAME(mir_state, MD_MIRROR, sp->setno); 330 meta_mc_log(MC_LOG4, gettext("Getting mirror state" 331 " for %s/d%u: %s"), sp->setname, 332 (unsigned)MD_MIN2UNIT(mir_state->mnum), 333 meta_print_hrtime(gethrtime() - start_time)); 334 335 if (metaioctl(MD_MN_GET_MIRROR_STATE, mir_state, ep, 336 "MD_MN_GET_MIRROR_STATE") != 0) { 337 mde_perror(ep, gettext("Unable to get " 338 "mirror state for %s/d%u"), sp->setname, 339 (unsigned)MD_MIN2UNIT(mir_state->mnum)); 340 goto out; 341 } else { 342 continue; 343 } 344 } 345 346 /* check if this is a top level metadevice */ 347 if ((mm = meta_get_mdunit(sp, devnp, ep)) == NULL) 348 goto out; 349 if (MD_HAS_PARENT(MD_PARENT(mm))) { 350 has_parent = 1; 351 } else { 352 has_parent = 0; 353 } 354 
Free(mm); 355 356 if (mode & (RESET_OWNER | CHOOSE_OWNER)) { 357 char *miscname; 358 359 /* 360 * we can only do these for mirrors so make sure we 361 * really have a mirror device and not a softpartition 362 * imitating one. meta_get_mirror_names seems to think 363 * softparts on top of a mirror are mirrors! 364 */ 365 if ((miscname = metagetmiscname(devnp, ep)) == NULL) 366 goto out; 367 if (strcmp(miscname, MD_MIRROR) != 0) 368 continue; 369 370 (void) memset(ownpar, 0, sizeof (*ownpar)); 371 ownpar->d.mnum = meta_getminor(devnp->dev); 372 MD_SETDRIVERNAME(ownpar, MD_MIRROR, sp->setno); 373 374 meta_mc_log(MC_LOG4, gettext("Setting owner " 375 "for %s/d%u: %s"), sp->setname, 376 (unsigned)MD_MIN2UNIT(ownpar->d.mnum), 377 meta_print_hrtime(gethrtime() - start_time)); 378 379 /* get the current owner id */ 380 if (metaioctl(MD_MN_GET_MM_OWNER, ownpar, ep, 381 "MD_MN_GET_MM_OWNER") != 0) { 382 mde_perror(ep, gettext("Unable to get " 383 "mirror owner for %s/d%u"), sp->setname, 384 (unsigned)MD_MIN2UNIT(ownpar->d.mnum)); 385 goto out; 386 } 387 } 388 389 if (mode & RESET_OWNER) { 390 if (ownpar->d.owner == MD_MN_MIRROR_UNOWNED) { 391 mdclrerror(ep); 392 continue; 393 } 394 395 /* 396 * reset owner only if the current owner is 397 * not in the membership list 398 * Also kill the resync thread so that when the resync 399 * is started, it will perform an optimized resync 400 * for any resync regions that were dirty when the 401 * current owner left the membership. 
402 */ 403 if (meta_is_member(NULL, ownpar->d.owner, nl) != 1) { 404 if (meta_mn_change_owner(&ownpar, 405 sp->setno, ownpar->d.mnum, 406 MD_MN_MIRROR_UNOWNED, 407 MD_MN_MM_ALLOW_CHANGE) == -1) { 408 md_eprintf(gettext( 409 "Unable to reset mirror owner " 410 "for %s/d%u\n"), sp->setname, 411 (unsigned)MD_MIN2UNIT( 412 ownpar->d.mnum)); 413 goto out; 414 } 415 if (meta_mirror_resync(sp, devnp, 0, ep, 416 MD_RESYNC_KILL_NO_WAIT) != 0) { 417 md_eprintf(gettext( 418 "Unable to kill resync for" 419 " %s/d%u\n"), sp->setname, 420 (unsigned)MD_MIN2UNIT( 421 ownpar->d.mnum)); 422 goto out; 423 } 424 } 425 } 426 427 if (mode & CHOOSE_OWNER) { 428 /* 429 * only orphaned resyncs will have no owner. 430 * if that is the case choose a new owner. Otherwise 431 * re-establish the existing owner. This covers the 432 * case where a node that owned the mirror 433 * reboots/panics and comes back into the cluster before 434 * the reconfig cycle has completed. In this case the 435 * other cluster nodes will have the mirror owner marked 436 * as the rebooted node while it has the owner marked 437 * as 'None'. We have to reestablish the ownership so 438 * that the subsequent resync can continue. 439 */ 440 if (meta_mn_change_owner(&ownpar, sp->setno, 441 ownpar->d.mnum, ownpar->d.owner, 442 MD_MN_MM_CHOOSE_OWNER) == -1) { 443 md_eprintf(gettext("Unable to choose " 444 "mirror owner for %s/d%u\n"), sp->setname, 445 (unsigned)MD_MIN2UNIT(ownpar->d.mnum)); 446 goto out; 447 } 448 } 449 450 /* 451 * For RESET_ABR and UPDATE_ABR - only handle top 452 * level metadevices. 453 */ 454 if (has_parent) 455 continue; 456 457 if (mode & RESET_ABR) { 458 /* 459 * Reset the ABR (application based recovery) 460 * value on all nodes. We are dealing with 461 * the possibility that we have ABR set but the 462 * only node that had the device open with ABR has 463 * left the cluster. 
We simply open and close the 464 * device and if this is the last close in the 465 * cluster, ABR will be cleared on all nodes. 466 */ 467 char *miscname; 468 char name[MD_MAX_CTDLEN]; 469 int mnum, fd; 470 471 name[0] = '\0'; 472 mnum = meta_getminor(devnp->dev); 473 474 /* 475 * Ensure that we don't include soft-parts in the 476 * mirror-only call to RESET_ABR. meta_get_mirror_names 477 * returns a bogus list that includes all soft-parts 478 * built on mirrors. 479 */ 480 if ((miscname = metagetmiscname(devnp, ep)) == NULL) 481 goto out; 482 if (mirror_dev && (strcmp(miscname, MD_MIRROR) != 0)) 483 continue; 484 485 meta_mc_log(MC_LOG4, gettext("Re-setting ABR state " 486 "for %s/d%u: %s"), sp->setname, 487 (unsigned)MD_MIN2UNIT(mnum), 488 meta_print_hrtime(gethrtime() - start_time)); 489 490 /* compose the absolute device path and open it */ 491 if (compose_path(sp, mnum, &name[0], 492 sizeof (name)) != 0) 493 goto out; 494 if ((fd = open(name, O_RDWR, 0)) < 0) { 495 md_perror(gettext("Could not open device %s"), 496 name); 497 continue; 498 } 499 500 (void) close(fd); 501 } 502 503 if (mode & UPDATE_ABR) { 504 /* 505 * Update the ABR value on this node. We obtain the 506 * current ABR state from the master node. 507 */ 508 509 char *miscname; 510 char name[MD_MAX_CTDLEN]; 511 int mnum, fd; 512 volcap_t vc; 513 uint_t tstate; 514 515 name[0] = '\0'; 516 mnum = meta_getminor(devnp->dev); 517 518 /* 519 * Ensure that we don't include soft-parts in the 520 * mirror-only call to UPDATE_ABR. meta_get_mirror_names 521 * returns a bogus list that includes all soft-parts 522 * built on mirrors. 
523 */ 524 if ((miscname = metagetmiscname(devnp, ep)) == NULL) 525 goto out; 526 if (mirror_dev && (strcmp(miscname, MD_MIRROR) != 0)) 527 continue; 528 529 /* Get tstate from Master */ 530 if (meta_mn_send_get_tstate(devnp->dev, &tstate, ep) 531 != 0) 532 continue; 533 /* If not set on the master, nothing to do */ 534 if (!(tstate & MD_ABR_CAP)) 535 continue; 536 537 meta_mc_log(MC_LOG4, gettext("Updating ABR state " 538 "for %s/d%u: %s"), sp->setname, 539 (unsigned)MD_MIN2UNIT(mnum), 540 meta_print_hrtime(gethrtime() - start_time)); 541 542 /* compose the absolute device path and open it */ 543 if (compose_path(sp, mnum, &name[0], 544 sizeof (name)) != 0) 545 goto out; 546 if ((fd = open(name, O_RDWR, 0)) < 0) { 547 md_perror(gettext("Could not open device %s"), 548 name); 549 continue; 550 } 551 552 /* set ABR state */ 553 vc.vc_info = 0; 554 vc.vc_set = 0; 555 if (ioctl(fd, DKIOCGETVOLCAP, &vc) < 0) { 556 /* 557 * Ignore if device does not support this 558 * ioctl 559 */ 560 if ((errno != ENOTTY) && (errno != ENOTSUP)) { 561 md_perror(gettext("Could not get " 562 "ABR/DMR state for device %s"), 563 name); 564 } 565 (void) close(fd); 566 continue; 567 } 568 if (!(vc.vc_info & (DKV_ABR_CAP | DKV_DMR_CAP))) { 569 (void) close(fd); 570 continue; 571 } 572 573 vc.vc_set = DKV_ABR_CAP; 574 if (ioctl(fd, DKIOCSETVOLCAP, &vc) < 0) { 575 md_perror(gettext( 576 "Could not set ABR state for " 577 "device %s"), name); 578 (void) close(fd); 579 goto out; 580 } else { 581 md_eprintf(gettext( 582 "Setting ABR state on device %s\n"), name); 583 } 584 585 (void) close(fd); 586 } 587 } 588 589 /* cleanup */ 590 if (mode & RESET_OWNER) { 591 meta_free_nodelist(nl); 592 } 593 metafreenamelist(devnlp); 594 return (0); 595 596 out: 597 /* cleanup */ 598 if (mode & RESET_OWNER) { 599 meta_free_nodelist(nl); 600 } 601 metafreenamelist(devnlp); 602 return (-1); 603 } 604 605 /* 606 * Print usage message 607 */ 608 static void 609 usage(mdsetname_t *sp, int eval) 610 { 611 (void) 
fprintf(stderr, gettext("usage:" 612 "\t%s [-V version] [-t timeout] [-d level] start localnodeid\n" 613 "\t%s [-V version] [-t timeout] [-d level] step nodelist...\n" 614 "\t%s [-V version] [-t timeout] [-d level] abort | stop\n" 615 "\t%s [-V | -? | -h]\n"), 616 myname, myname, myname, myname); 617 if (!eval) { 618 fprintf(stderr, gettext("\n" 619 "\tValid debug (-d) levels are 1-%d for increasing " 620 "verbosity.\n\tDefault is -d 3.\n\n" 621 "\tValid step values are: return | step1 | step2 | " 622 "step3 | step4\n\n" 623 "\tNodelist is a space-separated list of node id's\n\n"), 624 MAX_DEBUG_LEVEL); 625 } 626 md_exit(sp, eval); 627 } 628 629 /* 630 * Input: Input takes a config step name followed by a list of 631 * possible node id's. 632 * 633 * Returns: 0 - Success 634 * 1 - Fail 635 * Node will be removed from cluster membership 636 * by forcing node to panic. 637 * 205 - Unsuccessful. Start another reconfig cycle. 638 * Problem was encountered that could be fixed by 639 * running another reconfig cycle. 640 * Problem could be a result of a failure to read 641 * the nodelist file or that all work could not be 642 * accomplished in a reconfig step in the amount of 643 * time given so another reconfig cycle is needed in 644 * order to finish the current step. 645 */ 646 int 647 main(int argc, char **argv) 648 { 649 mdsetname_t *sp = NULL; 650 md_error_t status = mdnullerror; 651 md_error_t *ep = &status; 652 set_t max_sets, setno; 653 int c, clust = 0; 654 struct sigaction nsa, osa; 655 struct step_t *step_ptr; 656 mdsetname_t *local_sp = NULL; 657 md_drive_desc *dd; 658 int rval = 0; 659 md_set_desc *sd; 660 mddb_block_parm_t mbp; 661 uint_t debug = 3; /* log upto MC_LOG3 by default */ 662 int version_table_size; 663 mddb_setflags_config_t sf; 664 int ret_val; 665 mddb_config_t cfg; 666 int set_info[MD_MAXSETS]; 667 668 /* 669 * Get the locale set up before calling any other routines 670 * with messages to ouput. 
Just in case we're not in a build 671 * environment, make sure that TEXT_DOMAIN gets set to 672 * something. 673 */ 674 #if !defined(TEXT_DOMAIN) 675 #define TEXT_DOMAIN "SYS_TEST" 676 #endif 677 (void) setlocale(LC_ALL, ""); 678 (void) textdomain(TEXT_DOMAIN); 679 680 if ((clust = sdssc_bind_library()) == SDSSC_ERROR) { 681 md_eprintf(gettext("Interface error with libsds_sc.so\n")); 682 exit(1); 683 } 684 685 if (md_init(argc, argv, 1, 1, ep) != 0 || meta_check_root(ep) != 0) { 686 mde_perror(ep, ""); 687 md_exit(sp, 1); 688 } 689 690 /* 691 * open log and enable libmeta logging. Do it here explicitly 692 * rather than letting md_init() do it because we are not really 693 * a daemon and that is what md_init() opens the log as. 694 */ 695 openlog("metaclust", LOG_CONS, LOG_USER); 696 697 version_table_size = sizeof (version_table) / sizeof (version_table[0]); 698 699 optind = 1; 700 opterr = 0; 701 while ((c = getopt(argc, argv, "hd:V:t:?")) != -1) { 702 switch (c) { 703 case 'h': 704 usage(sp, 0); 705 break; 706 707 case 'd': 708 if (sscanf(optarg, "%u", &debug) != 1) { 709 md_eprintf(gettext("Invalid debug level\n")); 710 md_exit(sp, 1); 711 } else if ((debug < 1) || (debug > MAX_DEBUG_LEVEL)) { 712 debug = min(max(debug, 1), MAX_DEBUG_LEVEL); 713 md_eprintf(gettext("Debug level must be " 714 "between 1 and %d inclusive.\n"), 715 MAX_DEBUG_LEVEL); 716 md_eprintf(gettext("Debug level set to %d.\n"), 717 debug); 718 } 719 break; 720 721 case 'V': 722 version = Strdup(optarg); 723 break; 724 725 case 't': 726 if (sscanf(optarg, "%u", &timeout) != 1) { 727 md_eprintf(gettext("Invalid timeout value\n")); 728 md_exit(sp, 1); 729 } 730 break; 731 732 case '?': 733 if (optopt == '?') { 734 usage(sp, 0); 735 } else if (optopt == 'V') { 736 int i; 737 738 fprintf(stdout, gettext( 739 "%s: Versions Supported:"), myname); 740 for (i = 0; i < version_table_size; i++) { 741 fprintf(stdout, " %s", 742 version_table[i]); 743 } 744 fprintf(stdout, "\n"); 745 md_exit(sp, 0); 746 } 
747 /*FALLTHROUGH*/ 748 749 default: 750 usage(sp, 1); 751 break; 752 } 753 } 754 755 /* initialise the debug level and start time */ 756 setup_mc_log(debug); 757 758 /* 759 * check that the version specified (if any) is supported. 760 */ 761 if (version != NULL) { 762 int i, found = 0; 763 764 for (i = 0; i < version_table_size; i++) { 765 if (strcmp(version, version_table[i]) == 0) { 766 found = 1; 767 break; 768 } 769 } 770 if (!found) { 771 md_eprintf(gettext("Version %s not supported\n"), 772 version); 773 md_exit(sp, 1); 774 } 775 } 776 777 argc -= optind; 778 argv += optind; 779 780 /* parse arguments */ 781 if (argc <= 0) { 782 usage(sp, 1); 783 } 784 785 /* convert the step name to the corresponding number */ 786 step_ptr = bsearch(argv[0], step_table, (sizeof (step_table) / 787 sizeof (step_table[0])), sizeof (step_table[0]), mc_compare); 788 if (step_ptr != NULL) { 789 stepnum = step_ptr->step_num; 790 } 791 792 --argc; 793 ++argv; 794 795 /* set timeout alarm signal, a value of 0 will disable timeout */ 796 if (timeout > 0) { 797 int stat_loc = 0; 798 799 c_pid = fork(); 800 801 if (c_pid == (pid_t)-1) { 802 md_perror(gettext("Unable to fork")); 803 md_exit(sp, 1); 804 } else if (c_pid) { 805 /* parent */ 806 nsa.sa_flags = 0; 807 if (sigfillset(&nsa.sa_mask) < 0) { 808 md_perror(gettext("Unable to set signal mask")); 809 md_exit(sp, 1); 810 } 811 812 nsa.sa_handler = sigalarmhandler; 813 if (sigaction(SIGALRM, &nsa, &osa) == -1) { 814 md_perror(gettext("Unable to set alarm " 815 "handler")); 816 md_exit(sp, 1); 817 } 818 819 (void) alarm(timeout); 820 821 /* 822 * wait for child to exit or timeout to expire. 
823 * keep retrying if the call is interrupted 824 */ 825 while ((ret_val = waitpid(c_pid, &stat_loc, 0)) < 0) { 826 if (errno != EINTR) { 827 break; 828 } 829 } 830 if (ret_val == c_pid) { 831 /* exit with the childs exit value */ 832 exit(WEXITSTATUS(stat_loc)); 833 } else if (errno == ECHILD) { 834 md_exit(sp, 0); 835 } else { 836 perror(myname); 837 md_exit(sp, 1); 838 } 839 } 840 } 841 842 /* 843 * If a timeout value is given, everything from this point onwards is 844 * executed in the child process. 845 */ 846 847 switch (stepnum) { 848 case MC_START: 849 /* 850 * Start Step 851 * 852 * - Suspend all rpc.mdcommd messages 853 */ 854 855 /* expect the local node id to be given only */ 856 if (argc != 1) 857 usage(sp, 1); 858 859 meta_mc_log(MC_LOG2, gettext("Starting Start step: %s"), 860 meta_print_hrtime(0)); 861 862 /* 863 * Does local set exist? If not, exit with 0 864 * since there's no reason to have this node panic if 865 * the local set cannot be started. 866 */ 867 if ((local_sp = load_local_set(ep)) == NULL) { 868 md_exit(local_sp, 0); 869 } 870 871 if ((max_sets = get_max_sets(ep)) == 0) { 872 mde_perror(ep, ""); 873 md_exit(sp, 1); 874 } 875 876 /* start walking through all possible disksets */ 877 for (setno = 1; setno < max_sets; setno++) { 878 if ((sp = metasetnosetname(setno, ep)) == NULL) { 879 if (mdiserror(ep, MDE_NO_SET)) { 880 /* No set for this setno - continue */ 881 mdclrerror(ep); 882 continue; 883 } else { 884 mde_perror(ep, gettext("Unable to " 885 "get set %d information"), setno); 886 md_exit(sp, 1); 887 } 888 } 889 890 /* only check multi-node disksets */ 891 if (!meta_is_mn_set(sp, ep)) { 892 mdclrerror(ep); 893 continue; 894 } 895 896 meta_mc_log(MC_LOG3, gettext("Start - block parse " 897 "messages for set %s: %s"), sp->setname, 898 meta_print_hrtime(gethrtime() - start_time)); 899 900 /* 901 * Mddb parse messages are sent amongst the nodes 902 * in a diskset whenever the locator block or 903 * locator names structure has been 
changed. 904 * A locator block change could occur as a result 905 * of a disk failure during the reconfig cycle, 906 * so block the mddb parse messages while the 907 * rpc.mdcommd is suspended during the reconfig cycle. 908 */ 909 if (s_ownset(sp->setno, ep) == MD_SETOWNER_YES) { 910 (void) memset(&mbp, 0, sizeof (mbp)); 911 mbp.c_setno = setno; 912 mbp.c_blk_flags = MDDB_BLOCK_PARSE; 913 if (metaioctl(MD_MN_MDDB_BLOCK, &mbp, 914 &mbp.c_mde, NULL)) { 915 mdstealerror(ep, &mbp.c_mde); 916 mde_perror(ep, gettext("Could not " 917 "block set %s"), sp->setname); 918 md_exit(sp, 1); 919 } 920 } 921 922 /* suspend commd and spin waiting for drain */ 923 while ((ret_val = mdmn_suspend(setno, 924 MD_COMM_ALL_CLASSES)) == 925 MDE_DS_COMMDCTL_SUSPEND_NYD) { 926 sleep(1); 927 } 928 929 if (ret_val) { 930 md_eprintf(gettext("Could not suspend " 931 "rpc.mdcommd for set %s\n"), sp->setname); 932 md_exit(sp, 1); 933 } 934 935 /* 936 * Set start step flag for set. This is set to indicate 937 * that this node entered the reconfig cycle through 938 * the start step. This is used during the reconfig 939 * cycle to determine whether the node had entered 940 * through the start step or the return step. 941 */ 942 (void) memset(&sf, 0, sizeof (sf)); 943 sf.sf_setno = sp->setno; 944 sf.sf_setflags = MD_SET_MN_START_RC; 945 sf.sf_flags = MDDB_NM_SET; 946 /* Use magic to help protect ioctl against attack. */ 947 sf.sf_magic = MDDB_SETFLAGS_MAGIC; 948 if (metaioctl(MD_MN_SET_SETFLAGS, &sf, 949 &sf.sf_mde, NULL)) { 950 mdstealerror(ep, &sf.sf_mde); 951 mde_perror(ep, gettext("Could not set " 952 "start_step flag for set %s"), sp->setname); 953 md_exit(sp, 1); 954 } 955 956 } 957 958 meta_mc_log(MC_LOG2, gettext("Start step completed: %s"), 959 meta_print_hrtime(gethrtime() - start_time)); 960 961 break; 962 963 case MC_STOP: 964 /* 965 * Stop Step 966 * 967 * - ??? 
968 */ 969 970 /* don't expect any more arguments to follow the step name */ 971 if (argc != 0) 972 usage(sp, 1); 973 974 break; 975 976 case MC_ABORT: 977 /* 978 * Abort Step 979 * 980 * - Abort rpc.mdcommd 981 */ 982 983 /* don't expect any more arguments to follow the step name */ 984 if (argc != 0) 985 usage(sp, 1); 986 987 meta_mc_log(MC_LOG2, gettext("Starting Abort step: %s"), 988 meta_print_hrtime(0)); 989 990 /* 991 * Does local set exist? If not, exit with 0 992 * since there's no reason to have this node panic if 993 * the local set cannot be started. 994 */ 995 if ((local_sp = load_local_set(ep)) == NULL) { 996 md_exit(local_sp, 0); 997 } 998 999 /* 1000 * abort the rpc.mdcommd. The abort is only issued on this node 1001 * meaning that the abort reconfig step is called on this 1002 * node before a panic while the rest of the cluster will 1003 * undergo a reconfig cycle. 1004 * There is no time relation between this node running a 1005 * reconfig abort and the the rest of the cluster 1006 * running a reconfig cycle meaning that this node may 1007 * panic before, during or after the cluster has run 1008 * a reconfig cycle. 1009 */ 1010 mdmn_abort(); 1011 1012 meta_mc_log(MC_LOG2, gettext("Abort step completed: %s"), 1013 meta_print_hrtime(gethrtime() - start_time)); 1014 1015 break; 1016 1017 case MC_RETURN: 1018 /* 1019 * Return Step 1020 * 1021 * - Grab local set lock, issue rpc.mdcommd DRAIN ALL 1022 * and release local set lock. Grabbing the local set 1023 * lock allows any active metaset/metadb commands to 1024 * terminate gracefully and will keep a metaset/metadb 1025 * command from starting until the DRAIN ALL is issued. 1026 * The metaset/metadb commands can issue 1027 * DRAIN ALL/RESUME ALL commands to rpc.mdcommd, 1028 * so the return step must not issue the DRAIN ALL command 1029 * until metaset/metadb have finished or metaset may issue 1030 * a RESUME ALL after this return reconfig step has issued 1031 * the DRAIN ALL command. 
1032 * After this reconfig step has issued the DRAIN_ALL and 1033 * released the local set lock, metaset/metadb will fail 1034 * when attempting to contact the rpc.mdcommd and will 1035 * terminate without making any configuration changes. 1036 * The DRAIN ALL command will keep all other meta* commands 1037 * from running during the reconfig cycle (these commands 1038 * will wait until the rpc.mdcommd is resumed) since the 1039 * reconfig cycle may be changing the diskset configuration. 1040 */ 1041 1042 /* expect the nodelist to follow the step name */ 1043 if (argc < 1) 1044 usage(sp, 1); 1045 1046 meta_mc_log(MC_LOG2, gettext("Starting Return step: %s"), 1047 meta_print_hrtime(0)); 1048 1049 /* 1050 * Does local set exist? If not, exit with 0 1051 * since there's no reason to have this node panic if 1052 * the local set cannot be started. 1053 */ 1054 if ((local_sp = load_local_set(ep)) == NULL) { 1055 md_exit(local_sp, 0); 1056 } 1057 1058 /* 1059 * Suspend any mirror resyncs that are in progress. This 1060 * stops unnecessary timeouts. 1061 */ 1062 meta_mirror_resync_block_all(); 1063 1064 if (meta_lock(local_sp, TRUE, ep) != 0) { 1065 mde_perror(ep, ""); 1066 md_exit(local_sp, 1); 1067 } 1068 1069 /* 1070 * All metaset and metadb commands on this node have now 1071 * terminated gracefully. Now, issue a drain all to 1072 * the rpc.mdcommd. Any meta command issued after the 1073 * drain all will either spin sending the command to the 1074 * master until after the reconfig cycle has finished OR 1075 * will terminate gracefully (metaset/metadb). 
1076 */ 1077 if ((max_sets = get_max_sets(ep)) == 0) { 1078 mde_perror(ep, ""); 1079 md_exit(sp, 1); 1080 } 1081 1082 /* start walking through all possible disksets */ 1083 for (setno = 1; setno < max_sets; setno++) { 1084 if ((sp = metasetnosetname(setno, ep)) == NULL) { 1085 if (mdiserror(ep, MDE_NO_SET)) { 1086 /* No set for this setno - continue */ 1087 mdclrerror(ep); 1088 continue; 1089 } else { 1090 mde_perror(ep, gettext("Unable to " 1091 "get set %d information"), setno); 1092 md_exit(sp, 1); 1093 } 1094 } 1095 1096 /* only check multi-node disksets */ 1097 if (!meta_is_mn_set(sp, ep)) { 1098 mdclrerror(ep); 1099 continue; 1100 } 1101 1102 meta_mc_log(MC_LOG3, gettext("Return - block parse " 1103 "messages for set %s: %s"), sp->setname, 1104 meta_print_hrtime(gethrtime() - start_time)); 1105 1106 /* 1107 * Mddb parse messages are sent amongst the nodes 1108 * in a diskset whenever the locator block or 1109 * locator names structure has been changed. 1110 * A locator block change could occur as a result 1111 * of a disk failure during the reconfig cycle, 1112 * so block the mddb parse messages while the 1113 * rpc.commd is suspended during the reconfig cycle. 
1114 */ 1115 if (s_ownset(sp->setno, ep) == MD_SETOWNER_YES) { 1116 (void) memset(&mbp, 0, sizeof (mbp)); 1117 mbp.c_setno = setno; 1118 mbp.c_blk_flags = MDDB_BLOCK_PARSE; 1119 if (metaioctl(MD_MN_MDDB_BLOCK, &mbp, 1120 &mbp.c_mde, NULL)) { 1121 mdstealerror(ep, &mbp.c_mde); 1122 mde_perror(ep, gettext("Could not " 1123 "block set %s"), sp->setname); 1124 md_exit(sp, 1); 1125 } 1126 } 1127 1128 /* suspend commd and spin waiting for drain */ 1129 while ((ret_val = mdmn_suspend(setno, 1130 MD_COMM_ALL_CLASSES)) == 1131 MDE_DS_COMMDCTL_SUSPEND_NYD) { 1132 sleep(1); 1133 } 1134 1135 if (ret_val) { 1136 md_eprintf(gettext("Could not suspend " 1137 "rpc.mdcommd for set %s\n"), sp->setname); 1138 md_exit(sp, 1); 1139 } 1140 } 1141 /* 1142 * Resume all I/Os for this node for all MN sets in 1143 * case master node had suspended I/Os but panic'd 1144 * before resuming I/Os. In case of failure, exit 1145 * with a 1 since unable to resume I/Os on this node. 1146 */ 1147 if (clnt_mn_susp_res_io(mynode(), 0, MN_RES_IO, ep)) { 1148 mde_perror(ep, gettext( 1149 "Unable to resume I/O on node %s for all sets"), 1150 mynode()); 1151 md_exit(sp, 1); 1152 } 1153 1154 1155 /* 1156 * Can now unlock local set lock. New metaset/metadb 1157 * commands are now held off using drain all. 1158 */ 1159 (void) meta_unlock(local_sp, ep); 1160 1161 meta_mc_log(MC_LOG2, gettext("Return step completed: %s"), 1162 meta_print_hrtime(gethrtime() - start_time)); 1163 1164 break; 1165 1166 case MC_STEP1: 1167 /* 1168 * Step 1 1169 * 1170 * - Populate nodelist file if we are on clustering 1171 * and pick a master node for each MN diskset. 
1172 */ 1173 1174 /* expect the nodelist to follow the step name */ 1175 if (argc < 1) 1176 usage(sp, 1); 1177 1178 meta_mc_log(MC_LOG2, gettext("Starting Step1: %s"), 1179 meta_print_hrtime(0)); 1180 1181 /* Always write nodelist file even if no local set exists */ 1182 if (clust == SDSSC_OKAY) { 1183 /* skip to the nodelist args */ 1184 if (meta_write_nodelist(argc, argv, ep) != 0) { 1185 mde_perror(ep, gettext( 1186 "Could not populate nodelist file")); 1187 md_exit(sp, 1); 1188 } 1189 } 1190 1191 /* 1192 * Does local set exist? If not, exit with 0 1193 * since there's no reason to have this node panic if 1194 * the local set cannot be started. 1195 */ 1196 if ((local_sp = load_local_set(ep)) == NULL) { 1197 md_exit(local_sp, 0); 1198 } 1199 1200 /* 1201 * At this point, all meta* commands are blocked across 1202 * all disksets since the master rpc.mdcommd has drained or 1203 * the master node has died. 1204 * If a metaset or metadb command had been in progress 1205 * at the start of the reconfig cycle, this command has 1206 * either completed or it has been terminated due to 1207 * the death of the master node. 1208 * 1209 * This means that that it is now ok to remove any 1210 * outstanding clnt_locks associated with multinode 1211 * disksets on this node due to a node panic during 1212 * a metaset operation. This allows the routines that 1213 * choose the master to use rpc.metad to determine the 1214 * master of the diskset. 1215 */ 1216 if (clnt_clr_mnsetlock(mynode(), ep) != 0) { 1217 meta_mc_log(MC_LOG2, gettext("Step1 aborted:" 1218 "clear locks failed %s"), 1219 meta_print_hrtime(gethrtime() - start_time)); 1220 md_exit(local_sp, 1); 1221 } 1222 1223 /* 1224 * Call reconfig_choose_master to choose a master for 1225 * each MN diskset, update the nodelist for each diskset 1226 * given the member information and send a reinit message 1227 * to rpc.mdcommd to reload the nodelist. 
1228 */ 1229 rval = meta_reconfig_choose_master(ep); 1230 if (rval == 205) { 1231 /* 1232 * NOTE: Should issue call to reboot remote host that 1233 * is causing the RPC failure. Clustering to 1234 * provide interface in the future. This should 1235 * stop a never-ending set of 205 reconfig cycles. 1236 * Remote host causing failure is stored in 1237 * ep->host if ep is an RPC error. 1238 * if (mdanyrpcerror(ep)) 1239 * reboot (ep->host); 1240 */ 1241 meta_mc_log(MC_LOG2, gettext("Step1 aborted:" 1242 "choose master failure of 205 %s"), 1243 meta_print_hrtime(gethrtime() - start_time)); 1244 md_exit(local_sp, 205); 1245 } else if (rval != 0) { 1246 meta_mc_log(MC_LOG2, gettext("Step1 failure: " 1247 "choose master failure %s"), 1248 meta_print_hrtime(gethrtime() - start_time)); 1249 md_exit(local_sp, 1); 1250 } 1251 1252 meta_mc_log(MC_LOG2, gettext("Step1 completed: %s"), 1253 meta_print_hrtime(gethrtime() - start_time)); 1254 1255 md_exit(local_sp, rval); 1256 break; 1257 1258 case MC_STEP2: 1259 /* 1260 * Step 2 1261 * 1262 * In Step 2, each node walks the list of disksets. If a 1263 * node is a master of a MN diskset, it synchronizes 1264 * the local set USER records for that diskset. 1265 * 1266 * If disks exist in the diskset and there is a joined 1267 * (owner) node in the diskset, the master will also: 1268 * - synchronize the diskset mddbs to the master 1269 * - play the change log 1270 * 1271 * The master node will now attempt to join any unjoined 1272 * nodes that are currently members in the membership list. 1273 */ 1274 1275 /* expect the nodelist to follow the step name */ 1276 if (argc < 1) 1277 usage(sp, 1); 1278 1279 meta_mc_log(MC_LOG2, gettext("Starting Step2: %s"), 1280 meta_print_hrtime(0)); 1281 1282 /* 1283 * Does local set exist? If not, exit with 0 1284 * since there's no reason to have this node panic if 1285 * the local set cannot be started. 
1286 */ 1287 if ((local_sp = load_local_set(ep)) == NULL) { 1288 md_exit(local_sp, 0); 1289 } 1290 1291 if ((max_sets = get_max_sets(ep)) == 0) { 1292 mde_perror(ep, ""); 1293 md_exit(local_sp, 1); 1294 } 1295 1296 /* start walking through all possible disksets */ 1297 for (setno = 1; setno < max_sets; setno++) { 1298 if ((sp = metasetnosetname(setno, ep)) == NULL) { 1299 if (mdiserror(ep, MDE_NO_SET)) { 1300 /* No set for this setno - continue */ 1301 mdclrerror(ep); 1302 continue; 1303 } else if (mdanyrpcerror(ep)) { 1304 /* Fail on RPC failure to self */ 1305 mde_perror(ep, gettext( 1306 "Unable to get information for " 1307 "set number %d"), setno); 1308 md_exit(local_sp, 1); 1309 } else { 1310 mde_perror(ep, gettext( 1311 "Unable to get information for " 1312 "set number %d"), setno); 1313 mdclrerror(ep); 1314 continue; 1315 } 1316 } 1317 1318 if ((sd = metaget_setdesc(sp, ep)) == NULL) { 1319 if (mdanyrpcerror(ep)) { 1320 /* Fail on RPC failure to self */ 1321 mde_perror(ep, gettext( 1322 "Unable to get information for " 1323 "set number %d"), setno); 1324 md_exit(local_sp, 1); 1325 } 1326 mde_perror(ep, gettext("Unable to get set " 1327 "%s desc information"), sp->setname); 1328 mdclrerror(ep); 1329 continue; 1330 } 1331 1332 /* Only check MN disksets */ 1333 if (!(MD_MNSET_DESC(sd))) { 1334 continue; 1335 } 1336 1337 /* All actions in step 2 are driven by master */ 1338 if (!(sd->sd_mn_am_i_master)) { 1339 continue; 1340 } 1341 1342 meta_mc_log(MC_LOG3, gettext("Step2 - begin record " 1343 "synchronization for set %s: %s"), sp->setname, 1344 meta_print_hrtime(gethrtime() - start_time)); 1345 1346 /* 1347 * Synchronize the USER records in the local mddbs 1348 * for hosts that are members. The USER records 1349 * contain set, drive and host information. 
1350 */ 1351 rval = meta_mnsync_user_records(sp, ep); 1352 if (rval != 0) { 1353 mde_perror(ep, gettext( 1354 "Synchronization of user records " 1355 "in set %s failed\n"), sp->setname); 1356 if (rval == 205) { 1357 /* 1358 * NOTE: Should issue call to reboot 1359 * remote host that is causing the RPC 1360 * failure. Clustering to provide 1361 * interface in the future. This 1362 * should stop a never-ending set of 1363 * 205 reconfig cycles. 1364 * Remote host causing failure is 1365 * stored in ep->host if ep is an 1366 * RPC error. 1367 * if (mdanyrpcerror(ep)) 1368 * reboot (ep->host); 1369 */ 1370 md_exit(local_sp, 205); 1371 } else { 1372 md_exit(local_sp, 1); 1373 } 1374 } 1375 1376 /* Reget sd since sync_user_recs may have flushed it */ 1377 if ((sd = metaget_setdesc(sp, ep)) == NULL) { 1378 mde_perror(ep, gettext("Unable to get set " 1379 "%s desc information"), sp->setname); 1380 md_exit(local_sp, 1); 1381 } 1382 1383 dd = metaget_drivedesc(sp, 1384 (MD_BASICNAME_OK | PRINT_FAST), ep); 1385 if (! mdisok(ep)) { 1386 mde_perror(ep, gettext("Unable to get set " 1387 "%s drive information"), sp->setname); 1388 md_exit(local_sp, 1); 1389 } 1390 1391 /* 1392 * No drives in set, continue to next set. 1393 */ 1394 if (dd == NULL) { 1395 /* Done with this set */ 1396 continue; 1397 } 1398 1399 meta_mc_log(MC_LOG3, gettext("Step2 - local set user " 1400 "records completed for set %s: %s"), sp->setname, 1401 meta_print_hrtime(gethrtime() - start_time)); 1402 1403 /* 1404 * Synchronize the diskset mddbs for hosts 1405 * that are members. This may involve 1406 * playing the changelog and writing out 1407 * to the diskset mddbs. 
1408 */ 1409 rval = meta_mnsync_diskset_mddbs(sp, ep); 1410 if (rval != 0) { 1411 mde_perror(ep, gettext( 1412 "Synchronization of diskset mddbs " 1413 "in set %s failed\n"), sp->setname); 1414 meta_mc_log(MC_LOG3, gettext("Step2 - diskset " 1415 "mddb synchronization failed for " 1416 "set %s: %s"), sp->setname, 1417 meta_print_hrtime(gethrtime() - 1418 start_time)); 1419 if (rval == 205) { 1420 /* 1421 * NOTE: Should issue call to reboot 1422 * remote host that is causing the RPC 1423 * failure. Clustering to provide 1424 * interface in the future. This 1425 * should stop a never-ending set of 1426 * 205 reconfig cycles. 1427 * Remote host causing failure is 1428 * stored in ep->host if ep is an 1429 * RPC error. 1430 * if (mdanyrpcerror(ep)) 1431 * reboot (ep->host); 1432 */ 1433 md_exit(local_sp, 205); 1434 } else if (rval == 1) { 1435 continue; 1436 } else { 1437 md_exit(local_sp, 1); 1438 } 1439 } 1440 1441 meta_mc_log(MC_LOG3, gettext("Step2 - diskset mddb " 1442 "synchronization completed for set %s: %s"), 1443 sp->setname, 1444 meta_print_hrtime(gethrtime() - start_time)); 1445 1446 /* Join the starting nodes to the diskset */ 1447 rval = meta_mnjoin_all(sp, ep); 1448 if (rval != 0) { 1449 mde_perror(ep, gettext( 1450 "Join of non-owner (starting) nodes " 1451 "in set %s failed\n"), sp->setname); 1452 meta_mc_log(MC_LOG3, gettext("Step2 - non owner" 1453 "nodes joined for set %s: %s"), 1454 sp->setname, 1455 meta_print_hrtime(gethrtime() - 1456 start_time)); 1457 if (rval == 205) { 1458 /* 1459 * NOTE: Should issue call to reboot 1460 * remote host that is causing the RPC 1461 * failure. Clustering to provide 1462 * interface in the future. This 1463 * should stop a never-ending set of 1464 * 205 reconfig cycles. 1465 * Remote host causing failure is 1466 * stored in ep->host if ep is an 1467 * RPC error. 
1468 * if (mdanyrpcerror(ep)) 1469 * reboot (ep->host); 1470 */ 1471 md_exit(local_sp, 205); 1472 } else { 1473 md_exit(local_sp, 1); 1474 } 1475 } 1476 1477 meta_mc_log(MC_LOG3, gettext("Step2 - non owner nodes " 1478 "joined for set %s: %s"), sp->setname, 1479 meta_print_hrtime(gethrtime() - start_time)); 1480 1481 } 1482 1483 meta_mc_log(MC_LOG2, gettext("Step2 completed: %s"), 1484 meta_print_hrtime(gethrtime() - start_time)); 1485 1486 break; 1487 1488 case MC_STEP3: 1489 /* 1490 * Step 3 1491 * 1492 * For all multinode sets do, 1493 * - Reinitialise rpc.mdcommd 1494 * - Reset mirror owners to null if the current owner is 1495 * no longer in the membership list 1496 */ 1497 1498 /* expect the nodelist to follow the step name */ 1499 if (argc < 1) 1500 usage(sp, 1); 1501 1502 meta_mc_log(MC_LOG2, gettext("Starting Step3: %s"), 1503 meta_print_hrtime(0)); 1504 1505 /* 1506 * Does local set exist? If not, exit with 0 1507 * since there's no reason to have this node panic if 1508 * the local set cannot be started. 
1509 */ 1510 if ((local_sp = load_local_set(ep)) == NULL) { 1511 md_exit(local_sp, 0); 1512 } 1513 1514 /* 1515 * walk through all sets on this node which could include: 1516 * - MN disksets 1517 * - traditional disksets 1518 * - non-existent disksets 1519 * start mirror resync for all MN sets 1520 */ 1521 if ((max_sets = get_max_sets(ep)) == 0) { 1522 mde_perror(ep, ""); 1523 md_exit(local_sp, 1); 1524 } 1525 1526 /* start walking through all possible disksets */ 1527 for (setno = 1; setno < max_sets; setno++) { 1528 if ((sp = metasetnosetname(setno, ep)) == NULL) { 1529 if (mdiserror(ep, MDE_NO_SET)) { 1530 /* No set for this setno - continue */ 1531 mdclrerror(ep); 1532 continue; 1533 } else { 1534 mde_perror(ep, gettext("Unable to " 1535 "get set %d information"), setno); 1536 md_exit(local_sp, 1); 1537 } 1538 } 1539 1540 /* only check multi-node disksets */ 1541 if (!meta_is_mn_set(sp, ep)) { 1542 mdclrerror(ep); 1543 continue; 1544 } 1545 1546 if (meta_lock(sp, TRUE, ep) != 0) { 1547 mde_perror(ep, ""); 1548 md_exit(local_sp, 1); 1549 } 1550 1551 /* If this node isn't joined to set, do nothing */ 1552 if (s_ownset(sp->setno, ep) != MD_SETOWNER_YES) { 1553 if (!mdisok(ep)) { 1554 mde_perror(ep, gettext("Could " 1555 "not get set %s ownership"), 1556 sp->setname); 1557 md_exit(sp, 1); 1558 } 1559 mdclrerror(ep); 1560 meta_unlock(sp, ep); 1561 continue; 1562 } 1563 1564 meta_mc_log(MC_LOG3, gettext("Step3 - begin " 1565 "re-initialising rpc.mdcommd and resetting mirror " 1566 "owners for set %s: %s"), sp->setname, 1567 meta_print_hrtime(gethrtime() - start_time)); 1568 1569 /* reinitialzse rpc.mdcommd with new nodelist */ 1570 if (mdmn_reinit_set(setno)) { 1571 md_eprintf(gettext( 1572 "Could not re-initialise rpc.mdcommd for " 1573 "set %s\n"), sp->setname); 1574 md_exit(sp, 1); 1575 } 1576 1577 (void) memset(&cfg, 0, sizeof (cfg)); 1578 cfg.c_id = 0; 1579 cfg.c_setno = sp->setno; 1580 if (metaioctl(MD_DB_GETDEV, &cfg, &cfg.c_mde, 1581 NULL) != 0) { 1582 
mdstealerror(ep, &cfg.c_mde); 1583 mde_perror(ep, gettext("Could " 1584 "not get set %s information"), 1585 sp->setname); 1586 md_exit(sp, 1); 1587 } 1588 1589 /* Don't do anything else if set is stale */ 1590 if (cfg.c_flags & MDDB_C_STALE) { 1591 meta_unlock(sp, ep); 1592 mdclrerror(ep); 1593 continue; 1594 } 1595 1596 /* reset mirror owners */ 1597 if (reset_state(RESET_OWNER, sp, MD_MIRROR, ep) == -1) { 1598 md_exit(sp, 1); 1599 } 1600 1601 meta_unlock(sp, ep); 1602 1603 meta_mc_log(MC_LOG3, gettext("Step3 - rpc.mdcommd " 1604 "re-initialised and mirror owners reset for " 1605 "set %s: %s"), sp->setname, 1606 meta_print_hrtime(gethrtime() - start_time)); 1607 } 1608 1609 meta_mc_log(MC_LOG2, gettext("Step3 completed: %s"), 1610 meta_print_hrtime(gethrtime() - start_time)); 1611 1612 break; 1613 1614 case MC_STEP4: 1615 /* 1616 * Step 4 1617 * 1618 * For all multinode sets do: 1619 * - Resume the rpc.mdcommd messages. Must resume all 1620 * sets before issuing I/O to any set since an error 1621 * encountered in a commd suspended set could be 1622 * blocked waiting for commd in another set to resume. 1623 * (This happens since the daemon queues service 1624 * all sets). An open of a soft partition causes 1625 * a read of the watermarks during the open. 1626 * - If set is non-writable (not an owner or STALE), then 1627 * continue to next set. 1628 * 1629 * For all multinode sets do, 1630 * - Reset ABR states for all mirrors, ie clear ABR if not 1631 * open on any node. 1632 * - Reset ABR states for all soft partitions, ie clear ABR if 1633 * not open on any node. 1634 * - For all slave nodes that have entered through the start 1635 * step, update the ABR state to that of the master and 1636 * get the submirror state from the master 1637 * - meta_lock set 1638 * - Resync all mirrors 1639 * - unlock meta_lock for this set. 1640 * - Choose a new owner for any orphaned resyncs 1641 * 1642 * There is one potential issue here. 
when concurrently 1643 * resetting and updating the ABR state. If the master has ABR 1644 * set, but should no longer have because the only node that 1645 * had the metadevice open and had ABR set has paniced, the 1646 * master will send a message to all nodes to clear the ABR 1647 * state. Meanwhile any node that has come through the 1648 * start step will get tstate from the master and will update 1649 * ABR if it was set in tstate. So, we appear to have a problem 1650 * if the following sequence occurs:- 1651 * - The slave gets tstate with ABR set 1652 * - The master sends a message to clear ABR 1653 * - The slave updates ABR with the value it got from tstate. 1654 * We now have the master with ABR clear and the slave with ABR 1655 * set. Fortunately, having set ABR, the slave will close the 1656 * metadevice after setting ABR and as there are no nodes with 1657 * the device open, the close will send a message to clear ABR 1658 * on all nodes. So, the nodes will all have ABR unset. 1659 */ 1660 1661 /* expect the nodelist to follow the step name */ 1662 if (argc < 1) 1663 usage(sp, 1); 1664 1665 meta_mc_log(MC_LOG2, gettext("Starting Step4: %s"), 1666 meta_print_hrtime(0)); 1667 1668 /* 1669 * Does local set exist? If not, exit with 0 1670 * since there's no reason to have this node panic if 1671 * the local set cannot be started. 
1672 */ 1673 if ((local_sp = load_local_set(ep)) == NULL) { 1674 md_exit(local_sp, 0); 1675 } 1676 1677 /* 1678 * walk through all sets on this node which could include: 1679 * - MN disksets 1680 * - traditional disksets 1681 * - non-existent disksets 1682 * start mirror resync for all MN sets 1683 */ 1684 if ((max_sets = get_max_sets(ep)) == 0) { 1685 mde_perror(ep, ""); 1686 md_exit(local_sp, 1); 1687 } 1688 1689 /* Clear set_info structure */ 1690 for (setno = 1; setno < max_sets; setno++) { 1691 set_info[setno] = 0; 1692 } 1693 1694 /* start walking through all possible disksets */ 1695 for (setno = 1; setno < max_sets; setno++) { 1696 if ((sp = metasetnosetname(setno, ep)) == NULL) { 1697 if (mdiserror(ep, MDE_NO_SET)) { 1698 /* No set for this setno - continue */ 1699 mdclrerror(ep); 1700 continue; 1701 } else { 1702 mde_perror(ep, gettext("Unable to " 1703 "get set %d information"), setno); 1704 md_exit(local_sp, 1); 1705 } 1706 } 1707 1708 if ((sd = metaget_setdesc(sp, ep)) == NULL) { 1709 mde_perror(ep, gettext("Unable to get set " 1710 "%s desc information"), sp->setname); 1711 mdclrerror(ep); 1712 continue; 1713 } 1714 1715 /* only check multi-node disksets */ 1716 if (!meta_is_mn_set(sp, ep)) { 1717 mdclrerror(ep); 1718 continue; 1719 } 1720 1721 set_info[setno] |= SET_INFO_MN; 1722 1723 /* 1724 * If not an owner (all mddbs failed) or stale 1725 * (< 50% mddbs operational), then set is 1726 * non-writable so just resume commd and 1727 * unblock mddb messages. 1728 */ 1729 mdclrerror(ep); 1730 if (s_ownset(sp->setno, ep) != MD_SETOWNER_YES) { 1731 set_info[setno] |= SET_INFO_NO_WR; 1732 } 1733 if (!mdisok(ep)) { 1734 mde_perror(ep, gettext("Could " 1735 "not get set %s ownership"), 1736 sp->setname); 1737 md_exit(local_sp, 1); 1738 } 1739 /* Set is owned - is it stale? 
*/ 1740 if (!set_info[setno] & SET_INFO_NO_WR) { 1741 (void) memset(&cfg, 0, sizeof (cfg)); 1742 cfg.c_id = 0; 1743 cfg.c_setno = sp->setno; 1744 if (metaioctl(MD_DB_GETDEV, &cfg, &cfg.c_mde, 1745 NULL) != 0) { 1746 mdstealerror(ep, &cfg.c_mde); 1747 mde_perror(ep, gettext("Could " 1748 "not get set %s information"), 1749 sp->setname); 1750 md_exit(local_sp, 1); 1751 } 1752 if (cfg.c_flags & MDDB_C_STALE) { 1753 set_info[setno] |= SET_INFO_NO_WR; 1754 } 1755 } 1756 1757 /* resume rpc.mdcommd */ 1758 if (mdmn_resume(setno, MD_COMM_ALL_CLASSES, 0)) { 1759 md_eprintf(gettext("Unable to resume " 1760 "rpc.mdcommd for set %s\n"), sp->setname); 1761 md_exit(local_sp, 1); 1762 } 1763 meta_ping_mnset(setno); 1764 1765 /* Unblock mddb parse messages */ 1766 if (s_ownset(sp->setno, ep) == MD_SETOWNER_YES) { 1767 (void) memset(&mbp, 0, sizeof (mbp)); 1768 mbp.c_setno = setno; 1769 mbp.c_blk_flags = MDDB_UNBLOCK_PARSE; 1770 if (metaioctl(MD_MN_MDDB_BLOCK, &mbp, 1771 &mbp.c_mde, NULL)) { 1772 mdstealerror(ep, &mbp.c_mde); 1773 mde_perror(ep, gettext("Could not " 1774 "unblock set %s"), sp->setname); 1775 md_exit(local_sp, 1); 1776 } 1777 } 1778 meta_mc_log(MC_LOG3, gettext("Step4 - rpc.mdcommd " 1779 "resumed and messages unblocked for set %s: %s"), 1780 sp->setname, 1781 meta_print_hrtime(gethrtime() - start_time)); 1782 } 1783 1784 for (setno = 1; setno < max_sets; setno++) { 1785 int start_step; 1786 1787 /* Skip traditional disksets. */ 1788 if ((set_info[setno] & SET_INFO_MN) == 0) 1789 continue; 1790 1791 /* 1792 * If already determined that this set is 1793 * a non-writable set, then just continue 1794 * to next set since there's nothing else 1795 * to do for a non-writable set. 
1796 */ 1797 if (set_info[setno] & SET_INFO_NO_WR) 1798 continue; 1799 1800 if ((sp = metasetnosetname(setno, ep)) == NULL) { 1801 if (mdiserror(ep, MDE_NO_SET)) { 1802 /* No set for this setno - continue */ 1803 mdclrerror(ep); 1804 continue; 1805 } else { 1806 mde_perror(ep, gettext("Unable to " 1807 "get set %d information"), setno); 1808 md_exit(local_sp, 1); 1809 } 1810 } 1811 1812 if ((sd = metaget_setdesc(sp, ep)) == NULL) { 1813 mde_perror(ep, gettext("Unable to get set " 1814 "%s desc information"), sp->setname); 1815 mdclrerror(ep); 1816 continue; 1817 } 1818 1819 /* See if this node came through the start step */ 1820 (void) memset(&sf, 0, sizeof (sf)); 1821 sf.sf_setno = sp->setno; 1822 sf.sf_flags = MDDB_NM_GET; 1823 /* Use magic to help protect ioctl against attack. */ 1824 sf.sf_magic = MDDB_SETFLAGS_MAGIC; 1825 if (metaioctl(MD_MN_SET_SETFLAGS, &sf, 1826 &sf.sf_mde, NULL)) { 1827 mdstealerror(ep, &sf.sf_mde); 1828 mde_perror(ep, gettext("Could not get " 1829 "start_step flag for set %s"), sp->setname); 1830 md_exit(local_sp, 1); 1831 } 1832 start_step = 1833 (sf.sf_setflags & MD_SET_MN_START_RC)? 1: 0; 1834 1835 /* 1836 * We can now reset the start_step flag for the set 1837 * if it was already set. 1838 */ 1839 if (start_step) { 1840 (void) memset(&sf, 0, sizeof (sf)); 1841 sf.sf_setno = sp->setno; 1842 sf.sf_setflags = MD_SET_MN_START_RC; 1843 sf.sf_flags = MDDB_NM_RESET; 1844 /* 1845 * Use magic to help protect ioctl 1846 * against attack. 
1847 */ 1848 sf.sf_magic = MDDB_SETFLAGS_MAGIC; 1849 if (metaioctl(MD_MN_SET_SETFLAGS, &sf, 1850 &sf.sf_mde, NULL)) { 1851 mdstealerror(ep, &sf.sf_mde); 1852 mde_perror(ep, 1853 gettext("Could not reset " 1854 "start_step flag for set %s"), 1855 sp->setname); 1856 } 1857 } 1858 1859 meta_mc_log(MC_LOG3, gettext("Step4 - begin setting " 1860 "ABR state and restarting io's for " 1861 "set %s: %s"), sp->setname, 1862 meta_print_hrtime(gethrtime() - start_time)); 1863 1864 1865 /* 1866 * If we are not the master and we have come through 1867 * the start step, we must update the ABR states 1868 * for mirrors and soft partitions. Also the submirror 1869 * states need to be synchronised so that we see the 1870 * same status as other previously joined members. 1871 * This _must_ be done before starting the resync. 1872 */ 1873 if (!(sd->sd_mn_am_i_master) && start_step) { 1874 if (reset_state(GET_MIRROR_STATE, sp, MD_MIRROR, 1875 ep) == -1) { 1876 md_exit(local_sp, 1); 1877 } 1878 if (reset_state(UPDATE_ABR, sp, MD_SP, 1879 ep) == -1) { 1880 md_exit(local_sp, 1); 1881 } 1882 /* 1883 * Mark the fact that we've got the mirror 1884 * state. This allows the resync thread to 1885 * determine if _it_ needs to issue this. This 1886 * can happen if a node is added to a set after 1887 * a reconfig cycle has completed. 1888 */ 1889 (void) memset(&sf, 0, sizeof (sf)); 1890 sf.sf_setno = sp->setno; 1891 sf.sf_setflags = MD_SET_MN_MIR_STATE_RC; 1892 sf.sf_flags = MDDB_NM_SET; 1893 /* 1894 * Use magic to help protect ioctl 1895 * against attack. 
1896 */ 1897 sf.sf_magic = MDDB_SETFLAGS_MAGIC; 1898 if (metaioctl(MD_MN_SET_SETFLAGS, &sf, 1899 &sf.sf_mde, NULL)) { 1900 mdstealerror(ep, &sf.sf_mde); 1901 mde_perror(ep, 1902 gettext("Could not set " 1903 "submirror state flag for set %s"), 1904 sp->setname); 1905 } 1906 } 1907 1908 /* 1909 * All remaining actions are only performed by the 1910 * master 1911 */ 1912 if (!(sd->sd_mn_am_i_master)) { 1913 if (meta_lock(sp, TRUE, ep) != 0) { 1914 mde_perror(ep, ""); 1915 md_exit(local_sp, 1); 1916 } 1917 meta_mirror_resync_unblock(sp); 1918 meta_unlock(sp, ep); 1919 continue; 1920 } 1921 1922 /* 1923 * If the master came through the start step, this 1924 * implies that all of the nodes must have done the 1925 * same and hence there can be no applications 1926 * running. Hence no need to reset ABR 1927 */ 1928 if (!start_step) { 1929 /* Reset ABR state for mirrors */ 1930 if (reset_state(RESET_ABR, sp, MD_MIRROR, 1931 ep) == -1) { 1932 md_exit(local_sp, 1); 1933 } 1934 /* ...and now the same for soft partitions */ 1935 if (reset_state(RESET_ABR, sp, MD_SP, 1936 ep) == -1) { 1937 md_exit(local_sp, 1); 1938 } 1939 } 1940 1941 /* 1942 * choose owners for orphaned resyncs and reset 1943 * non-orphaned resyncs so that an owner node that 1944 * reboots will restart the resync if needed. 1945 */ 1946 if (reset_state(CHOOSE_OWNER, sp, MD_MIRROR, ep) == -1) 1947 md_exit(local_sp, 1); 1948 1949 /* 1950 * Must unlock set lock before meta_mirror_resync_all 1951 * sends a message to run the metasync command 1952 * which also grabs the meta_lock. 
1953 */ 1954 if (meta_lock(sp, TRUE, ep) != 0) { 1955 mde_perror(ep, ""); 1956 md_exit(local_sp, 1); 1957 } 1958 meta_mirror_resync_unblock(sp); 1959 meta_unlock(sp, ep); 1960 1961 /* resync all mirrors in set */ 1962 if (meta_mirror_resync_all(sp, 0, ep) != 0) { 1963 mde_perror(ep, gettext("Mirror resyncs " 1964 "failed for set %s"), sp->setname); 1965 md_exit(local_sp, 1); 1966 } 1967 1968 meta_mc_log(MC_LOG3, gettext("Step4 - io's restarted " 1969 "for set %s: %s"), sp->setname, 1970 meta_print_hrtime(gethrtime() - start_time)); 1971 } 1972 1973 meta_mc_log(MC_LOG2, gettext("Step4 completed: %s"), 1974 meta_print_hrtime(gethrtime() - start_time)); 1975 1976 break; 1977 1978 default: 1979 usage(sp, 1); 1980 break; 1981 } 1982 1983 md_exit(sp, 0); 1984 /* NOTREACHED */ 1985 return (0); 1986 } 1987