/* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #include #include #include #include #include #include #include #include #define MY_VERSION "1.0" /* the highest supported version */ #define MAX_DEBUG_LEVEL 5 /* maximum verbosity level */ #define RESET_OWNER 0x0001 #define CHOOSE_OWNER 0x0002 #define RESET_ABR 0x0004 #define UPDATE_ABR 0x0008 #define GET_MIRROR_STATE 0x0010 #define SET_INFO_NO_WR 0x0002 #define SET_INFO_MN 0x0004 /* * This table defines all the metaclust reconfig steps we understand */ typedef enum stpnum { MC_UNK = 0, MC_START, MC_STOP, MC_ABORT, MC_RETURN, MC_STEP1, MC_STEP2, MC_STEP3, MC_STEP4 } stepnum_t; /* * Structure for step_name -> step_number mapping */ struct step_t { char *step_nam; stepnum_t step_num; }; /* * Step name to step number mapping table * This table MUST be sorted alphabetically in ascending order of step name */ static struct step_t step_table[] = { { "abort", MC_ABORT }, { "return", MC_RETURN }, { "start", MC_START }, { "step1", MC_STEP1 }, { "step2", MC_STEP2 }, { "step3", MC_STEP3 }, { "step4", MC_STEP4 }, { "stop", MC_STOP } }; /* * If support for a different version is added, the new version number should * be appended to the version_table below. This list will be searched to * determine if a version requested via the -V option is supported or not. */ static char *version_table[] = { MY_VERSION }; uint_t timeout = 0; /* disable timeout by default */ char *version = MY_VERSION; /* use latest version by default */ int stepnum = MC_UNK; /* reconfiguration step number */ pid_t c_pid; /* child process id */ /* * Binary search comparison routine */ static int mc_compare(const void *stp1, const void *stp2) { return (strcmp((const char *)stp1, ((const struct step_t *)stp2)->step_nam)); } /* * Timeout expiry alarm signal handler */ /*ARGSUSED*/ static void sigalarmhandler(int sig) { int i, n, ret, stat_loc = 0; FILE *pgcore; char corecmd[256]; n = sizeof (step_table) / sizeof (step_table[0]); for (i = 0; i < n; i++) { if (stepnum == step_table[i].step_num) break; } assert(i != n); meta_mc_log(MC_LOG1, gettext("Timeout expired in %s: %s"), step_table[i].step_nam, meta_print_hrtime(gethrtime() - start_time)); /* * See what the child was actually doing when the timeout expired. * A core-dump of this would be _really_ good, so let's just * try a 'gcore -g c_pid' and hope */ (void) memset(corecmd, 0, sizeof (corecmd)); (void) snprintf(corecmd, sizeof (corecmd), "/bin/gcore -g %d >/dev/null 2>&1", (int)c_pid); pgcore = popen(corecmd, "r"); if (pgcore == NULL) { meta_mc_log(MC_LOG1, gettext("Could not grab core for pid %s"), c_pid); } else { (void) pclose(pgcore); } if ((ret = kill(c_pid, SIGKILL)) == 0) { /* * The child will wait forever until the status is retrieved * so get it now. Keep retrying if the call is interrupted. * * The possible results are, * * - child killed successfully * - signal sent but child not killed * - waitpid failed/interrupted */ (void) sleep(2); while ((ret = waitpid(c_pid, &stat_loc, WNOHANG)) < 0) { if (errno != EINTR) { break; } } if ((ret == c_pid) || (errno == ECHILD)) { ret = 0; } else { ret = 1; } } else if (errno == ESRCH) { /* * If the kill did not catch the child then it means the child * exited immediately after the timeout occured. */ ret = 0; } /* * make sure not to exit with 205 for any steps other than step1-step4. * Suncluster reconfiguration can't handle it otherwise. */ switch (stepnum) { case MC_STEP1: case MC_STEP2: case MC_STEP3: case MC_STEP4: /* * If the child was killed successfully return 205 for a * new reconfig cycle otherwise send 1 to panic the node. */ if (ret != 0) { md_eprintf(gettext("Could not kill child\n")); exit(1); } else { exit(205); } break; case MC_START: case MC_STOP: case MC_ABORT: case MC_RETURN: default: exit(1); break; } } /* * Attempt to load local set. * Returns: * pointer to mdsetname_t for local set (local_sp) is successful. * 0 if failure * if there are no local set mddbs, no error message is printed. * Otherwise, error message is printed so that user * can determine why the local set didn't start. */ mdsetname_t * load_local_set(md_error_t *ep) { mdsetname_t *local_sp = NULL; /* Does local set exist? If not, give no error */ if ((local_sp = metasetname(MD_LOCAL_NAME, ep)) == NULL) { return (0); } /* * snarf local set * If fails with MDE_DB_NODB, then just return 1 printing * no failure. * Otherwise, print error message, and return 1. */ if (meta_setup_db_locations(ep) != 0) { if (!(mdismddberror(ep, MDE_DB_NODB))) mde_perror(ep, ""); return (0); } /* local set loaded successfully */ return (local_sp); } /* * Purpose: Compose a full path name for a metadevice * * On entry: sp - setname pointer * mnum - minor number of metadevice * pathname - pointer to array to return path string * pathlen - max length of pathname array */ static int compose_path(mdsetname_t *sp, int mnum, char *pathname, int pathlen) { int rtn; mdname_t *np; md_error_t status = mdnullerror; if (MD_MIN2SET(mnum) != sp->setno) { md_eprintf(gettext("minor number 0x%x invalid for set %d\n"), mnum, sp->setno); return (-1); } if ((np = metamnumname(&sp, mnum, 0, &status)) == NULL) { return (-1); } rtn = snprintf(pathname, pathlen, "%s", np->rname); if ((pathname[0] == '\0') || (rtn >= pathlen)) { md_eprintf(gettext( "Could not create path for device %s\n"), get_mdname(sp, mnum)); return (-1); } return (0); } /* * Purpose: Walk through all the devices specified for the given set * and do the action specified in mode */ static int reset_state(uint_t mode, mdsetname_t *sp, char *drivername, md_error_t *ep) { mdnamelist_t *devnlp = NULL; mdnamelist_t *p; mdname_t *devnp = NULL; md_set_mmown_params_t ownpar_p; md_set_mmown_params_t *ownpar = &ownpar_p; md_unit_t *mm; int mirror_dev = 0; mndiskset_membershiplist_t *nl; int cnt; int has_parent; md_mn_get_mir_state_t mir_state_p; md_mn_get_mir_state_t *mir_state = &mir_state_p; /* * if we are choosing or resetting the owners then make sure * we are only doing it for mirror devices */ mirror_dev = (strcmp(MD_MIRROR, drivername) == 0); if ((mode & (RESET_OWNER | CHOOSE_OWNER)) && !mirror_dev) { return (-1); } /* get a list of all the metadevices for current set */ if (mirror_dev && meta_get_mirror_names(sp, &devnlp, 0, ep) < 0) { mde_perror(ep, gettext("Could not get mirrors for set %s"), sp->setname); return (-1); } else if (meta_get_sp_names(sp, &devnlp, 0, ep) < 0) { mde_perror(ep, gettext( "Could not get soft partitions for set %s"), sp->setname); return (-1); } /* If resetting the owner, get the known membership list */ if (mode & RESET_OWNER) { if (meta_read_nodelist(&cnt, &nl, ep)) { mde_perror(ep, "Could not get nodelist"); return (-1); } } /* for each metadevice */ for (p = devnlp; (p != NULL); p = p->next) { devnp = p->namep; /* * Get the current setting for mirror ABR state and all of the * submirror state and flags from the master node. We only * perform this when going through a 'start' cycle. */ if ((mode & GET_MIRROR_STATE) && mirror_dev) { char *miscname; /* * Ensure that we ignore soft-parts that are returned * from the meta_get_mirror_names() call */ if ((miscname = metagetmiscname(devnp, ep)) == NULL) goto out; if (strcmp(miscname, MD_MIRROR) != 0) continue; mir_state->mnum = meta_getminor(devnp->dev); MD_SETDRIVERNAME(mir_state, MD_MIRROR, sp->setno); meta_mc_log(MC_LOG4, gettext("Getting mirror state" " for %s: %s"), get_mdname(sp, mir_state->mnum), meta_print_hrtime(gethrtime() - start_time)); if (metaioctl(MD_MN_GET_MIRROR_STATE, mir_state, ep, "MD_MN_GET_MIRROR_STATE") != 0) { mde_perror(ep, gettext("Unable to get " "mirror state for %s"), get_mdname(sp, mir_state->mnum)); goto out; } else { continue; } } /* check if this is a top level metadevice */ if ((mm = meta_get_mdunit(sp, devnp, ep)) == NULL) goto out; if (MD_HAS_PARENT(MD_PARENT(mm))) { has_parent = 1; } else { has_parent = 0; } Free(mm); if (mode & (RESET_OWNER | CHOOSE_OWNER)) { char *miscname; /* * we can only do these for mirrors so make sure we * really have a mirror device and not a softpartition * imitating one. meta_get_mirror_names seems to think * softparts on top of a mirror are mirrors! */ if ((miscname = metagetmiscname(devnp, ep)) == NULL) goto out; if (strcmp(miscname, MD_MIRROR) != 0) continue; (void) memset(ownpar, 0, sizeof (*ownpar)); ownpar->d.mnum = meta_getminor(devnp->dev); MD_SETDRIVERNAME(ownpar, MD_MIRROR, sp->setno); meta_mc_log(MC_LOG4, gettext("Setting owner " "for %s: %s"), get_mdname(sp, ownpar->d.mnum), meta_print_hrtime(gethrtime() - start_time)); /* get the current owner id */ if (metaioctl(MD_MN_GET_MM_OWNER, ownpar, ep, "MD_MN_GET_MM_OWNER") != 0) { mde_perror(ep, gettext("Unable to get " "mirror owner for %s"), get_mdname(sp, ownpar->d.mnum)); goto out; } } if (mode & RESET_OWNER) { if (ownpar->d.owner == MD_MN_MIRROR_UNOWNED) { mdclrerror(ep); continue; } /* * reset owner only if the current owner is * not in the membership list * Also kill the resync thread so that when the resync * is started, it will perform an optimized resync * for any resync regions that were dirty when the * current owner left the membership. */ if (meta_is_member(NULL, ownpar->d.owner, nl) != 1) { if (meta_mn_change_owner(&ownpar, sp->setno, ownpar->d.mnum, MD_MN_MIRROR_UNOWNED, MD_MN_MM_ALLOW_CHANGE) == -1) { md_eprintf(gettext( "Unable to reset mirror owner " "for %s\n"), get_mdname(sp, ownpar->d.mnum)); goto out; } if (meta_mirror_resync(sp, devnp, 0, ep, MD_RESYNC_KILL_NO_WAIT) != 0) { md_eprintf(gettext( "Unable to kill resync for" " %s\n"), get_mdname(sp, ownpar->d.mnum)); goto out; } } } if (mode & CHOOSE_OWNER) { /* * only orphaned resyncs will have no owner. * if that is the case choose a new owner. Otherwise * re-establish the existing owner. This covers the * case where a node that owned the mirror * reboots/panics and comes back into the cluster before * the reconfig cycle has completed. In this case the * other cluster nodes will have the mirror owner marked * as the rebooted node while it has the owner marked * as 'None'. We have to reestablish the ownership so * that the subsequent resync can continue. */ if (meta_mn_change_owner(&ownpar, sp->setno, ownpar->d.mnum, ownpar->d.owner, MD_MN_MM_CHOOSE_OWNER) == -1) { md_eprintf(gettext("Unable to choose " "mirror owner for %s\n"), get_mdname(sp, ownpar->d.mnum)); goto out; } } /* * For RESET_ABR and UPDATE_ABR - only handle top * level metadevices. */ if (has_parent) continue; if (mode & RESET_ABR) { /* * Reset the ABR (application based recovery) * value on all nodes. We are dealing with * the possibility that we have ABR set but the * only node that had the device open with ABR has * left the cluster. We simply open and close the * device and if this is the last close in the * cluster, ABR will be cleared on all nodes. */ char *miscname; char name[MAXPATHLEN]; int mnum, fd; name[0] = '\0'; mnum = meta_getminor(devnp->dev); /* * Ensure that we don't include soft-parts in the * mirror-only call to RESET_ABR. meta_get_mirror_names * returns a bogus list that includes all soft-parts * built on mirrors. */ if ((miscname = metagetmiscname(devnp, ep)) == NULL) goto out; if (mirror_dev && (strcmp(miscname, MD_MIRROR) != 0)) continue; meta_mc_log(MC_LOG4, gettext("Re-setting ABR state " "for %s: %s"), get_mdname(sp, mnum), meta_print_hrtime(gethrtime() - start_time)); /* compose the absolute device path and open it */ if (compose_path(sp, mnum, &name[0], sizeof (name)) != 0) goto out; if ((fd = open(name, O_RDWR, 0)) < 0) { md_perror(gettext("Could not open device %s"), name); continue; } (void) close(fd); } if (mode & UPDATE_ABR) { /* * Update the ABR value on this node. We obtain the * current ABR state from the master node. */ char *miscname; char name[MAXPATHLEN]; int mnum, fd; volcap_t vc; uint_t tstate; name[0] = '\0'; mnum = meta_getminor(devnp->dev); /* * Ensure that we don't include soft-parts in the * mirror-only call to UPDATE_ABR. meta_get_mirror_names * returns a bogus list that includes all soft-parts * built on mirrors. */ if ((miscname = metagetmiscname(devnp, ep)) == NULL) goto out; if (mirror_dev && (strcmp(miscname, MD_MIRROR) != 0)) continue; /* Get tstate from Master */ if (meta_mn_send_get_tstate(devnp->dev, &tstate, ep) != 0) continue; /* If not set on the master, nothing to do */ if (!(tstate & MD_ABR_CAP)) continue; meta_mc_log(MC_LOG4, gettext("Updating ABR state " "for %s: %s"), get_mdname(sp, mnum), meta_print_hrtime(gethrtime() - start_time)); /* compose the absolute device path and open it */ if (compose_path(sp, mnum, &name[0], sizeof (name)) != 0) goto out; if ((fd = open(name, O_RDWR, 0)) < 0) { md_perror(gettext("Could not open device %s"), name); continue; } /* set ABR state */ vc.vc_info = 0; vc.vc_set = 0; if (ioctl(fd, DKIOCGETVOLCAP, &vc) < 0) { /* * Ignore if device does not support this * ioctl */ if ((errno != ENOTTY) && (errno != ENOTSUP)) { md_perror(gettext("Could not get " "ABR/DMR state for device %s"), name); } (void) close(fd); continue; } if (!(vc.vc_info & (DKV_ABR_CAP | DKV_DMR_CAP))) { (void) close(fd); continue; } vc.vc_set = DKV_ABR_CAP; if (ioctl(fd, DKIOCSETVOLCAP, &vc) < 0) { md_perror(gettext( "Could not set ABR state for " "device %s"), name); (void) close(fd); goto out; } else { md_eprintf(gettext( "Setting ABR state on device %s\n"), name); } (void) close(fd); } } /* cleanup */ if (mode & RESET_OWNER) { meta_free_nodelist(nl); } metafreenamelist(devnlp); return (0); out: /* cleanup */ if (mode & RESET_OWNER) { meta_free_nodelist(nl); } metafreenamelist(devnlp); return (-1); } /* * Print usage message */ static void usage(mdsetname_t *sp, int eval) { (void) fprintf(stderr, gettext("usage:" "\t%s [-V version] [-t timeout] [-d level] start localnodeid\n" "\t%s [-V version] [-t timeout] [-d level] step nodelist...\n" "\t%s [-V version] [-t timeout] [-d level] abort | stop\n" "\t%s [-V | -? | -h]\n"), myname, myname, myname, myname); if (!eval) { (void) fprintf(stderr, gettext("\n" "\tValid debug (-d) levels are 1-%d for increasing " "verbosity.\n\tDefault is -d 3.\n\n" "\tValid step values are: return | step1 | step2 | " "step3 | step4\n\n" "\tNodelist is a space-separated list of node id's\n\n"), MAX_DEBUG_LEVEL); } md_exit(sp, eval); } /* * Input: Input takes a config step name followed by a list of * possible node id's. * * Returns: 0 - Success * 1 - Fail * Node will be removed from cluster membership * by forcing node to panic. * 205 - Unsuccessful. Start another reconfig cycle. * Problem was encountered that could be fixed by * running another reconfig cycle. * Problem could be a result of a failure to read * the nodelist file or that all work could not be * accomplished in a reconfig step in the amount of * time given so another reconfig cycle is needed in * order to finish the current step. */ int main(int argc, char **argv) { mdsetname_t *sp = NULL; md_error_t status = mdnullerror; md_error_t *ep = &status; set_t max_sets, setno; int c, clust = 0; struct sigaction nsa, osa; struct step_t *step_ptr; mdsetname_t *local_sp = NULL; md_drive_desc *dd; int rval = 0; md_set_desc *sd; mddb_block_parm_t mbp; uint_t debug = 3; /* log upto MC_LOG3 by default */ int version_table_size; mddb_setflags_config_t sf; int ret_val; mddb_config_t cfg; int set_info[MD_MAXSETS]; long commd_timeout = 0; /* * Get the locale set up before calling any other routines * with messages to ouput. Just in case we're not in a build * environment, make sure that TEXT_DOMAIN gets set to * something. */ #if !defined(TEXT_DOMAIN) #define TEXT_DOMAIN "SYS_TEST" #endif (void) setlocale(LC_ALL, ""); (void) textdomain(TEXT_DOMAIN); if ((clust = sdssc_bind_library()) == SDSSC_ERROR) { md_eprintf(gettext("Interface error with libsds_sc.so\n")); exit(1); } if (md_init(argc, argv, 1, 1, ep) != 0 || meta_check_root(ep) != 0) { mde_perror(ep, ""); md_exit(sp, 1); } /* * open log and enable libmeta logging. Do it here explicitly * rather than letting md_init() do it because we are not really * a daemon and that is what md_init() opens the log as. */ openlog("metaclust", LOG_CONS, LOG_USER); version_table_size = sizeof (version_table) / sizeof (version_table[0]); optind = 1; opterr = 0; while ((c = getopt(argc, argv, "hd:V:t:?")) != -1) { switch (c) { case 'h': usage(sp, 0); break; case 'd': if (sscanf(optarg, "%u", &debug) != 1) { md_eprintf(gettext("Invalid debug level\n")); md_exit(sp, 1); } else if ((debug < 1) || (debug > MAX_DEBUG_LEVEL)) { debug = min(max(debug, 1), MAX_DEBUG_LEVEL); md_eprintf(gettext("Debug level must be " "between 1 and %d inclusive.\n"), MAX_DEBUG_LEVEL); md_eprintf(gettext("Debug level set to %d.\n"), debug); } break; case 'V': version = Strdup(optarg); break; case 't': if (sscanf(optarg, "%u", &timeout) != 1) { md_eprintf(gettext("Invalid timeout value\n")); md_exit(sp, 1); } break; case '?': if (optopt == '?') { usage(sp, 0); } else if (optopt == 'V') { int i; (void) fprintf(stdout, gettext( "%s: Versions Supported:"), myname); for (i = 0; i < version_table_size; i++) { (void) fprintf(stdout, " %s", version_table[i]); } (void) fprintf(stdout, "\n"); md_exit(sp, 0); } /*FALLTHROUGH*/ default: usage(sp, 1); break; } } /* initialise the debug level and start time */ setup_mc_log(debug); /* * check that the version specified (if any) is supported. */ if (version != NULL) { int i, found = 0; for (i = 0; i < version_table_size; i++) { if (strcmp(version, version_table[i]) == 0) { found = 1; break; } } if (!found) { md_eprintf(gettext("Version %s not supported\n"), version); md_exit(sp, 1); } } argc -= optind; argv += optind; /* parse arguments */ if (argc <= 0) { usage(sp, 1); } /* convert the step name to the corresponding number */ step_ptr = bsearch(argv[0], step_table, (sizeof (step_table) / sizeof (step_table[0])), sizeof (step_table[0]), mc_compare); if (step_ptr != NULL) { stepnum = step_ptr->step_num; } --argc; ++argv; /* set timeout alarm signal, a value of 0 will disable timeout */ if (timeout > 0) { int stat_loc = 0; commd_timeout = (long)(timeout * .75); c_pid = fork(); if (c_pid == (pid_t)-1) { md_perror(gettext("Unable to fork")); md_exit(sp, 1); } else if (c_pid) { /* parent */ nsa.sa_flags = 0; if (sigfillset(&nsa.sa_mask) < 0) { md_perror(gettext("Unable to set signal mask")); md_exit(sp, 1); } nsa.sa_handler = sigalarmhandler; if (sigaction(SIGALRM, &nsa, &osa) == -1) { md_perror(gettext("Unable to set alarm " "handler")); md_exit(sp, 1); } (void) alarm(timeout); /* * wait for child to exit or timeout to expire. * keep retrying if the call is interrupted */ while ((ret_val = waitpid(c_pid, &stat_loc, 0)) < 0) { if (errno != EINTR) { break; } } if (ret_val == c_pid) { /* exit with the childs exit value */ exit(WEXITSTATUS(stat_loc)); } else if (errno == ECHILD) { md_exit(sp, 0); } else { perror(myname); md_exit(sp, 1); } } } /* * If a timeout value is given, everything from this point onwards is * executed in the child process. */ switch (stepnum) { case MC_START: /* * Start Step * * - Suspend all rpc.mdcommd messages */ /* expect the local node id to be given only */ if (argc != 1) usage(sp, 1); meta_mc_log(MC_LOG2, gettext("Starting Start step: %s"), meta_print_hrtime(0)); /* * With multinode disksets configured we need to * update all replicas on all cluster nodes to have * the same status. If local replicas on a cluster * node are not accessible we need to panic this * node, otherwise we abort in the reconfig cycle * and failfast/reboot the "good" cluster node too. * To avoid a total cluster outage in the above case * we panic only the failing node via md_exit(.., 1). */ if ((local_sp = load_local_set(ep)) == NULL) { /* panic the node */ md_exit(local_sp, 1); } if ((max_sets = get_max_sets(ep)) == 0) { mde_perror(ep, ""); md_exit(sp, 1); } /* start walking through all possible disksets */ for (setno = 1; setno < max_sets; setno++) { if ((sp = metasetnosetname(setno, ep)) == NULL) { if (mdiserror(ep, MDE_NO_SET)) { /* No set for this setno - continue */ mdclrerror(ep); continue; } else { mde_perror(ep, gettext("Unable to " "get set %d information"), setno); md_exit(sp, 1); } } /* only check multi-node disksets */ if (!meta_is_mn_set(sp, ep)) { mdclrerror(ep); continue; } meta_mc_log(MC_LOG3, gettext("Start - block parse " "messages for set %s: %s"), sp->setname, meta_print_hrtime(gethrtime() - start_time)); /* * Mddb parse messages are sent amongst the nodes * in a diskset whenever the locator block or * locator names structure has been changed. * A locator block change could occur as a result * of a disk failure during the reconfig cycle, * so block the mddb parse messages while the * rpc.mdcommd is suspended during the reconfig cycle. */ if (s_ownset(sp->setno, ep) == MD_SETOWNER_YES) { (void) memset(&mbp, 0, sizeof (mbp)); mbp.c_setno = setno; mbp.c_blk_flags = MDDB_BLOCK_PARSE; if (metaioctl(MD_MN_MDDB_BLOCK, &mbp, &mbp.c_mde, NULL)) { (void) mdstealerror(ep, &mbp.c_mde); mde_perror(ep, gettext("Could not " "block set %s"), sp->setname); md_exit(sp, 1); } } /* suspend commd and spin waiting for drain */ while ((ret_val = mdmn_suspend(setno, MD_COMM_ALL_CLASSES, commd_timeout)) == MDE_DS_COMMDCTL_SUSPEND_NYD) { (void) sleep(1); } if (ret_val) { md_eprintf(gettext("Could not suspend " "rpc.mdcommd for set %s\n"), sp->setname); md_exit(sp, 1); } /* * Set start step flag for set. This is set to indicate * that this node entered the reconfig cycle through * the start step. This is used during the reconfig * cycle to determine whether the node had entered * through the start step or the return step. */ (void) memset(&sf, 0, sizeof (sf)); sf.sf_setno = sp->setno; sf.sf_setflags = MD_SET_MN_START_RC; sf.sf_flags = MDDB_NM_SET; /* Use magic to help protect ioctl against attack. */ sf.sf_magic = MDDB_SETFLAGS_MAGIC; if (metaioctl(MD_MN_SET_SETFLAGS, &sf, &sf.sf_mde, NULL)) { (void) mdstealerror(ep, &sf.sf_mde); mde_perror(ep, gettext("Could not set " "start_step flag for set %s"), sp->setname); md_exit(sp, 1); } } meta_mc_log(MC_LOG2, gettext("Start step completed: %s"), meta_print_hrtime(gethrtime() - start_time)); break; case MC_STOP: /* * Stop Step * * - ??? */ /* don't expect any more arguments to follow the step name */ if (argc != 0) usage(sp, 1); break; case MC_ABORT: /* * Abort Step * * - Abort rpc.mdcommd */ /* don't expect any more arguments to follow the step name */ if (argc != 0) usage(sp, 1); meta_mc_log(MC_LOG2, gettext("Starting Abort step: %s"), meta_print_hrtime(0)); /* * Does local set exist? If not, exit with 0 * since there's no reason to have this node panic if * the local set cannot be started. */ if ((local_sp = load_local_set(ep)) == NULL) { md_exit(local_sp, 0); } /* * abort the rpc.mdcommd. The abort is only issued on this node * meaning that the abort reconfig step is called on this * node before a panic while the rest of the cluster will * undergo a reconfig cycle. * There is no time relation between this node running a * reconfig abort and the the rest of the cluster * running a reconfig cycle meaning that this node may * panic before, during or after the cluster has run * a reconfig cycle. */ mdmn_abort(); meta_mc_log(MC_LOG2, gettext("Abort step completed: %s"), meta_print_hrtime(gethrtime() - start_time)); break; case MC_RETURN: /* * Return Step * * - Grab local set lock, issue rpc.mdcommd DRAIN ALL * and release local set lock. Grabbing the local set * lock allows any active metaset/metadb commands to * terminate gracefully and will keep a metaset/metadb * command from starting until the DRAIN ALL is issued. * The metaset/metadb commands can issue * DRAIN ALL/RESUME ALL commands to rpc.mdcommd, * so the return step must not issue the DRAIN ALL command * until metaset/metadb have finished or metaset may issue * a RESUME ALL after this return reconfig step has issued * the DRAIN ALL command. * After this reconfig step has issued the DRAIN_ALL and * released the local set lock, metaset/metadb will fail * when attempting to contact the rpc.mdcommd and will * terminate without making any configuration changes. * The DRAIN ALL command will keep all other meta* commands * from running during the reconfig cycle (these commands * will wait until the rpc.mdcommd is resumed) since the * reconfig cycle may be changing the diskset configuration. */ /* expect the nodelist to follow the step name */ if (argc < 1) usage(sp, 1); meta_mc_log(MC_LOG2, gettext("Starting Return step: %s"), meta_print_hrtime(0)); /* * Does local set exist? If not, exit with 0 * since there's no reason to have this node panic if * the local set cannot be started. */ if ((local_sp = load_local_set(ep)) == NULL) { md_exit(local_sp, 0); } /* * Suspend any mirror resyncs that are in progress. This * stops unnecessary timeouts. */ meta_mirror_resync_block_all(); if (meta_lock(local_sp, TRUE, ep) != 0) { mde_perror(ep, ""); md_exit(local_sp, 1); } /* * All metaset and metadb commands on this node have now * terminated gracefully. Now, issue a drain all to * the rpc.mdcommd. Any meta command issued after the * drain all will either spin sending the command to the * master until after the reconfig cycle has finished OR * will terminate gracefully (metaset/metadb). */ if ((max_sets = get_max_sets(ep)) == 0) { mde_perror(ep, ""); md_exit(sp, 1); } /* start walking through all possible disksets */ for (setno = 1; setno < max_sets; setno++) { if ((sp = metasetnosetname(setno, ep)) == NULL) { if (mdiserror(ep, MDE_NO_SET)) { /* No set for this setno - continue */ mdclrerror(ep); continue; } else { mde_perror(ep, gettext("Unable to " "get set %d information"), setno); md_exit(sp, 1); } } /* only check multi-node disksets */ if (!meta_is_mn_set(sp, ep)) { mdclrerror(ep); continue; } meta_mc_log(MC_LOG3, gettext("Return - block parse " "messages for set %s: %s"), sp->setname, meta_print_hrtime(gethrtime() - start_time)); /* * Mddb parse messages are sent amongst the nodes * in a diskset whenever the locator block or * locator names structure has been changed. * A locator block change could occur as a result * of a disk failure during the reconfig cycle, * so block the mddb parse messages while the * rpc.commd is suspended during the reconfig cycle. */ if (s_ownset(sp->setno, ep) == MD_SETOWNER_YES) { (void) memset(&mbp, 0, sizeof (mbp)); mbp.c_setno = setno; mbp.c_blk_flags = MDDB_BLOCK_PARSE; if (metaioctl(MD_MN_MDDB_BLOCK, &mbp, &mbp.c_mde, NULL)) { (void) mdstealerror(ep, &mbp.c_mde); mde_perror(ep, gettext("Could not " "block set %s"), sp->setname); md_exit(sp, 1); } } /* suspend commd and spin waiting for drain */ while ((ret_val = mdmn_suspend(setno, MD_COMM_ALL_CLASSES, commd_timeout)) == MDE_DS_COMMDCTL_SUSPEND_NYD) { (void) sleep(1); } if (ret_val) { md_eprintf(gettext("Could not suspend " "rpc.mdcommd for set %s\n"), sp->setname); md_exit(sp, 1); } } /* * Resume all I/Os for this node for all MN sets in * case master node had suspended I/Os but panic'd * before resuming I/Os. In case of failure, exit * with a 1 since unable to resume I/Os on this node. */ if (clnt_mn_susp_res_io(mynode(), 0, MN_RES_IO, ep)) { mde_perror(ep, gettext( "Unable to resume I/O on node %s for all sets"), mynode()); md_exit(sp, 1); } /* * Can now unlock local set lock. New metaset/metadb * commands are now held off using drain all. */ (void) meta_unlock(local_sp, ep); meta_mc_log(MC_LOG2, gettext("Return step completed: %s"), meta_print_hrtime(gethrtime() - start_time)); break; case MC_STEP1: /* * Step 1 * * - Populate nodelist file if we are on clustering * and pick a master node for each MN diskset. */ /* expect the nodelist to follow the step name */ if (argc < 1) usage(sp, 1); meta_mc_log(MC_LOG2, gettext("Starting Step1: %s"), meta_print_hrtime(0)); /* Always write nodelist file even if no local set exists */ if (clust == SDSSC_OKAY) { /* skip to the nodelist args */ if (meta_write_nodelist(argc, argv, ep) != 0) { mde_perror(ep, gettext( "Could not populate nodelist file")); md_exit(sp, 1); } } /* * Does local set exist? If not, exit with 0 * since there's no reason to have this node panic if * the local set cannot be started. */ if ((local_sp = load_local_set(ep)) == NULL) { md_exit(local_sp, 0); } /* * At this point, all meta* commands are blocked across * all disksets since the master rpc.mdcommd has drained or * the master node has died. * If a metaset or metadb command had been in progress * at the start of the reconfig cycle, this command has * either completed or it has been terminated due to * the death of the master node. * * This means that that it is now ok to remove any * outstanding clnt_locks associated with multinode * disksets on this node due to a node panic during * a metaset operation. This allows the routines that * choose the master to use rpc.metad to determine the * master of the diskset. */ if (clnt_clr_mnsetlock(mynode(), ep) != 0) { meta_mc_log(MC_LOG2, gettext("Step1 aborted:" "clear locks failed %s"), meta_print_hrtime(gethrtime() - start_time)); md_exit(local_sp, 1); } /* * Call reconfig_choose_master to choose a master for * each MN diskset, update the nodelist for each diskset * given the member information and send a reinit message * to rpc.mdcommd to reload the nodelist. */ rval = meta_reconfig_choose_master(commd_timeout, ep); if (rval == 205) { /* * NOTE: Should issue call to reboot remote host that * is causing the RPC failure. Clustering to * provide interface in the future. This should * stop a never-ending set of 205 reconfig cycles. * Remote host causing failure is stored in * ep->host if ep is an RPC error. * if (mdanyrpcerror(ep)) * reboot (ep->host); */ meta_mc_log(MC_LOG2, gettext("Step1 aborted:" "choose master failure of 205 %s"), meta_print_hrtime(gethrtime() - start_time)); md_exit(local_sp, 205); } else if (rval != 0) { meta_mc_log(MC_LOG2, gettext("Step1 failure: " "choose master failure %s"), meta_print_hrtime(gethrtime() - start_time)); md_exit(local_sp, 1); } meta_mc_log(MC_LOG2, gettext("Step1 completed: %s"), meta_print_hrtime(gethrtime() - start_time)); md_exit(local_sp, rval); break; case MC_STEP2: /* * Step 2 * * In Step 2, each node walks the list of disksets. If a * node is a master of a MN diskset, it synchronizes * the local set USER records for that diskset. * * If disks exist in the diskset and there is a joined * (owner) node in the diskset, the master will also: * - synchronize the diskset mddbs to the master * - play the change log * * The master node will now attempt to join any unjoined * nodes that are currently members in the membership list. */ /* expect the nodelist to follow the step name */ if (argc < 1) usage(sp, 1); meta_mc_log(MC_LOG2, gettext("Starting Step2: %s"), meta_print_hrtime(0)); /* * Does local set exist? If not, exit with 0 * since there's no reason to have this node panic if * the local set cannot be started. */ if ((local_sp = load_local_set(ep)) == NULL) { md_exit(local_sp, 0); } if ((max_sets = get_max_sets(ep)) == 0) { mde_perror(ep, ""); md_exit(local_sp, 1); } /* start walking through all possible disksets */ for (setno = 1; setno < max_sets; setno++) { if ((sp = metasetnosetname(setno, ep)) == NULL) { if (mdiserror(ep, MDE_NO_SET)) { /* No set for this setno - continue */ mdclrerror(ep); continue; } else if (mdanyrpcerror(ep)) { /* Fail on RPC failure to self */ mde_perror(ep, gettext( "Unable to get information for " "set number %d"), setno); md_exit(local_sp, 1); } else { mde_perror(ep, gettext( "Unable to get information for " "set number %d"), setno); mdclrerror(ep); continue; } } if ((sd = metaget_setdesc(sp, ep)) == NULL) { if (mdanyrpcerror(ep)) { /* Fail on RPC failure to self */ mde_perror(ep, gettext( "Unable to get information for " "set number %d"), setno); md_exit(local_sp, 1); } mde_perror(ep, gettext("Unable to get set " "%s desc information"), sp->setname); mdclrerror(ep); continue; } /* Only check MN disksets */ if (!(MD_MNSET_DESC(sd))) { continue; } /* All actions in step 2 are driven by master */ if (!(sd->sd_mn_am_i_master)) { continue; } meta_mc_log(MC_LOG3, gettext("Step2 - begin record " "synchronization for set %s: %s"), sp->setname, meta_print_hrtime(gethrtime() - start_time)); /* * Synchronize the USER records in the local mddbs * for hosts that are members. The USER records * contain set, drive and host information. */ rval = meta_mnsync_user_records(sp, ep); if (rval != 0) { mde_perror(ep, gettext( "Synchronization of user records " "in set %s failed\n"), sp->setname); if (rval == 205) { /* * NOTE: Should issue call to reboot * remote host that is causing the RPC * failure. Clustering to provide * interface in the future. This * should stop a never-ending set of * 205 reconfig cycles. * Remote host causing failure is * stored in ep->host if ep is an * RPC error. * if (mdanyrpcerror(ep)) * reboot (ep->host); */ md_exit(local_sp, 205); } else { md_exit(local_sp, 1); } } /* Reget sd since sync_user_recs may have flushed it */ if ((sd = metaget_setdesc(sp, ep)) == NULL) { mde_perror(ep, gettext("Unable to get set " "%s desc information"), sp->setname); md_exit(local_sp, 1); } dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST), ep); if (! mdisok(ep)) { mde_perror(ep, gettext("Unable to get set " "%s drive information"), sp->setname); md_exit(local_sp, 1); } /* * No drives in set, continue to next set. */ if (dd == NULL) { /* Done with this set */ continue; } meta_mc_log(MC_LOG3, gettext("Step2 - local set user " "records completed for set %s: %s"), sp->setname, meta_print_hrtime(gethrtime() - start_time)); /* * Synchronize the diskset mddbs for hosts * that are members. This may involve * playing the changelog and writing out * to the diskset mddbs. */ rval = meta_mnsync_diskset_mddbs(sp, ep); if (rval != 0) { mde_perror(ep, gettext( "Synchronization of diskset mddbs " "in set %s failed\n"), sp->setname); meta_mc_log(MC_LOG3, gettext("Step2 - diskset " "mddb synchronization failed for " "set %s: %s"), sp->setname, meta_print_hrtime(gethrtime() - start_time)); if (rval == 205) { /* * NOTE: Should issue call to reboot * remote host that is causing the RPC * failure. Clustering to provide * interface in the future. This * should stop a never-ending set of * 205 reconfig cycles. * Remote host causing failure is * stored in ep->host if ep is an * RPC error. * if (mdanyrpcerror(ep)) * reboot (ep->host); */ md_exit(local_sp, 205); } else if (rval == 1) { continue; } else { md_exit(local_sp, 1); } } meta_mc_log(MC_LOG3, gettext("Step2 - diskset mddb " "synchronization completed for set %s: %s"), sp->setname, meta_print_hrtime(gethrtime() - start_time)); /* Join the starting nodes to the diskset */ rval = meta_mnjoin_all(sp, ep); if (rval != 0) { mde_perror(ep, gettext( "Join of non-owner (starting) nodes " "in set %s failed\n"), sp->setname); meta_mc_log(MC_LOG3, gettext("Step2 - non owner" "nodes joined for set %s: %s"), sp->setname, meta_print_hrtime(gethrtime() - start_time)); if (rval == 205) { /* * NOTE: Should issue call to reboot * remote host that is causing the RPC * failure. Clustering to provide * interface in the future. This * should stop a never-ending set of * 205 reconfig cycles. * Remote host causing failure is * stored in ep->host if ep is an * RPC error. * if (mdanyrpcerror(ep)) * reboot (ep->host); */ md_exit(local_sp, 205); } else { md_exit(local_sp, 1); } } meta_mc_log(MC_LOG3, gettext("Step2 - non owner nodes " "joined for set %s: %s"), sp->setname, meta_print_hrtime(gethrtime() - start_time)); } meta_mc_log(MC_LOG2, gettext("Step2 completed: %s"), meta_print_hrtime(gethrtime() - start_time)); break; case MC_STEP3: /* * Step 3 * * For all multinode sets do, * - Reinitialise rpc.mdcommd * - Reset mirror owners to null if the current owner is * no longer in the membership list */ /* expect the nodelist to follow the step name */ if (argc < 1) usage(sp, 1); meta_mc_log(MC_LOG2, gettext("Starting Step3: %s"), meta_print_hrtime(0)); /* * Does local set exist? If not, exit with 0 * since there's no reason to have this node panic if * the local set cannot be started. */ if ((local_sp = load_local_set(ep)) == NULL) { md_exit(local_sp, 0); } /* * walk through all sets on this node which could include: * - MN disksets * - traditional disksets * - non-existent disksets * start mirror resync for all MN sets */ if ((max_sets = get_max_sets(ep)) == 0) { mde_perror(ep, ""); md_exit(local_sp, 1); } /* start walking through all possible disksets */ for (setno = 1; setno < max_sets; setno++) { if ((sp = metasetnosetname(setno, ep)) == NULL) { if (mdiserror(ep, MDE_NO_SET)) { /* No set for this setno - continue */ mdclrerror(ep); continue; } else { mde_perror(ep, gettext("Unable to " "get set %d information"), setno); md_exit(local_sp, 1); } } /* only check multi-node disksets */ if (!meta_is_mn_set(sp, ep)) { mdclrerror(ep); continue; } if (meta_lock(sp, TRUE, ep) != 0) { mde_perror(ep, ""); md_exit(local_sp, 1); } /* If this node isn't joined to set, do nothing */ if (s_ownset(sp->setno, ep) != MD_SETOWNER_YES) { if (!mdisok(ep)) { mde_perror(ep, gettext("Could " "not get set %s ownership"), sp->setname); md_exit(sp, 1); } mdclrerror(ep); (void) meta_unlock(sp, ep); continue; } meta_mc_log(MC_LOG3, gettext("Step3 - begin " "re-initialising rpc.mdcommd and resetting mirror " "owners for set %s: %s"), sp->setname, meta_print_hrtime(gethrtime() - start_time)); /* reinitialzse rpc.mdcommd with new nodelist */ if (mdmn_reinit_set(setno, commd_timeout)) { md_eprintf(gettext( "Could not re-initialise rpc.mdcommd for " "set %s\n"), sp->setname); md_exit(sp, 1); } (void) memset(&cfg, 0, sizeof (cfg)); cfg.c_id = 0; cfg.c_setno = sp->setno; if (metaioctl(MD_DB_GETDEV, &cfg, &cfg.c_mde, NULL) != 0) { (void) mdstealerror(ep, &cfg.c_mde); mde_perror(ep, gettext("Could " "not get set %s information"), sp->setname); md_exit(sp, 1); } /* Don't do anything else if set is stale */ if (cfg.c_flags & MDDB_C_STALE) { (void) meta_unlock(sp, ep); mdclrerror(ep); continue; } /* reset mirror owners */ if (reset_state(RESET_OWNER, sp, MD_MIRROR, ep) == -1) { md_exit(sp, 1); } (void) meta_unlock(sp, ep); meta_mc_log(MC_LOG3, gettext("Step3 - rpc.mdcommd " "re-initialised and mirror owners reset for " "set %s: %s"), sp->setname, meta_print_hrtime(gethrtime() - start_time)); } meta_mc_log(MC_LOG2, gettext("Step3 completed: %s"), meta_print_hrtime(gethrtime() - start_time)); break; case MC_STEP4: /* * Step 4 * * For all multinode sets do: * - Resume the rpc.mdcommd messages. Must resume all * sets before issuing I/O to any set since an error * encountered in a commd suspended set could be * blocked waiting for commd in another set to resume. * (This happens since the daemon queues service * all sets). An open of a soft partition causes * a read of the watermarks during the open. * - If set is non-writable (not an owner or STALE), then * continue to next set. * * For all multinode sets do, * - Reset ABR states for all mirrors, ie clear ABR if not * open on any node. * - Reset ABR states for all soft partitions, ie clear ABR if * not open on any node. * - For all slave nodes that have entered through the start * step, update the ABR state to that of the master and * get the submirror state from the master * - meta_lock set * - Resync all mirrors * - unlock meta_lock for this set. * - Choose a new owner for any orphaned resyncs * * There is one potential issue here. when concurrently * resetting and updating the ABR state. If the master has ABR * set, but should no longer have because the only node that * had the metadevice open and had ABR set has paniced, the * master will send a message to all nodes to clear the ABR * state. Meanwhile any node that has come through the * start step will get tstate from the master and will update * ABR if it was set in tstate. So, we appear to have a problem * if the following sequence occurs:- * - The slave gets tstate with ABR set * - The master sends a message to clear ABR * - The slave updates ABR with the value it got from tstate. * We now have the master with ABR clear and the slave with ABR * set. Fortunately, having set ABR, the slave will close the * metadevice after setting ABR and as there are no nodes with * the device open, the close will send a message to clear ABR * on all nodes. So, the nodes will all have ABR unset. */ /* expect the nodelist to follow the step name */ if (argc < 1) usage(sp, 1); meta_mc_log(MC_LOG2, gettext("Starting Step4: %s"), meta_print_hrtime(0)); /* * Does local set exist? If not, exit with 0 * since there's no reason to have this node panic if * the local set cannot be started. */ if ((local_sp = load_local_set(ep)) == NULL) { md_exit(local_sp, 0); } /* * walk through all sets on this node which could include: * - MN disksets * - traditional disksets * - non-existent disksets * start mirror resync for all MN sets */ if ((max_sets = get_max_sets(ep)) == 0) { mde_perror(ep, ""); md_exit(local_sp, 1); } /* Clear set_info structure */ for (setno = 1; setno < max_sets; setno++) { set_info[setno] = 0; } /* start walking through all possible disksets */ for (setno = 1; setno < max_sets; setno++) { if ((sp = metasetnosetname(setno, ep)) == NULL) { if (mdiserror(ep, MDE_NO_SET)) { /* No set for this setno - continue */ mdclrerror(ep); continue; } else { mde_perror(ep, gettext("Unable to " "get set %d information"), setno); md_exit(local_sp, 1); } } if ((sd = metaget_setdesc(sp, ep)) == NULL) { mde_perror(ep, gettext("Unable to get set " "%s desc information"), sp->setname); mdclrerror(ep); continue; } /* only check multi-node disksets */ if (!meta_is_mn_set(sp, ep)) { mdclrerror(ep); continue; } set_info[setno] |= SET_INFO_MN; /* * If not an owner (all mddbs failed) or stale * (< 50% mddbs operational), then set is * non-writable so just resume commd and * unblock mddb messages. */ mdclrerror(ep); if (s_ownset(sp->setno, ep) != MD_SETOWNER_YES) { set_info[setno] |= SET_INFO_NO_WR; } if (!mdisok(ep)) { mde_perror(ep, gettext("Could " "not get set %s ownership"), sp->setname); md_exit(local_sp, 1); } /* Set is owned - is it stale? */ if (!set_info[setno] & SET_INFO_NO_WR) { (void) memset(&cfg, 0, sizeof (cfg)); cfg.c_id = 0; cfg.c_setno = sp->setno; if (metaioctl(MD_DB_GETDEV, &cfg, &cfg.c_mde, NULL) != 0) { (void) mdstealerror(ep, &cfg.c_mde); mde_perror(ep, gettext("Could " "not get set %s information"), sp->setname); md_exit(local_sp, 1); } if (cfg.c_flags & MDDB_C_STALE) { set_info[setno] |= SET_INFO_NO_WR; } } /* resume rpc.mdcommd */ if (mdmn_resume(setno, MD_COMM_ALL_CLASSES, 0, commd_timeout)) { md_eprintf(gettext("Unable to resume " "rpc.mdcommd for set %s\n"), sp->setname); md_exit(local_sp, 1); } /* Unblock mddb parse messages */ if (s_ownset(sp->setno, ep) == MD_SETOWNER_YES) { (void) memset(&mbp, 0, sizeof (mbp)); mbp.c_setno = setno; mbp.c_blk_flags = MDDB_UNBLOCK_PARSE; if (metaioctl(MD_MN_MDDB_BLOCK, &mbp, &mbp.c_mde, NULL)) { (void) mdstealerror(ep, &mbp.c_mde); mde_perror(ep, gettext("Could not " "unblock set %s"), sp->setname); md_exit(local_sp, 1); } } meta_mc_log(MC_LOG3, gettext("Step4 - rpc.mdcommd " "resumed and messages unblocked for set %s: %s"), sp->setname, meta_print_hrtime(gethrtime() - start_time)); } for (setno = 1; setno < max_sets; setno++) { int start_step; /* Skip traditional disksets. */ if ((set_info[setno] & SET_INFO_MN) == 0) continue; /* * If already determined that this set is * a non-writable set, then just continue * to next set since there's nothing else * to do for a non-writable set. */ if (set_info[setno] & SET_INFO_NO_WR) continue; if ((sp = metasetnosetname(setno, ep)) == NULL) { if (mdiserror(ep, MDE_NO_SET)) { /* No set for this setno - continue */ mdclrerror(ep); continue; } else { mde_perror(ep, gettext("Unable to " "get set %d information"), setno); md_exit(local_sp, 1); } } if ((sd = metaget_setdesc(sp, ep)) == NULL) { mde_perror(ep, gettext("Unable to get set " "%s desc information"), sp->setname); mdclrerror(ep); continue; } /* See if this node came through the start step */ (void) memset(&sf, 0, sizeof (sf)); sf.sf_setno = sp->setno; sf.sf_flags = MDDB_NM_GET; /* Use magic to help protect ioctl against attack. */ sf.sf_magic = MDDB_SETFLAGS_MAGIC; if (metaioctl(MD_MN_GET_SETFLAGS, &sf, &sf.sf_mde, NULL)) { (void) mdstealerror(ep, &sf.sf_mde); mde_perror(ep, gettext("Could not get " "start_step flag for set %s"), sp->setname); md_exit(local_sp, 1); } start_step = (sf.sf_setflags & MD_SET_MN_START_RC)? 1: 0; /* * We can now reset the start_step flag for the set * if it was already set. */ if (start_step) { (void) memset(&sf, 0, sizeof (sf)); sf.sf_setno = sp->setno; sf.sf_setflags = MD_SET_MN_START_RC; sf.sf_flags = MDDB_NM_RESET; /* * Use magic to help protect ioctl * against attack. */ sf.sf_magic = MDDB_SETFLAGS_MAGIC; if (metaioctl(MD_MN_SET_SETFLAGS, &sf, &sf.sf_mde, NULL)) { (void) mdstealerror(ep, &sf.sf_mde); mde_perror(ep, gettext("Could not reset " "start_step flag for set %s"), sp->setname); } } meta_mc_log(MC_LOG3, gettext("Step4 - begin setting " "ABR state and restarting io's for " "set %s: %s"), sp->setname, meta_print_hrtime(gethrtime() - start_time)); /* * If we are not the master and we have come through * the start step, we must update the ABR states * for mirrors and soft partitions. Also the submirror * states need to be synchronised so that we see the * same status as other previously joined members. * This _must_ be done before starting the resync. */ if (!(sd->sd_mn_am_i_master) && start_step) { if (reset_state(GET_MIRROR_STATE, sp, MD_MIRROR, ep) == -1) { md_exit(local_sp, 1); } if (reset_state(UPDATE_ABR, sp, MD_SP, ep) == -1) { md_exit(local_sp, 1); } /* * Mark the fact that we've got the mirror * state. This allows the resync thread to * determine if _it_ needs to issue this. This * can happen if a node is added to a set after * a reconfig cycle has completed. */ (void) memset(&sf, 0, sizeof (sf)); sf.sf_setno = sp->setno; sf.sf_setflags = MD_SET_MN_MIR_STATE_RC; sf.sf_flags = MDDB_NM_SET; /* * Use magic to help protect ioctl * against attack. */ sf.sf_magic = MDDB_SETFLAGS_MAGIC; if (metaioctl(MD_MN_SET_SETFLAGS, &sf, &sf.sf_mde, NULL)) { (void) mdstealerror(ep, &sf.sf_mde); mde_perror(ep, gettext("Could not set " "submirror state flag for set %s"), sp->setname); } } /* * All remaining actions are only performed by the * master */ if (!(sd->sd_mn_am_i_master)) { if (meta_lock(sp, TRUE, ep) != 0) { mde_perror(ep, ""); md_exit(local_sp, 1); } meta_mirror_resync_unblock(sp); (void) meta_unlock(sp, ep); continue; } /* * If the master came through the start step, this * implies that all of the nodes must have done the * same and hence there can be no applications * running. Hence no need to reset ABR */ if (!start_step) { /* Reset ABR state for mirrors */ if (reset_state(RESET_ABR, sp, MD_MIRROR, ep) == -1) { md_exit(local_sp, 1); } /* ...and now the same for soft partitions */ if (reset_state(RESET_ABR, sp, MD_SP, ep) == -1) { md_exit(local_sp, 1); } } /* * choose owners for orphaned resyncs and reset * non-orphaned resyncs so that an owner node that * reboots will restart the resync if needed. */ if (reset_state(CHOOSE_OWNER, sp, MD_MIRROR, ep) == -1) md_exit(local_sp, 1); /* * Must unlock set lock before meta_mirror_resync_all * sends a message to run the metasync command * which also grabs the meta_lock. */ if (meta_lock(sp, TRUE, ep) != 0) { mde_perror(ep, ""); md_exit(local_sp, 1); } meta_mirror_resync_unblock(sp); (void) meta_unlock(sp, ep); /* resync all mirrors in set */ if (meta_mirror_resync_all(sp, 0, ep) != 0) { mde_perror(ep, gettext("Mirror resyncs " "failed for set %s"), sp->setname); md_exit(local_sp, 1); } meta_mc_log(MC_LOG3, gettext("Step4 - io's restarted " "for set %s: %s"), sp->setname, meta_print_hrtime(gethrtime() - start_time)); } meta_mc_log(MC_LOG2, gettext("Step4 completed: %s"), meta_print_hrtime(gethrtime() - start_time)); break; default: usage(sp, 1); break; } md_exit(sp, 0); /* NOTREACHED */ return (0); }