1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
25 */
26
27 #include <meta.h>
28 #include <sdssc.h>
29 #include <signal.h>
30 #include <syslog.h>
31 #include <sys/types.h>
32 #include <sys/wait.h>
33 #include <sys/lvm/md_mirror.h>
34 #include <metad.h>
35
/* Command line limits: protocol version (-V) and log verbosity (-d). */
#define	MY_VERSION		"1.0"	/* the highest supported version */
#define	MAX_DEBUG_LEVEL		5	/* maximum verbosity level */

/*
 * Action bits for the 'mode' argument of reset_state().  Each action has
 * its own bit so that several can be requested in a single walk.
 */
#define	RESET_OWNER		0x0001	/* drop owners that left the cluster */
#define	CHOOSE_OWNER		0x0002	/* choose/re-establish mirror owner */
#define	RESET_ABR		0x0004	/* open/close device to reset ABR */
#define	UPDATE_ABR		0x0008	/* copy ABR state from the master */
#define	GET_MIRROR_STATE	0x0010	/* fetch mirror state from master */

/*
 * Per-set information bits.
 * NOTE(review): presumably stored in the set_info[] array that main()
 * declares; their use is not visible in this part of the file - confirm.
 */
#define	SET_INFO_NO_WR		0x0002
#define	SET_INFO_MN		0x0004
47
/*
 * All the metaclust reconfig steps we understand.  MC_UNK is zero and is
 * the value the global 'stepnum' holds until a step name has been looked
 * up successfully.
 */
typedef enum stpnum {
	MC_UNK		= 0,
	MC_START	= 1,
	MC_STOP		= 2,
	MC_ABORT	= 3,
	MC_RETURN	= 4,
	MC_STEP1	= 5,
	MC_STEP2	= 6,
	MC_STEP3	= 7,
	MC_STEP4	= 8
} stepnum_t;
62
63 /*
64 * Structure for step_name -> step_number mapping
65 */
66 struct step_t {
67 char *step_nam;
68 stepnum_t step_num;
69 };
70
71 /*
72 * Step name to step number mapping table
73 * This table MUST be sorted alphabetically in ascending order of step name
74 */
75 static struct step_t step_table[] = {
76 { "abort", MC_ABORT },
77 { "return", MC_RETURN },
78 { "start", MC_START },
79 { "step1", MC_STEP1 },
80 { "step2", MC_STEP2 },
81 { "step3", MC_STEP3 },
82 { "step4", MC_STEP4 },
83 { "stop", MC_STOP }
84 };
85
86 /*
87 * If support for a different version is added, the new version number should
88 * be appended to the version_table below. This list will be searched to
89 * determine if a version requested via the -V option is supported or not.
90 */
91 static char *version_table[] = {
92 MY_VERSION
93 };
94
95 uint_t timeout = 0; /* disable timeout by default */
96 char *version = MY_VERSION; /* use latest version by default */
97 int stepnum = MC_UNK; /* reconfiguration step number */
98 pid_t c_pid; /* child process id */
99
100 /*
101 * Binary search comparison routine
102 */
103 static int
mc_compare(const void * stp1,const void * stp2)104 mc_compare(const void *stp1, const void *stp2)
105 {
106 return (strcmp((const char *)stp1,
107 ((const struct step_t *)stp2)->step_nam));
108 }
109
110 /*
111 * Timeout expiry alarm signal handler
112 */
113 /*ARGSUSED*/
114 static void
sigalarmhandler(int sig)115 sigalarmhandler(int sig)
116 {
117 int i, n, ret, stat_loc = 0;
118 FILE *pgcore;
119 char corecmd[256];
120
121 n = sizeof (step_table) / sizeof (step_table[0]);
122 for (i = 0; i < n; i++) {
123 if (stepnum == step_table[i].step_num)
124 break;
125 }
126
127 assert(i != n);
128
129 meta_mc_log(MC_LOG1, gettext("Timeout expired in %s: %s"),
130 step_table[i].step_nam,
131 meta_print_hrtime(gethrtime() - start_time));
132
133 /*
134 * See what the child was actually doing when the timeout expired.
135 * A core-dump of this would be _really_ good, so let's just
136 * try a 'gcore -g c_pid' and hope
137 */
138
139 (void) memset(corecmd, 0, sizeof (corecmd));
140 (void) snprintf(corecmd, sizeof (corecmd),
141 "/bin/gcore -g %d >/dev/null 2>&1", (int)c_pid);
142
143 pgcore = popen(corecmd, "r");
144
145 if (pgcore == NULL) {
146 meta_mc_log(MC_LOG1, gettext("Could not grab core for pid %s"),
147 c_pid);
148 } else {
149 (void) pclose(pgcore);
150 }
151
152 if ((ret = kill(c_pid, SIGKILL)) == 0) {
153 /*
154 * The child will wait forever until the status is retrieved
155 * so get it now. Keep retrying if the call is interrupted.
156 *
157 * The possible results are,
158 *
159 * - child killed successfully
160 * - signal sent but child not killed
161 * - waitpid failed/interrupted
162 */
163 (void) sleep(2);
164 while ((ret = waitpid(c_pid, &stat_loc, WNOHANG)) < 0) {
165 if (errno != EINTR) {
166 break;
167 }
168 }
169 if ((ret == c_pid) || (errno == ECHILD)) {
170 ret = 0;
171 } else {
172 ret = 1;
173 }
174 } else if (errno == ESRCH) {
175 /*
176 * If the kill did not catch the child then it means the child
177 * exited immediately after the timeout occured.
178 */
179 ret = 0;
180 }
181
182 /*
183 * make sure not to exit with 205 for any steps other than step1-step4.
184 * Suncluster reconfiguration can't handle it otherwise.
185 */
186 switch (stepnum) {
187 case MC_STEP1:
188 case MC_STEP2:
189 case MC_STEP3:
190 case MC_STEP4:
191 /*
192 * If the child was killed successfully return 205 for a
193 * new reconfig cycle otherwise send 1 to panic the node.
194 */
195 if (ret != 0) {
196 md_eprintf(gettext("Could not kill child\n"));
197 exit(1);
198 } else {
199 exit(205);
200 }
201 break;
202 case MC_START:
203 case MC_STOP:
204 case MC_ABORT:
205 case MC_RETURN:
206 default:
207 exit(1);
208 break;
209 }
210 }
211
212 /*
213 * Attempt to load local set.
214 * Returns:
215 * pointer to mdsetname_t for local set (local_sp) is successful.
216 * 0 if failure
217 * if there are no local set mddbs, no error message is printed.
218 * Otherwise, error message is printed so that user
219 * can determine why the local set didn't start.
220 */
221 mdsetname_t *
load_local_set(md_error_t * ep)222 load_local_set(md_error_t *ep)
223 {
224 mdsetname_t *local_sp = NULL;
225
226 /* Does local set exist? If not, give no error */
227 if ((local_sp = metasetname(MD_LOCAL_NAME, ep)) == NULL) {
228 return (0);
229 }
230
231 /*
232 * snarf local set
233 * If fails with MDE_DB_NODB, then just return 1 printing
234 * no failure.
235 * Otherwise, print error message, and return 1.
236 */
237 if (meta_setup_db_locations(ep) != 0) {
238 if (!(mdismddberror(ep, MDE_DB_NODB)))
239 mde_perror(ep, "");
240 return (0);
241 }
242
243 /* local set loaded successfully */
244 return (local_sp);
245 }
246
247 /*
248 * Purpose: Compose a full path name for a metadevice
249 *
250 * On entry: sp - setname pointer
251 * mnum - minor number of metadevice
252 * pathname - pointer to array to return path string
253 * pathlen - max length of pathname array
254 */
255 static int
compose_path(mdsetname_t * sp,int mnum,char * pathname,int pathlen)256 compose_path(mdsetname_t *sp, int mnum, char *pathname, int pathlen)
257 {
258 int rtn;
259 mdname_t *np;
260 md_error_t status = mdnullerror;
261
262 if (MD_MIN2SET(mnum) != sp->setno) {
263 md_eprintf(gettext("minor number 0x%x invalid for set %d\n"),
264 mnum, sp->setno);
265 return (-1);
266 }
267
268 if ((np = metamnumname(&sp, mnum, 0, &status)) == NULL) {
269 return (-1);
270 }
271
272 rtn = snprintf(pathname, pathlen, "%s", np->rname);
273
274 if ((pathname[0] == '\0') || (rtn >= pathlen)) {
275 md_eprintf(gettext(
276 "Could not create path for device %s\n"),
277 get_mdname(sp, mnum));
278 return (-1);
279 }
280 return (0);
281 }
282
283 /*
284 * Purpose: Walk through all the devices specified for the given set
285 * and do the action specified in mode
286 */
287 static int
reset_state(uint_t mode,mdsetname_t * sp,char * drivername,md_error_t * ep)288 reset_state(uint_t mode, mdsetname_t *sp, char *drivername, md_error_t *ep)
289 {
290 mdnamelist_t *devnlp = NULL;
291 mdnamelist_t *p;
292 mdname_t *devnp = NULL;
293 md_set_mmown_params_t ownpar_p;
294 md_set_mmown_params_t *ownpar = &ownpar_p;
295 md_unit_t *mm;
296 int mirror_dev = 0;
297 mndiskset_membershiplist_t *nl;
298 int cnt;
299 int has_parent;
300 md_mn_get_mir_state_t mir_state_p;
301 md_mn_get_mir_state_t *mir_state = &mir_state_p;
302
303 /*
304 * if we are choosing or resetting the owners then make sure
305 * we are only doing it for mirror devices
306 */
307 mirror_dev = (strcmp(MD_MIRROR, drivername) == 0);
308 if ((mode & (RESET_OWNER | CHOOSE_OWNER)) && !mirror_dev) {
309 return (-1);
310 }
311
312 /* get a list of all the metadevices for current set */
313 if (mirror_dev && meta_get_mirror_names(sp, &devnlp, 0, ep) < 0) {
314 mde_perror(ep, gettext("Could not get mirrors for set %s"),
315 sp->setname);
316 return (-1);
317 } else if (meta_get_sp_names(sp, &devnlp, 0, ep) < 0) {
318 mde_perror(ep, gettext(
319 "Could not get soft partitions for set %s"), sp->setname);
320 return (-1);
321 }
322
323 /* If resetting the owner, get the known membership list */
324 if (mode & RESET_OWNER) {
325 if (meta_read_nodelist(&cnt, &nl, ep)) {
326 mde_perror(ep, "Could not get nodelist");
327 return (-1);
328 }
329 }
330
331 /* for each metadevice */
332 for (p = devnlp; (p != NULL); p = p->next) {
333 devnp = p->namep;
334
335 /*
336 * Get the current setting for mirror ABR state and all of the
337 * submirror state and flags from the master node. We only
338 * perform this when going through a 'start' cycle.
339 */
340 if ((mode & GET_MIRROR_STATE) && mirror_dev) {
341 char *miscname;
342
343 /*
344 * Ensure that we ignore soft-parts that are returned
345 * from the meta_get_mirror_names() call
346 */
347 if ((miscname = metagetmiscname(devnp, ep)) == NULL)
348 goto out;
349 if (strcmp(miscname, MD_MIRROR) != 0)
350 continue;
351
352 mir_state->mnum = meta_getminor(devnp->dev);
353 MD_SETDRIVERNAME(mir_state, MD_MIRROR, sp->setno);
354 meta_mc_log(MC_LOG4, gettext("Getting mirror state"
355 " for %s: %s"), get_mdname(sp, mir_state->mnum),
356 meta_print_hrtime(gethrtime() - start_time));
357
358 if (metaioctl(MD_MN_GET_MIRROR_STATE, mir_state, ep,
359 "MD_MN_GET_MIRROR_STATE") != 0) {
360 mde_perror(ep, gettext("Unable to get "
361 "mirror state for %s"),
362 get_mdname(sp, mir_state->mnum));
363 goto out;
364 } else {
365 continue;
366 }
367 }
368
369 /* check if this is a top level metadevice */
370 if ((mm = meta_get_mdunit(sp, devnp, ep)) == NULL)
371 goto out;
372 if (MD_HAS_PARENT(MD_PARENT(mm))) {
373 has_parent = 1;
374 } else {
375 has_parent = 0;
376 }
377 Free(mm);
378
379 if (mode & (RESET_OWNER | CHOOSE_OWNER)) {
380 char *miscname;
381
382 /*
383 * we can only do these for mirrors so make sure we
384 * really have a mirror device and not a softpartition
385 * imitating one. meta_get_mirror_names seems to think
386 * softparts on top of a mirror are mirrors!
387 */
388 if ((miscname = metagetmiscname(devnp, ep)) == NULL)
389 goto out;
390 if (strcmp(miscname, MD_MIRROR) != 0)
391 continue;
392
393 (void) memset(ownpar, 0, sizeof (*ownpar));
394 ownpar->d.mnum = meta_getminor(devnp->dev);
395 MD_SETDRIVERNAME(ownpar, MD_MIRROR, sp->setno);
396
397 meta_mc_log(MC_LOG4, gettext("Setting owner "
398 "for %s: %s"), get_mdname(sp, ownpar->d.mnum),
399 meta_print_hrtime(gethrtime() - start_time));
400
401 /* get the current owner id */
402 if (metaioctl(MD_MN_GET_MM_OWNER, ownpar, ep,
403 "MD_MN_GET_MM_OWNER") != 0) {
404 mde_perror(ep, gettext("Unable to get "
405 "mirror owner for %s"),
406 get_mdname(sp, ownpar->d.mnum));
407 goto out;
408 }
409 }
410
411 if (mode & RESET_OWNER) {
412 if (ownpar->d.owner == MD_MN_MIRROR_UNOWNED) {
413 mdclrerror(ep);
414 continue;
415 }
416
417 /*
418 * reset owner only if the current owner is
419 * not in the membership list
420 * Also kill the resync thread so that when the resync
421 * is started, it will perform an optimized resync
422 * for any resync regions that were dirty when the
423 * current owner left the membership.
424 */
425 if (meta_is_member(NULL, ownpar->d.owner, nl) != 1) {
426 if (meta_mn_change_owner(&ownpar,
427 sp->setno, ownpar->d.mnum,
428 MD_MN_MIRROR_UNOWNED,
429 MD_MN_MM_ALLOW_CHANGE) == -1) {
430 md_eprintf(gettext(
431 "Unable to reset mirror owner "
432 "for %s\n"),
433 get_mdname(sp, ownpar->d.mnum));
434 goto out;
435 }
436 if (meta_mirror_resync(sp, devnp, 0, ep,
437 MD_RESYNC_KILL_NO_WAIT) != 0) {
438 md_eprintf(gettext(
439 "Unable to kill resync for"
440 " %s\n"),
441 get_mdname(sp, ownpar->d.mnum));
442 goto out;
443 }
444 }
445 }
446
447 if (mode & CHOOSE_OWNER) {
448 /*
449 * only orphaned resyncs will have no owner.
450 * if that is the case choose a new owner. Otherwise
451 * re-establish the existing owner. This covers the
452 * case where a node that owned the mirror
453 * reboots/panics and comes back into the cluster before
454 * the reconfig cycle has completed. In this case the
455 * other cluster nodes will have the mirror owner marked
456 * as the rebooted node while it has the owner marked
457 * as 'None'. We have to reestablish the ownership so
458 * that the subsequent resync can continue.
459 */
460 if (meta_mn_change_owner(&ownpar, sp->setno,
461 ownpar->d.mnum, ownpar->d.owner,
462 MD_MN_MM_CHOOSE_OWNER) == -1) {
463 md_eprintf(gettext("Unable to choose "
464 "mirror owner for %s\n"),
465 get_mdname(sp, ownpar->d.mnum));
466 goto out;
467 }
468 }
469
470 /*
471 * For RESET_ABR and UPDATE_ABR - only handle top
472 * level metadevices.
473 */
474 if (has_parent)
475 continue;
476
477 if (mode & RESET_ABR) {
478 /*
479 * Reset the ABR (application based recovery)
480 * value on all nodes. We are dealing with
481 * the possibility that we have ABR set but the
482 * only node that had the device open with ABR has
483 * left the cluster. We simply open and close the
484 * device and if this is the last close in the
485 * cluster, ABR will be cleared on all nodes.
486 */
487 char *miscname;
488 char name[MAXPATHLEN];
489 int mnum, fd;
490
491 name[0] = '\0';
492 mnum = meta_getminor(devnp->dev);
493
494 /*
495 * Ensure that we don't include soft-parts in the
496 * mirror-only call to RESET_ABR. meta_get_mirror_names
497 * returns a bogus list that includes all soft-parts
498 * built on mirrors.
499 */
500 if ((miscname = metagetmiscname(devnp, ep)) == NULL)
501 goto out;
502 if (mirror_dev && (strcmp(miscname, MD_MIRROR) != 0))
503 continue;
504
505 meta_mc_log(MC_LOG4, gettext("Re-setting ABR state "
506 "for %s: %s"), get_mdname(sp, mnum),
507 meta_print_hrtime(gethrtime() - start_time));
508
509 /* compose the absolute device path and open it */
510 if (compose_path(sp, mnum, &name[0],
511 sizeof (name)) != 0)
512 goto out;
513 if ((fd = open(name, O_RDWR, 0)) < 0) {
514 md_perror(gettext("Could not open device %s"),
515 name);
516 continue;
517 }
518
519 (void) close(fd);
520 }
521
522 if (mode & UPDATE_ABR) {
523 /*
524 * Update the ABR value on this node. We obtain the
525 * current ABR state from the master node.
526 */
527
528 char *miscname;
529 char name[MAXPATHLEN];
530 int mnum, fd;
531 volcap_t vc;
532 uint_t tstate;
533
534 name[0] = '\0';
535 mnum = meta_getminor(devnp->dev);
536
537 /*
538 * Ensure that we don't include soft-parts in the
539 * mirror-only call to UPDATE_ABR. meta_get_mirror_names
540 * returns a bogus list that includes all soft-parts
541 * built on mirrors.
542 */
543 if ((miscname = metagetmiscname(devnp, ep)) == NULL)
544 goto out;
545 if (mirror_dev && (strcmp(miscname, MD_MIRROR) != 0))
546 continue;
547
548 /* Get tstate from Master */
549 if (meta_mn_send_get_tstate(devnp->dev, &tstate, ep)
550 != 0)
551 continue;
552 /* If not set on the master, nothing to do */
553 if (!(tstate & MD_ABR_CAP))
554 continue;
555
556 meta_mc_log(MC_LOG4, gettext("Updating ABR state "
557 "for %s: %s"), get_mdname(sp, mnum),
558 meta_print_hrtime(gethrtime() - start_time));
559
560 /* compose the absolute device path and open it */
561 if (compose_path(sp, mnum, &name[0],
562 sizeof (name)) != 0)
563 goto out;
564 if ((fd = open(name, O_RDWR, 0)) < 0) {
565 md_perror(gettext("Could not open device %s"),
566 name);
567 continue;
568 }
569
570 /* set ABR state */
571 vc.vc_info = 0;
572 vc.vc_set = 0;
573 if (ioctl(fd, DKIOCGETVOLCAP, &vc) < 0) {
574 /*
575 * Ignore if device does not support this
576 * ioctl
577 */
578 if ((errno != ENOTTY) && (errno != ENOTSUP)) {
579 md_perror(gettext("Could not get "
580 "ABR/DMR state for device %s"),
581 name);
582 }
583 (void) close(fd);
584 continue;
585 }
586 if (!(vc.vc_info & (DKV_ABR_CAP | DKV_DMR_CAP))) {
587 (void) close(fd);
588 continue;
589 }
590
591 vc.vc_set = DKV_ABR_CAP;
592 if (ioctl(fd, DKIOCSETVOLCAP, &vc) < 0) {
593 md_perror(gettext(
594 "Could not set ABR state for "
595 "device %s"), name);
596 (void) close(fd);
597 goto out;
598 } else {
599 md_eprintf(gettext(
600 "Setting ABR state on device %s\n"), name);
601 }
602
603 (void) close(fd);
604 }
605 }
606
607 /* cleanup */
608 if (mode & RESET_OWNER) {
609 meta_free_nodelist(nl);
610 }
611 metafreenamelist(devnlp);
612 return (0);
613
614 out:
615 /* cleanup */
616 if (mode & RESET_OWNER) {
617 meta_free_nodelist(nl);
618 }
619 metafreenamelist(devnlp);
620 return (-1);
621 }
622
623 /*
624 * Print usage message
625 */
626 static void
usage(mdsetname_t * sp,int eval)627 usage(mdsetname_t *sp, int eval)
628 {
629 (void) fprintf(stderr, gettext("usage:"
630 "\t%s [-V version] [-t timeout] [-d level] start localnodeid\n"
631 "\t%s [-V version] [-t timeout] [-d level] step nodelist...\n"
632 "\t%s [-V version] [-t timeout] [-d level] abort | stop\n"
633 "\t%s [-V | -? | -h]\n"),
634 myname, myname, myname, myname);
635 if (!eval) {
636 (void) fprintf(stderr, gettext("\n"
637 "\tValid debug (-d) levels are 1-%d for increasing "
638 "verbosity.\n\tDefault is -d 3.\n\n"
639 "\tValid step values are: return | step1 | step2 | "
640 "step3 | step4\n\n"
641 "\tNodelist is a space-separated list of node id's\n\n"),
642 MAX_DEBUG_LEVEL);
643 }
644 md_exit(sp, eval);
645 }
646
647 /*
648 * Input: Input takes a config step name followed by a list of
649 * possible node id's.
650 *
651 * Returns: 0 - Success
652 * 1 - Fail
653 * Node will be removed from cluster membership
654 * by forcing node to panic.
655 * 205 - Unsuccessful. Start another reconfig cycle.
656 * Problem was encountered that could be fixed by
657 * running another reconfig cycle.
658 * Problem could be a result of a failure to read
659 * the nodelist file or that all work could not be
660 * accomplished in a reconfig step in the amount of
661 * time given so another reconfig cycle is needed in
662 * order to finish the current step.
663 */
664 int
main(int argc,char ** argv)665 main(int argc, char **argv)
666 {
667 mdsetname_t *sp = NULL;
668 md_error_t status = mdnullerror;
669 md_error_t *ep = &status;
670 set_t max_sets, setno;
671 int c, clust = 0;
672 struct sigaction nsa, osa;
673 struct step_t *step_ptr;
674 mdsetname_t *local_sp = NULL;
675 md_drive_desc *dd;
676 int rval = 0;
677 md_set_desc *sd;
678 mddb_block_parm_t mbp;
679 uint_t debug = 3; /* log upto MC_LOG3 by default */
680 int version_table_size;
681 mddb_setflags_config_t sf;
682 int ret_val;
683 mddb_config_t cfg;
684 int set_info[MD_MAXSETS];
685 long commd_timeout = 0;
686
687 /*
688 * Get the locale set up before calling any other routines
689 * with messages to output. Just in case we're not in a build
690 * environment, make sure that TEXT_DOMAIN gets set to
691 * something.
692 */
693 #if !defined(TEXT_DOMAIN)
694 #define TEXT_DOMAIN "SYS_TEST"
695 #endif
696 (void) setlocale(LC_ALL, "");
697 (void) textdomain(TEXT_DOMAIN);
698
699 if ((clust = sdssc_bind_library()) == SDSSC_ERROR) {
700 md_eprintf(gettext("Interface error with libsds_sc.so\n"));
701 exit(1);
702 }
703
704 if (md_init(argc, argv, 1, 1, ep) != 0 || meta_check_root(ep) != 0) {
705 mde_perror(ep, "");
706 md_exit(sp, 1);
707 }
708
709 /*
710 * open log and enable libmeta logging. Do it here explicitly
711 * rather than letting md_init() do it because we are not really
712 * a daemon and that is what md_init() opens the log as.
713 */
714 openlog("metaclust", LOG_CONS, LOG_USER);
715
716 version_table_size = sizeof (version_table) / sizeof (version_table[0]);
717
718 optind = 1;
719 opterr = 0;
720 while ((c = getopt(argc, argv, "hd:V:t:?")) != -1) {
721 switch (c) {
722 case 'h':
723 usage(sp, 0);
724 break;
725
726 case 'd':
727 if (sscanf(optarg, "%u", &debug) != 1) {
728 md_eprintf(gettext("Invalid debug level\n"));
729 md_exit(sp, 1);
730 } else if ((debug < 1) || (debug > MAX_DEBUG_LEVEL)) {
731 debug = min(max(debug, 1), MAX_DEBUG_LEVEL);
732 md_eprintf(gettext("Debug level must be "
733 "between 1 and %d inclusive.\n"),
734 MAX_DEBUG_LEVEL);
735 md_eprintf(gettext("Debug level set to %d.\n"),
736 debug);
737 }
738 break;
739
740 case 'V':
741 version = Strdup(optarg);
742 break;
743
744 case 't':
745 if (sscanf(optarg, "%u", &timeout) != 1) {
746 md_eprintf(gettext("Invalid timeout value\n"));
747 md_exit(sp, 1);
748 }
749 break;
750
751 case '?':
752 if (optopt == '?') {
753 usage(sp, 0);
754 } else if (optopt == 'V') {
755 int i;
756
757 (void) fprintf(stdout, gettext(
758 "%s: Versions Supported:"), myname);
759 for (i = 0; i < version_table_size; i++) {
760 (void) fprintf(stdout, " %s",
761 version_table[i]);
762 }
763 (void) fprintf(stdout, "\n");
764 md_exit(sp, 0);
765 }
766 /*FALLTHROUGH*/
767
768 default:
769 usage(sp, 1);
770 break;
771 }
772 }
773
774 /* initialise the debug level and start time */
775 setup_mc_log(debug);
776
777 /*
778 * check that the version specified (if any) is supported.
779 */
780 if (version != NULL) {
781 int i, found = 0;
782
783 for (i = 0; i < version_table_size; i++) {
784 if (strcmp(version, version_table[i]) == 0) {
785 found = 1;
786 break;
787 }
788 }
789 if (!found) {
790 md_eprintf(gettext("Version %s not supported\n"),
791 version);
792 md_exit(sp, 1);
793 }
794 }
795
796 argc -= optind;
797 argv += optind;
798
799 /* parse arguments */
800 if (argc <= 0) {
801 usage(sp, 1);
802 }
803
804 /* convert the step name to the corresponding number */
805 step_ptr = bsearch(argv[0], step_table, (sizeof (step_table) /
806 sizeof (step_table[0])), sizeof (step_table[0]), mc_compare);
807 if (step_ptr != NULL) {
808 stepnum = step_ptr->step_num;
809 }
810
811 --argc;
812 ++argv;
813
814 /* set timeout alarm signal, a value of 0 will disable timeout */
815 if (timeout > 0) {
816 int stat_loc = 0;
817 commd_timeout = (long)(timeout * .75);
818
819 c_pid = fork();
820
821 if (c_pid == (pid_t)-1) {
822 md_perror(gettext("Unable to fork"));
823 md_exit(sp, 1);
824 } else if (c_pid) {
825 /* parent */
826 nsa.sa_flags = 0;
827 if (sigfillset(&nsa.sa_mask) < 0) {
828 md_perror(gettext("Unable to set signal mask"));
829 md_exit(sp, 1);
830 }
831
832 nsa.sa_handler = sigalarmhandler;
833 if (sigaction(SIGALRM, &nsa, &osa) == -1) {
834 md_perror(gettext("Unable to set alarm "
835 "handler"));
836 md_exit(sp, 1);
837 }
838
839 (void) alarm(timeout);
840
841 /*
842 * wait for child to exit or timeout to expire.
843 * keep retrying if the call is interrupted
844 */
845 while ((ret_val = waitpid(c_pid, &stat_loc, 0)) < 0) {
846 if (errno != EINTR) {
847 break;
848 }
849 }
850 if (ret_val == c_pid) {
851 /* exit with the childs exit value */
852 exit(WEXITSTATUS(stat_loc));
853 } else if (errno == ECHILD) {
854 md_exit(sp, 0);
855 } else {
856 perror(myname);
857 md_exit(sp, 1);
858 }
859 }
860 }
861
862 /*
863 * If a timeout value is given, everything from this point onwards is
864 * executed in the child process.
865 */
866
867 switch (stepnum) {
868 case MC_START:
869 /*
870 * Start Step
871 *
872 * - Suspend all rpc.mdcommd messages
873 */
874
875 /* expect the local node id to be given only */
876 if (argc != 1)
877 usage(sp, 1);
878
879 meta_mc_log(MC_LOG2, gettext("Starting Start step: %s"),
880 meta_print_hrtime(0));
881
882 /*
883 * With multinode disksets configured we need to
884 * update all replicas on all cluster nodes to have
885 * the same status. If local replicas on a cluster
886 * node are not accessible we need to panic this
887 * node, otherwise we abort in the reconfig cycle
888 * and failfast/reboot the "good" cluster node too.
889 * To avoid a total cluster outage in the above case
890 * we panic only the failing node via md_exit(.., 1).
891 */
892 if ((local_sp = load_local_set(ep)) == NULL) {
893 /* panic the node */
894 md_exit(local_sp, 1);
895 }
896
897 if ((max_sets = get_max_sets(ep)) == 0) {
898 mde_perror(ep, "");
899 md_exit(sp, 1);
900 }
901
902 /* start walking through all possible disksets */
903 for (setno = 1; setno < max_sets; setno++) {
904 if ((sp = metasetnosetname(setno, ep)) == NULL) {
905 if (mdiserror(ep, MDE_NO_SET)) {
906 /* No set for this setno - continue */
907 mdclrerror(ep);
908 continue;
909 } else {
910 mde_perror(ep, gettext("Unable to "
911 "get set %d information"), setno);
912 md_exit(sp, 1);
913 }
914 }
915
916 /* only check multi-node disksets */
917 if (!meta_is_mn_set(sp, ep)) {
918 mdclrerror(ep);
919 continue;
920 }
921
922 meta_mc_log(MC_LOG3, gettext("Start - block parse "
923 "messages for set %s: %s"), sp->setname,
924 meta_print_hrtime(gethrtime() - start_time));
925
926 /*
927 * Mddb parse messages are sent amongst the nodes
928 * in a diskset whenever the locator block or
929 * locator names structure has been changed.
930 * A locator block change could occur as a result
931 * of a disk failure during the reconfig cycle,
932 * so block the mddb parse messages while the
933 * rpc.mdcommd is suspended during the reconfig cycle.
934 */
935 if (s_ownset(sp->setno, ep) == MD_SETOWNER_YES) {
936 (void) memset(&mbp, 0, sizeof (mbp));
937 mbp.c_setno = setno;
938 mbp.c_blk_flags = MDDB_BLOCK_PARSE;
939 if (metaioctl(MD_MN_MDDB_BLOCK, &mbp,
940 &mbp.c_mde, NULL)) {
941 (void) mdstealerror(ep, &mbp.c_mde);
942 mde_perror(ep, gettext("Could not "
943 "block set %s"), sp->setname);
944 md_exit(sp, 1);
945 }
946 }
947
948 /* suspend commd and spin waiting for drain */
949 while ((ret_val = mdmn_suspend(setno,
950 MD_COMM_ALL_CLASSES, commd_timeout)) ==
951 MDE_DS_COMMDCTL_SUSPEND_NYD) {
952 (void) sleep(1);
953 }
954
955 if (ret_val) {
956 md_eprintf(gettext("Could not suspend "
957 "rpc.mdcommd for set %s\n"), sp->setname);
958 md_exit(sp, 1);
959 }
960
961 /*
962 * Set start step flag for set. This is set to indicate
963 * that this node entered the reconfig cycle through
964 * the start step. This is used during the reconfig
965 * cycle to determine whether the node had entered
966 * through the start step or the return step.
967 */
968 (void) memset(&sf, 0, sizeof (sf));
969 sf.sf_setno = sp->setno;
970 sf.sf_setflags = MD_SET_MN_START_RC;
971 sf.sf_flags = MDDB_NM_SET;
972 /* Use magic to help protect ioctl against attack. */
973 sf.sf_magic = MDDB_SETFLAGS_MAGIC;
974 if (metaioctl(MD_MN_SET_SETFLAGS, &sf,
975 &sf.sf_mde, NULL)) {
976 (void) mdstealerror(ep, &sf.sf_mde);
977 mde_perror(ep, gettext("Could not set "
978 "start_step flag for set %s"), sp->setname);
979 md_exit(sp, 1);
980 }
981
982 }
983
984 meta_mc_log(MC_LOG2, gettext("Start step completed: %s"),
985 meta_print_hrtime(gethrtime() - start_time));
986
987 break;
988
989 case MC_STOP:
990 /*
991 * Stop Step
992 *
993 * - ???
994 */
995
996 /* don't expect any more arguments to follow the step name */
997 if (argc != 0)
998 usage(sp, 1);
999
1000 break;
1001
1002 case MC_ABORT:
1003 /*
1004 * Abort Step
1005 *
1006 * - Abort rpc.mdcommd
1007 */
1008
1009 /* don't expect any more arguments to follow the step name */
1010 if (argc != 0)
1011 usage(sp, 1);
1012
1013 meta_mc_log(MC_LOG2, gettext("Starting Abort step: %s"),
1014 meta_print_hrtime(0));
1015
1016 /*
1017 * Does local set exist? If not, exit with 0
1018 * since there's no reason to have this node panic if
1019 * the local set cannot be started.
1020 */
1021 if ((local_sp = load_local_set(ep)) == NULL) {
1022 md_exit(local_sp, 0);
1023 }
1024
1025 /*
1026 * abort the rpc.mdcommd. The abort is only issued on this node
1027 * meaning that the abort reconfig step is called on this
1028 * node before a panic while the rest of the cluster will
1029 * undergo a reconfig cycle.
1030 * There is no time relation between this node running a
1031 * reconfig abort and the rest of the cluster
1032 * running a reconfig cycle meaning that this node may
1033 * panic before, during or after the cluster has run
1034 * a reconfig cycle.
1035 */
1036 mdmn_abort();
1037
1038 meta_mc_log(MC_LOG2, gettext("Abort step completed: %s"),
1039 meta_print_hrtime(gethrtime() - start_time));
1040
1041 break;
1042
1043 case MC_RETURN:
1044 /*
1045 * Return Step
1046 *
1047 * - Grab local set lock, issue rpc.mdcommd DRAIN ALL
1048 * and release local set lock. Grabbing the local set
1049 * lock allows any active metaset/metadb commands to
1050 * terminate gracefully and will keep a metaset/metadb
1051 * command from starting until the DRAIN ALL is issued.
1052 * The metaset/metadb commands can issue
1053 * DRAIN ALL/RESUME ALL commands to rpc.mdcommd,
1054 * so the return step must not issue the DRAIN ALL command
1055 * until metaset/metadb have finished or metaset may issue
1056 * a RESUME ALL after this return reconfig step has issued
1057 * the DRAIN ALL command.
1058 * After this reconfig step has issued the DRAIN_ALL and
1059 * released the local set lock, metaset/metadb will fail
1060 * when attempting to contact the rpc.mdcommd and will
1061 * terminate without making any configuration changes.
1062 * The DRAIN ALL command will keep all other meta* commands
1063 * from running during the reconfig cycle (these commands
1064 * will wait until the rpc.mdcommd is resumed) since the
1065 * reconfig cycle may be changing the diskset configuration.
1066 */
1067
1068 /* expect the nodelist to follow the step name */
1069 if (argc < 1)
1070 usage(sp, 1);
1071
1072 meta_mc_log(MC_LOG2, gettext("Starting Return step: %s"),
1073 meta_print_hrtime(0));
1074
1075 /*
1076 * Does local set exist? If not, exit with 0
1077 * since there's no reason to have this node panic if
1078 * the local set cannot be started.
1079 */
1080 if ((local_sp = load_local_set(ep)) == NULL) {
1081 md_exit(local_sp, 0);
1082 }
1083
1084 /*
1085 * Suspend any mirror resyncs that are in progress. This
1086 * stops unnecessary timeouts.
1087 */
1088 meta_mirror_resync_block_all();
1089
1090 if (meta_lock(local_sp, TRUE, ep) != 0) {
1091 mde_perror(ep, "");
1092 md_exit(local_sp, 1);
1093 }
1094
1095 /*
1096 * All metaset and metadb commands on this node have now
1097 * terminated gracefully. Now, issue a drain all to
1098 * the rpc.mdcommd. Any meta command issued after the
1099 * drain all will either spin sending the command to the
1100 * master until after the reconfig cycle has finished OR
1101 * will terminate gracefully (metaset/metadb).
1102 */
1103 if ((max_sets = get_max_sets(ep)) == 0) {
1104 mde_perror(ep, "");
1105 md_exit(sp, 1);
1106 }
1107
1108 /* start walking through all possible disksets */
1109 for (setno = 1; setno < max_sets; setno++) {
1110 if ((sp = metasetnosetname(setno, ep)) == NULL) {
1111 if (mdiserror(ep, MDE_NO_SET)) {
1112 /* No set for this setno - continue */
1113 mdclrerror(ep);
1114 continue;
1115 } else {
1116 mde_perror(ep, gettext("Unable to "
1117 "get set %d information"), setno);
1118 md_exit(sp, 1);
1119 }
1120 }
1121
1122 /* only check multi-node disksets */
1123 if (!meta_is_mn_set(sp, ep)) {
1124 mdclrerror(ep);
1125 continue;
1126 }
1127
1128 meta_mc_log(MC_LOG3, gettext("Return - block parse "
1129 "messages for set %s: %s"), sp->setname,
1130 meta_print_hrtime(gethrtime() - start_time));
1131
1132 /*
1133 * Mddb parse messages are sent amongst the nodes
1134 * in a diskset whenever the locator block or
1135 * locator names structure has been changed.
1136 * A locator block change could occur as a result
1137 * of a disk failure during the reconfig cycle,
1138 * so block the mddb parse messages while the
1139 * rpc.mdcommd is suspended during the reconfig cycle.
1140 */
1141 if (s_ownset(sp->setno, ep) == MD_SETOWNER_YES) {
1142 (void) memset(&mbp, 0, sizeof (mbp));
1143 mbp.c_setno = setno;
1144 mbp.c_blk_flags = MDDB_BLOCK_PARSE;
1145 if (metaioctl(MD_MN_MDDB_BLOCK, &mbp,
1146 &mbp.c_mde, NULL)) {
1147 (void) mdstealerror(ep, &mbp.c_mde);
1148 mde_perror(ep, gettext("Could not "
1149 "block set %s"), sp->setname);
1150 md_exit(sp, 1);
1151 }
1152 }
1153
1154 /* suspend commd and spin waiting for drain */
1155 while ((ret_val = mdmn_suspend(setno,
1156 MD_COMM_ALL_CLASSES, commd_timeout)) ==
1157 MDE_DS_COMMDCTL_SUSPEND_NYD) {
1158 (void) sleep(1);
1159 }
1160
1161 if (ret_val) {
1162 md_eprintf(gettext("Could not suspend "
1163 "rpc.mdcommd for set %s\n"), sp->setname);
1164 md_exit(sp, 1);
1165 }
1166 }
1167 /*
1168 * Resume all I/Os for this node for all MN sets in
1169 * case master node had suspended I/Os but panic'd
1170 * before resuming I/Os. In case of failure, exit
1171 * with a 1 since unable to resume I/Os on this node.
1172 */
1173 if (clnt_mn_susp_res_io(mynode(), 0, MN_RES_IO, ep)) {
1174 mde_perror(ep, gettext(
1175 "Unable to resume I/O on node %s for all sets"),
1176 mynode());
1177 md_exit(sp, 1);
1178 }
1179
1180
1181 /*
1182 * Can now unlock local set lock. New metaset/metadb
1183 * commands are now held off using drain all.
1184 */
1185 (void) meta_unlock(local_sp, ep);
1186
1187 meta_mc_log(MC_LOG2, gettext("Return step completed: %s"),
1188 meta_print_hrtime(gethrtime() - start_time));
1189
1190 break;
1191
1192 case MC_STEP1:
1193 /*
1194 * Step 1
1195 *
1196 * - Populate nodelist file if we are on clustering
1197 * and pick a master node for each MN diskset.
1198 */
1199
1200 /* expect the nodelist to follow the step name */
1201 if (argc < 1)
1202 usage(sp, 1);
1203
1204 meta_mc_log(MC_LOG2, gettext("Starting Step1: %s"),
1205 meta_print_hrtime(0));
1206
1207 /* Always write nodelist file even if no local set exists */
1208 if (clust == SDSSC_OKAY) {
1209 /* skip to the nodelist args */
1210 if (meta_write_nodelist(argc, argv, ep) != 0) {
1211 mde_perror(ep, gettext(
1212 "Could not populate nodelist file"));
1213 md_exit(sp, 1);
1214 }
1215 }
1216
1217 /*
1218 * Does local set exist? If not, exit with 0
1219 * since there's no reason to have this node panic if
1220 * the local set cannot be started.
1221 */
1222 if ((local_sp = load_local_set(ep)) == NULL) {
1223 md_exit(local_sp, 0);
1224 }
1225
1226 /*
1227 * At this point, all meta* commands are blocked across
1228 * all disksets since the master rpc.mdcommd has drained or
1229 * the master node has died.
1230 * If a metaset or metadb command had been in progress
1231 * at the start of the reconfig cycle, this command has
1232 * either completed or it has been terminated due to
1233 * the death of the master node.
1234 *
1235 * This means that that it is now ok to remove any
1236 * outstanding clnt_locks associated with multinode
1237 * disksets on this node due to a node panic during
1238 * a metaset operation. This allows the routines that
1239 * choose the master to use rpc.metad to determine the
1240 * master of the diskset.
1241 */
1242 if (clnt_clr_mnsetlock(mynode(), ep) != 0) {
1243 meta_mc_log(MC_LOG2, gettext("Step1 aborted:"
1244 "clear locks failed %s"),
1245 meta_print_hrtime(gethrtime() - start_time));
1246 md_exit(local_sp, 1);
1247 }
1248
1249 /*
1250 * Call reconfig_choose_master to choose a master for
1251 * each MN diskset, update the nodelist for each diskset
1252 * given the member information and send a reinit message
1253 * to rpc.mdcommd to reload the nodelist.
1254 */
1255 rval = meta_reconfig_choose_master(commd_timeout, ep);
1256 if (rval == 205) {
1257 /*
1258 * NOTE: Should issue call to reboot remote host that
1259 * is causing the RPC failure. Clustering to
1260 * provide interface in the future. This should
1261 * stop a never-ending set of 205 reconfig cycles.
1262 * Remote host causing failure is stored in
1263 * ep->host if ep is an RPC error.
1264 * if (mdanyrpcerror(ep))
1265 * reboot (ep->host);
1266 */
1267 meta_mc_log(MC_LOG2, gettext("Step1 aborted:"
1268 "choose master failure of 205 %s"),
1269 meta_print_hrtime(gethrtime() - start_time));
1270 md_exit(local_sp, 205);
1271 } else if (rval != 0) {
1272 meta_mc_log(MC_LOG2, gettext("Step1 failure: "
1273 "choose master failure %s"),
1274 meta_print_hrtime(gethrtime() - start_time));
1275 md_exit(local_sp, 1);
1276 }
1277
1278 meta_mc_log(MC_LOG2, gettext("Step1 completed: %s"),
1279 meta_print_hrtime(gethrtime() - start_time));
1280
1281 md_exit(local_sp, rval);
1282 break;
1283
1284 case MC_STEP2:
1285 /*
1286 * Step 2
1287 *
1288 * In Step 2, each node walks the list of disksets. If a
1289 * node is a master of a MN diskset, it synchronizes
1290 * the local set USER records for that diskset.
1291 *
1292 * If disks exist in the diskset and there is a joined
1293 * (owner) node in the diskset, the master will also:
1294 * - synchronize the diskset mddbs to the master
1295 * - play the change log
1296 *
1297 * The master node will now attempt to join any unjoined
1298 * nodes that are currently members in the membership list.
1299 */
1300
1301 /* expect the nodelist to follow the step name */
1302 if (argc < 1)
1303 usage(sp, 1);
1304
1305 meta_mc_log(MC_LOG2, gettext("Starting Step2: %s"),
1306 meta_print_hrtime(0));
1307
1308 /*
1309 * Does local set exist? If not, exit with 0
1310 * since there's no reason to have this node panic if
1311 * the local set cannot be started.
1312 */
1313 if ((local_sp = load_local_set(ep)) == NULL) {
1314 md_exit(local_sp, 0);
1315 }
1316
1317 if ((max_sets = get_max_sets(ep)) == 0) {
1318 mde_perror(ep, "");
1319 md_exit(local_sp, 1);
1320 }
1321
1322 /* start walking through all possible disksets */
1323 for (setno = 1; setno < max_sets; setno++) {
1324 if ((sp = metasetnosetname(setno, ep)) == NULL) {
1325 if (mdiserror(ep, MDE_NO_SET)) {
1326 /* No set for this setno - continue */
1327 mdclrerror(ep);
1328 continue;
1329 } else if (mdanyrpcerror(ep)) {
1330 /* Fail on RPC failure to self */
1331 mde_perror(ep, gettext(
1332 "Unable to get information for "
1333 "set number %d"), setno);
1334 md_exit(local_sp, 1);
1335 } else {
1336 mde_perror(ep, gettext(
1337 "Unable to get information for "
1338 "set number %d"), setno);
1339 mdclrerror(ep);
1340 continue;
1341 }
1342 }
1343
1344 if ((sd = metaget_setdesc(sp, ep)) == NULL) {
1345 if (mdanyrpcerror(ep)) {
1346 /* Fail on RPC failure to self */
1347 mde_perror(ep, gettext(
1348 "Unable to get information for "
1349 "set number %d"), setno);
1350 md_exit(local_sp, 1);
1351 }
1352 mde_perror(ep, gettext("Unable to get set "
1353 "%s desc information"), sp->setname);
1354 mdclrerror(ep);
1355 continue;
1356 }
1357
1358 /* Only check MN disksets */
1359 if (!(MD_MNSET_DESC(sd))) {
1360 continue;
1361 }
1362
1363 /* All actions in step 2 are driven by master */
1364 if (!(sd->sd_mn_am_i_master)) {
1365 continue;
1366 }
1367
1368 meta_mc_log(MC_LOG3, gettext("Step2 - begin record "
1369 "synchronization for set %s: %s"), sp->setname,
1370 meta_print_hrtime(gethrtime() - start_time));
1371
1372 /*
1373 * Synchronize the USER records in the local mddbs
1374 * for hosts that are members. The USER records
1375 * contain set, drive and host information.
1376 */
1377 rval = meta_mnsync_user_records(sp, ep);
1378 if (rval != 0) {
1379 mde_perror(ep, gettext(
1380 "Synchronization of user records "
1381 "in set %s failed\n"), sp->setname);
1382 if (rval == 205) {
1383 /*
1384 * NOTE: Should issue call to reboot
1385 * remote host that is causing the RPC
1386 * failure. Clustering to provide
1387 * interface in the future. This
1388 * should stop a never-ending set of
1389 * 205 reconfig cycles.
1390 * Remote host causing failure is
1391 * stored in ep->host if ep is an
1392 * RPC error.
1393 * if (mdanyrpcerror(ep))
1394 * reboot (ep->host);
1395 */
1396 md_exit(local_sp, 205);
1397 } else {
1398 md_exit(local_sp, 1);
1399 }
1400 }
1401
1402 /* Reget sd since sync_user_recs may have flushed it */
1403 if ((sd = metaget_setdesc(sp, ep)) == NULL) {
1404 mde_perror(ep, gettext("Unable to get set "
1405 "%s desc information"), sp->setname);
1406 md_exit(local_sp, 1);
1407 }
1408
1409 dd = metaget_drivedesc(sp,
1410 (MD_BASICNAME_OK | PRINT_FAST), ep);
1411 if (! mdisok(ep)) {
1412 mde_perror(ep, gettext("Unable to get set "
1413 "%s drive information"), sp->setname);
1414 md_exit(local_sp, 1);
1415 }
1416
1417 /*
1418 * No drives in set, continue to next set.
1419 */
1420 if (dd == NULL) {
1421 /* Done with this set */
1422 continue;
1423 }
1424
1425 meta_mc_log(MC_LOG3, gettext("Step2 - local set user "
1426 "records completed for set %s: %s"), sp->setname,
1427 meta_print_hrtime(gethrtime() - start_time));
1428
1429 /*
1430 * Synchronize the diskset mddbs for hosts
1431 * that are members. This may involve
1432 * playing the changelog and writing out
1433 * to the diskset mddbs.
1434 */
1435 rval = meta_mnsync_diskset_mddbs(sp, ep);
1436 if (rval != 0) {
1437 mde_perror(ep, gettext(
1438 "Synchronization of diskset mddbs "
1439 "in set %s failed\n"), sp->setname);
1440 meta_mc_log(MC_LOG3, gettext("Step2 - diskset "
1441 "mddb synchronization failed for "
1442 "set %s: %s"), sp->setname,
1443 meta_print_hrtime(gethrtime() -
1444 start_time));
1445 if (rval == 205) {
1446 /*
1447 * NOTE: Should issue call to reboot
1448 * remote host that is causing the RPC
1449 * failure. Clustering to provide
1450 * interface in the future. This
1451 * should stop a never-ending set of
1452 * 205 reconfig cycles.
1453 * Remote host causing failure is
1454 * stored in ep->host if ep is an
1455 * RPC error.
1456 * if (mdanyrpcerror(ep))
1457 * reboot (ep->host);
1458 */
1459 md_exit(local_sp, 205);
1460 } else if (rval == 1) {
1461 continue;
1462 } else {
1463 md_exit(local_sp, 1);
1464 }
1465 }
1466
1467 meta_mc_log(MC_LOG3, gettext("Step2 - diskset mddb "
1468 "synchronization completed for set %s: %s"),
1469 sp->setname,
1470 meta_print_hrtime(gethrtime() - start_time));
1471
1472 /* Join the starting nodes to the diskset */
1473 rval = meta_mnjoin_all(sp, ep);
1474 if (rval != 0) {
1475 mde_perror(ep, gettext(
1476 "Join of non-owner (starting) nodes "
1477 "in set %s failed\n"), sp->setname);
1478 meta_mc_log(MC_LOG3, gettext("Step2 - non owner"
1479 "nodes joined for set %s: %s"),
1480 sp->setname,
1481 meta_print_hrtime(gethrtime() -
1482 start_time));
1483 if (rval == 205) {
1484 /*
1485 * NOTE: Should issue call to reboot
1486 * remote host that is causing the RPC
1487 * failure. Clustering to provide
1488 * interface in the future. This
1489 * should stop a never-ending set of
1490 * 205 reconfig cycles.
1491 * Remote host causing failure is
1492 * stored in ep->host if ep is an
1493 * RPC error.
1494 * if (mdanyrpcerror(ep))
1495 * reboot (ep->host);
1496 */
1497 md_exit(local_sp, 205);
1498 } else {
1499 md_exit(local_sp, 1);
1500 }
1501 }
1502
1503 meta_mc_log(MC_LOG3, gettext("Step2 - non owner nodes "
1504 "joined for set %s: %s"), sp->setname,
1505 meta_print_hrtime(gethrtime() - start_time));
1506
1507 }
1508
1509 meta_mc_log(MC_LOG2, gettext("Step2 completed: %s"),
1510 meta_print_hrtime(gethrtime() - start_time));
1511
1512 break;
1513
1514 case MC_STEP3:
1515 /*
1516 * Step 3
1517 *
1518 * For all multinode sets do,
1519 * - Reinitialise rpc.mdcommd
1520 * - Reset mirror owners to null if the current owner is
1521 * no longer in the membership list
1522 */
1523
1524 /* expect the nodelist to follow the step name */
1525 if (argc < 1)
1526 usage(sp, 1);
1527
1528 meta_mc_log(MC_LOG2, gettext("Starting Step3: %s"),
1529 meta_print_hrtime(0));
1530
1531 /*
1532 * Does local set exist? If not, exit with 0
1533 * since there's no reason to have this node panic if
1534 * the local set cannot be started.
1535 */
1536 if ((local_sp = load_local_set(ep)) == NULL) {
1537 md_exit(local_sp, 0);
1538 }
1539
1540 /*
1541 * walk through all sets on this node which could include:
1542 * - MN disksets
1543 * - traditional disksets
1544 * - non-existent disksets
1545 * start mirror resync for all MN sets
1546 */
1547 if ((max_sets = get_max_sets(ep)) == 0) {
1548 mde_perror(ep, "");
1549 md_exit(local_sp, 1);
1550 }
1551
1552 /* start walking through all possible disksets */
1553 for (setno = 1; setno < max_sets; setno++) {
1554 if ((sp = metasetnosetname(setno, ep)) == NULL) {
1555 if (mdiserror(ep, MDE_NO_SET)) {
1556 /* No set for this setno - continue */
1557 mdclrerror(ep);
1558 continue;
1559 } else {
1560 mde_perror(ep, gettext("Unable to "
1561 "get set %d information"), setno);
1562 md_exit(local_sp, 1);
1563 }
1564 }
1565
1566 /* only check multi-node disksets */
1567 if (!meta_is_mn_set(sp, ep)) {
1568 mdclrerror(ep);
1569 continue;
1570 }
1571
1572 if (meta_lock(sp, TRUE, ep) != 0) {
1573 mde_perror(ep, "");
1574 md_exit(local_sp, 1);
1575 }
1576
1577 /* If this node isn't joined to set, do nothing */
1578 if (s_ownset(sp->setno, ep) != MD_SETOWNER_YES) {
1579 if (!mdisok(ep)) {
1580 mde_perror(ep, gettext("Could "
1581 "not get set %s ownership"),
1582 sp->setname);
1583 md_exit(sp, 1);
1584 }
1585 mdclrerror(ep);
1586 (void) meta_unlock(sp, ep);
1587 continue;
1588 }
1589
1590 meta_mc_log(MC_LOG3, gettext("Step3 - begin "
1591 "re-initialising rpc.mdcommd and resetting mirror "
1592 "owners for set %s: %s"), sp->setname,
1593 meta_print_hrtime(gethrtime() - start_time));
1594
1595 /* reinitialzse rpc.mdcommd with new nodelist */
1596 if (mdmn_reinit_set(setno, commd_timeout)) {
1597 md_eprintf(gettext(
1598 "Could not re-initialise rpc.mdcommd for "
1599 "set %s\n"), sp->setname);
1600 md_exit(sp, 1);
1601 }
1602
1603 (void) memset(&cfg, 0, sizeof (cfg));
1604 cfg.c_id = 0;
1605 cfg.c_setno = sp->setno;
1606 if (metaioctl(MD_DB_GETDEV, &cfg, &cfg.c_mde,
1607 NULL) != 0) {
1608 (void) mdstealerror(ep, &cfg.c_mde);
1609 mde_perror(ep, gettext("Could "
1610 "not get set %s information"),
1611 sp->setname);
1612 md_exit(sp, 1);
1613 }
1614
1615 /* Don't do anything else if set is stale */
1616 if (cfg.c_flags & MDDB_C_STALE) {
1617 (void) meta_unlock(sp, ep);
1618 mdclrerror(ep);
1619 continue;
1620 }
1621
1622 /* reset mirror owners */
1623 if (reset_state(RESET_OWNER, sp, MD_MIRROR, ep) == -1) {
1624 md_exit(sp, 1);
1625 }
1626
1627 (void) meta_unlock(sp, ep);
1628
1629 meta_mc_log(MC_LOG3, gettext("Step3 - rpc.mdcommd "
1630 "re-initialised and mirror owners reset for "
1631 "set %s: %s"), sp->setname,
1632 meta_print_hrtime(gethrtime() - start_time));
1633 }
1634
1635 meta_mc_log(MC_LOG2, gettext("Step3 completed: %s"),
1636 meta_print_hrtime(gethrtime() - start_time));
1637
1638 break;
1639
1640 case MC_STEP4:
1641 /*
1642 * Step 4
1643 *
1644 * For all multinode sets do:
1645 * - Resume the rpc.mdcommd messages. Must resume all
1646 * sets before issuing I/O to any set since an error
1647 * encountered in a commd suspended set could be
1648 * blocked waiting for commd in another set to resume.
1649 * (This happens since the daemon queues service
1650 * all sets). An open of a soft partition causes
1651 * a read of the watermarks during the open.
1652 * - If set is non-writable (not an owner or STALE), then
1653 * continue to next set.
1654 *
1655 * For all multinode sets do,
1656 * - Reset ABR states for all mirrors, ie clear ABR if not
1657 * open on any node.
1658 * - Reset ABR states for all soft partitions, ie clear ABR if
1659 * not open on any node.
1660 * - For all slave nodes that have entered through the start
1661 * step, update the ABR state to that of the master and
1662 * get the submirror state from the master
1663 * - meta_lock set
1664 * - Resync all mirrors
1665 * - unlock meta_lock for this set.
1666 * - Choose a new owner for any orphaned resyncs
1667 *
1668 * There is one potential issue here. when concurrently
1669 * resetting and updating the ABR state. If the master has ABR
1670 * set, but should no longer have because the only node that
1671 * had the metadevice open and had ABR set has paniced, the
1672 * master will send a message to all nodes to clear the ABR
1673 * state. Meanwhile any node that has come through the
1674 * start step will get tstate from the master and will update
1675 * ABR if it was set in tstate. So, we appear to have a problem
1676 * if the following sequence occurs:-
1677 * - The slave gets tstate with ABR set
1678 * - The master sends a message to clear ABR
1679 * - The slave updates ABR with the value it got from tstate.
1680 * We now have the master with ABR clear and the slave with ABR
1681 * set. Fortunately, having set ABR, the slave will close the
1682 * metadevice after setting ABR and as there are no nodes with
1683 * the device open, the close will send a message to clear ABR
1684 * on all nodes. So, the nodes will all have ABR unset.
1685 */
1686
1687 /* expect the nodelist to follow the step name */
1688 if (argc < 1)
1689 usage(sp, 1);
1690
1691 meta_mc_log(MC_LOG2, gettext("Starting Step4: %s"),
1692 meta_print_hrtime(0));
1693
1694 /*
1695 * Does local set exist? If not, exit with 0
1696 * since there's no reason to have this node panic if
1697 * the local set cannot be started.
1698 */
1699 if ((local_sp = load_local_set(ep)) == NULL) {
1700 md_exit(local_sp, 0);
1701 }
1702
1703 /*
1704 * walk through all sets on this node which could include:
1705 * - MN disksets
1706 * - traditional disksets
1707 * - non-existent disksets
1708 * start mirror resync for all MN sets
1709 */
1710 if ((max_sets = get_max_sets(ep)) == 0) {
1711 mde_perror(ep, "");
1712 md_exit(local_sp, 1);
1713 }
1714
1715 /* Clear set_info structure */
1716 for (setno = 1; setno < max_sets; setno++) {
1717 set_info[setno] = 0;
1718 }
1719
1720 /* start walking through all possible disksets */
1721 for (setno = 1; setno < max_sets; setno++) {
1722 if ((sp = metasetnosetname(setno, ep)) == NULL) {
1723 if (mdiserror(ep, MDE_NO_SET)) {
1724 /* No set for this setno - continue */
1725 mdclrerror(ep);
1726 continue;
1727 } else {
1728 mde_perror(ep, gettext("Unable to "
1729 "get set %d information"), setno);
1730 md_exit(local_sp, 1);
1731 }
1732 }
1733
1734 if ((sd = metaget_setdesc(sp, ep)) == NULL) {
1735 mde_perror(ep, gettext("Unable to get set "
1736 "%s desc information"), sp->setname);
1737 mdclrerror(ep);
1738 continue;
1739 }
1740
1741 /* only check multi-node disksets */
1742 if (!meta_is_mn_set(sp, ep)) {
1743 mdclrerror(ep);
1744 continue;
1745 }
1746
1747 set_info[setno] |= SET_INFO_MN;
1748
1749 /*
1750 * If not an owner (all mddbs failed) or stale
1751 * (< 50% mddbs operational), then set is
1752 * non-writable so just resume commd and
1753 * unblock mddb messages.
1754 */
1755 mdclrerror(ep);
1756 if (s_ownset(sp->setno, ep) != MD_SETOWNER_YES) {
1757 set_info[setno] |= SET_INFO_NO_WR;
1758 }
1759 if (!mdisok(ep)) {
1760 mde_perror(ep, gettext("Could "
1761 "not get set %s ownership"),
1762 sp->setname);
1763 md_exit(local_sp, 1);
1764 }
1765 /* Set is owned - is it stale? */
1766 if (!set_info[setno] & SET_INFO_NO_WR) {
1767 (void) memset(&cfg, 0, sizeof (cfg));
1768 cfg.c_id = 0;
1769 cfg.c_setno = sp->setno;
1770 if (metaioctl(MD_DB_GETDEV, &cfg, &cfg.c_mde,
1771 NULL) != 0) {
1772 (void) mdstealerror(ep, &cfg.c_mde);
1773 mde_perror(ep, gettext("Could "
1774 "not get set %s information"),
1775 sp->setname);
1776 md_exit(local_sp, 1);
1777 }
1778 if (cfg.c_flags & MDDB_C_STALE) {
1779 set_info[setno] |= SET_INFO_NO_WR;
1780 }
1781 }
1782
1783 /* resume rpc.mdcommd */
1784 if (mdmn_resume(setno, MD_COMM_ALL_CLASSES, 0,
1785 commd_timeout)) {
1786 md_eprintf(gettext("Unable to resume "
1787 "rpc.mdcommd for set %s\n"), sp->setname);
1788 md_exit(local_sp, 1);
1789 }
1790
1791 /* Unblock mddb parse messages */
1792 if (s_ownset(sp->setno, ep) == MD_SETOWNER_YES) {
1793 (void) memset(&mbp, 0, sizeof (mbp));
1794 mbp.c_setno = setno;
1795 mbp.c_blk_flags = MDDB_UNBLOCK_PARSE;
1796 if (metaioctl(MD_MN_MDDB_BLOCK, &mbp,
1797 &mbp.c_mde, NULL)) {
1798 (void) mdstealerror(ep, &mbp.c_mde);
1799 mde_perror(ep, gettext("Could not "
1800 "unblock set %s"), sp->setname);
1801 md_exit(local_sp, 1);
1802 }
1803 }
1804 meta_mc_log(MC_LOG3, gettext("Step4 - rpc.mdcommd "
1805 "resumed and messages unblocked for set %s: %s"),
1806 sp->setname,
1807 meta_print_hrtime(gethrtime() - start_time));
1808 }
1809
1810 for (setno = 1; setno < max_sets; setno++) {
1811 int start_step;
1812
1813 /* Skip traditional disksets. */
1814 if ((set_info[setno] & SET_INFO_MN) == 0)
1815 continue;
1816
1817 /*
1818 * If already determined that this set is
1819 * a non-writable set, then just continue
1820 * to next set since there's nothing else
1821 * to do for a non-writable set.
1822 */
1823 if (set_info[setno] & SET_INFO_NO_WR)
1824 continue;
1825
1826 if ((sp = metasetnosetname(setno, ep)) == NULL) {
1827 if (mdiserror(ep, MDE_NO_SET)) {
1828 /* No set for this setno - continue */
1829 mdclrerror(ep);
1830 continue;
1831 } else {
1832 mde_perror(ep, gettext("Unable to "
1833 "get set %d information"), setno);
1834 md_exit(local_sp, 1);
1835 }
1836 }
1837
1838 if ((sd = metaget_setdesc(sp, ep)) == NULL) {
1839 mde_perror(ep, gettext("Unable to get set "
1840 "%s desc information"), sp->setname);
1841 mdclrerror(ep);
1842 continue;
1843 }
1844
1845 /* See if this node came through the start step */
1846 (void) memset(&sf, 0, sizeof (sf));
1847 sf.sf_setno = sp->setno;
1848 sf.sf_flags = MDDB_NM_GET;
1849 /* Use magic to help protect ioctl against attack. */
1850 sf.sf_magic = MDDB_SETFLAGS_MAGIC;
1851 if (metaioctl(MD_MN_GET_SETFLAGS, &sf,
1852 &sf.sf_mde, NULL)) {
1853 (void) mdstealerror(ep, &sf.sf_mde);
1854 mde_perror(ep, gettext("Could not get "
1855 "start_step flag for set %s"), sp->setname);
1856 md_exit(local_sp, 1);
1857 }
1858 start_step =
1859 (sf.sf_setflags & MD_SET_MN_START_RC)? 1: 0;
1860
1861 /*
1862 * We can now reset the start_step flag for the set
1863 * if it was already set.
1864 */
1865 if (start_step) {
1866 (void) memset(&sf, 0, sizeof (sf));
1867 sf.sf_setno = sp->setno;
1868 sf.sf_setflags = MD_SET_MN_START_RC;
1869 sf.sf_flags = MDDB_NM_RESET;
1870 /*
1871 * Use magic to help protect ioctl
1872 * against attack.
1873 */
1874 sf.sf_magic = MDDB_SETFLAGS_MAGIC;
1875 if (metaioctl(MD_MN_SET_SETFLAGS, &sf,
1876 &sf.sf_mde, NULL)) {
1877 (void) mdstealerror(ep, &sf.sf_mde);
1878 mde_perror(ep,
1879 gettext("Could not reset "
1880 "start_step flag for set %s"),
1881 sp->setname);
1882 }
1883 }
1884
1885 meta_mc_log(MC_LOG3, gettext("Step4 - begin setting "
1886 "ABR state and restarting io's for "
1887 "set %s: %s"), sp->setname,
1888 meta_print_hrtime(gethrtime() - start_time));
1889
1890
1891 /*
1892 * If we are not the master and we have come through
1893 * the start step, we must update the ABR states
1894 * for mirrors and soft partitions. Also the submirror
1895 * states need to be synchronised so that we see the
1896 * same status as other previously joined members.
1897 * This _must_ be done before starting the resync.
1898 */
1899 if (!(sd->sd_mn_am_i_master) && start_step) {
1900 if (reset_state(GET_MIRROR_STATE, sp, MD_MIRROR,
1901 ep) == -1) {
1902 md_exit(local_sp, 1);
1903 }
1904 if (reset_state(UPDATE_ABR, sp, MD_SP,
1905 ep) == -1) {
1906 md_exit(local_sp, 1);
1907 }
1908 /*
1909 * Mark the fact that we've got the mirror
1910 * state. This allows the resync thread to
1911 * determine if _it_ needs to issue this. This
1912 * can happen if a node is added to a set after
1913 * a reconfig cycle has completed.
1914 */
1915 (void) memset(&sf, 0, sizeof (sf));
1916 sf.sf_setno = sp->setno;
1917 sf.sf_setflags = MD_SET_MN_MIR_STATE_RC;
1918 sf.sf_flags = MDDB_NM_SET;
1919 /*
1920 * Use magic to help protect ioctl
1921 * against attack.
1922 */
1923 sf.sf_magic = MDDB_SETFLAGS_MAGIC;
1924 if (metaioctl(MD_MN_SET_SETFLAGS, &sf,
1925 &sf.sf_mde, NULL)) {
1926 (void) mdstealerror(ep, &sf.sf_mde);
1927 mde_perror(ep,
1928 gettext("Could not set "
1929 "submirror state flag for set %s"),
1930 sp->setname);
1931 }
1932 }
1933
1934 /*
1935 * All remaining actions are only performed by the
1936 * master
1937 */
1938 if (!(sd->sd_mn_am_i_master)) {
1939 if (meta_lock(sp, TRUE, ep) != 0) {
1940 mde_perror(ep, "");
1941 md_exit(local_sp, 1);
1942 }
1943 meta_mirror_resync_unblock(sp);
1944 (void) meta_unlock(sp, ep);
1945 continue;
1946 }
1947
1948 /*
1949 * If the master came through the start step, this
1950 * implies that all of the nodes must have done the
1951 * same and hence there can be no applications
1952 * running. Hence no need to reset ABR
1953 */
1954 if (!start_step) {
1955 /* Reset ABR state for mirrors */
1956 if (reset_state(RESET_ABR, sp, MD_MIRROR,
1957 ep) == -1) {
1958 md_exit(local_sp, 1);
1959 }
1960 /* ...and now the same for soft partitions */
1961 if (reset_state(RESET_ABR, sp, MD_SP,
1962 ep) == -1) {
1963 md_exit(local_sp, 1);
1964 }
1965 }
1966
1967 /*
1968 * choose owners for orphaned resyncs and reset
1969 * non-orphaned resyncs so that an owner node that
1970 * reboots will restart the resync if needed.
1971 */
1972 if (reset_state(CHOOSE_OWNER, sp, MD_MIRROR, ep) == -1)
1973 md_exit(local_sp, 1);
1974
1975 /*
1976 * Must unlock set lock before meta_mirror_resync_all
1977 * sends a message to run the metasync command
1978 * which also grabs the meta_lock.
1979 */
1980 if (meta_lock(sp, TRUE, ep) != 0) {
1981 mde_perror(ep, "");
1982 md_exit(local_sp, 1);
1983 }
1984 meta_mirror_resync_unblock(sp);
1985 (void) meta_unlock(sp, ep);
1986
1987 /* resync all mirrors in set */
1988 if (meta_mirror_resync_all(sp, 0, ep) != 0) {
1989 mde_perror(ep, gettext("Mirror resyncs "
1990 "failed for set %s"), sp->setname);
1991 md_exit(local_sp, 1);
1992 }
1993
1994 meta_mc_log(MC_LOG3, gettext("Step4 - io's restarted "
1995 "for set %s: %s"), sp->setname,
1996 meta_print_hrtime(gethrtime() - start_time));
1997 }
1998
1999 meta_mc_log(MC_LOG2, gettext("Step4 completed: %s"),
2000 meta_print_hrtime(gethrtime() - start_time));
2001
2002 break;
2003
2004 default:
2005 usage(sp, 1);
2006 break;
2007 }
2008
2009 md_exit(sp, 0);
2010 /* NOTREACHED */
2011 return (0);
2012 }
2013