1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
23 */
24
25
26 /*
27 * Just in case we're not in a build environment, make sure that
28 * TEXT_DOMAIN gets set to something.
29 */
30 #if !defined(TEXT_DOMAIN)
31 #define TEXT_DOMAIN "SYS_TEST"
32 #endif
33
34 #include <meta.h>
35 #include <sdssc.h>
36 #include <arpa/inet.h>
37 #include <sys/lvm/md_mddb.h>
38
/*
 * Size of the fixed buffers used in this file for node list lines and for
 * the node name and address strings written to the node list file.
 */
#define	MAX_LINE_SIZE 1024

/*
 * Maximum amount of time, in seconds, to spend waiting for an ownership
 * change to complete.  The value is compared against a difference of
 * gettimeofday() tv_sec values.
 */
static const int OWNER_TIMEOUT = 3;
45
46 /*
47 * FUNCTION: meta_is_mn_set()
48 * INPUT: sp - the set name
49 * OUTPUT: ep - return error pointer
50 * RETURNS: int - 1 if MultiNode set else 0
51 * PURPOSE: checks if the set is a MultiNode set
52 */
53 int
meta_is_mn_set(mdsetname_t * sp,md_error_t * ep)54 meta_is_mn_set(
55 mdsetname_t *sp,
56 md_error_t *ep
57 )
58 {
59 md_set_desc *sd;
60
61 /* Local set cannot be MultiNode */
62 if ((sp == NULL) || (sp->setname == NULL) ||
63 (strcmp(sp->setname, MD_LOCAL_NAME) == 0))
64 return (0);
65 sd = metaget_setdesc(sp, ep);
66
67 /*
68 * sd can be NULL if there is a difference between
69 * the setrecords and the setlistp caches. This can happen
70 * if this function is called while a set is being
71 * removed during a cluster reconfiguration.
72 */
73 if (sd == NULL)
74 return (0);
75 if (sd->sd_flags & MD_SR_MN)
76 return (1);
77 return (0);
78 }
79
80 /*
81 * FUNCTION: meta_is_mn_name()
82 * INPUT: spp - ptr to the set name, if NULL the setname is derived
83 * from the metadevice name (eg set/d10 )
84 * name - the metadevice/hsp name
85 * OUTPUT: ep - return error pointer
86 * RETURNS: int - 1 if MultiNode set else 0
87 * PURPOSE: checks if the metadevice is in a MultiNode set
88 */
89 int
meta_is_mn_name(mdsetname_t ** spp,char * name,md_error_t * ep)90 meta_is_mn_name(
91 mdsetname_t **spp,
92 char *name,
93 md_error_t *ep
94 )
95 {
96 if (*spp == NULL) {
97 char *cname;
98
99 /*
100 * if the setname is specified in uname and *spp is
101 * not set, then it is setup using that set name value.
102 * If *spp is set and a setname specified in uname and
103 * the set names don't agree then cname will be
104 * returned as NULL
105 */
106 cname = meta_canonicalize_check_set(spp, name, ep);
107 if (cname == NULL) {
108 mdclrerror(ep);
109 return (0);
110 }
111
112 Free(cname);
113 }
114
115 if ((strcmp((*spp)->setname, MD_LOCAL_NAME) != 0) &&
116 (metaget_setdesc(*spp, ep) != NULL) &&
117 ((*spp)->setdesc->sd_flags & MD_SR_MN)) {
118 return (1);
119 }
120 return (0);
121 }
122
123 /*
124 * meta_ping_mnset(set_t setno)
125 * Send a test message for this set in order to make commd do some init stuff
126 * Don't bother changelog.
127 * If set is suspended, fail immediately.
128 */
129 void
meta_ping_mnset(set_t setno)130 meta_ping_mnset(set_t setno)
131 {
132 char *data = "test";
133 md_error_t mde = mdnullerror;
134 md_mn_result_t *resp = NULL;
135
136 (void) mdmn_send_message(setno, MD_MN_MSG_TEST2,
137 MD_MSGF_NO_LOG | MD_MSGF_FAIL_ON_SUSPEND, 0, data,
138 sizeof (data), &resp, &mde);
139
140 if (resp != (md_mn_result_t *)NULL) {
141 free_result(resp);
142 }
143 }
144
145 /*
146 *
147 * FUNCTION: print_stderr
148 * INPUT: errstr - the error message returned by the command
149 * context - the context string from metainit -a
150 * PURPOSE: called from meta_mn_send_command to print the error message
151 * to stderr. When context is NO_CONTEXT_STRING, the errstr string
152 * is output unchanged. When context is a string, it is the context
153 * string for the metainit -a command and in this case the errstr
154 * string has to be parsed to extract the command and node name
155 * and to send a message to stderr in the format
156 * command: node: context: error message
157 */
158 static void
print_stderr(char * errstr,char * context)159 print_stderr(
160 char *errstr,
161 char *context
162 )
163 {
164 char *command;
165 char *node;
166 char *message;
167 int length = strlen(errstr + 1);
168
169 if (context == NO_CONTEXT_STRING) {
170 (void) fprintf(stderr, "%s", errstr);
171 } else {
172 command = Malloc(length);
173 node = Malloc(length);
174 message = Malloc(length);
175 if (sscanf(errstr, "%[^:]: %[^:]: %[^\n]", command, node,
176 message) == 3) {
177 (void) fprintf(stderr, "%s: %s: %s: %s\n", command,
178 node, context, message);
179 } else {
180 (void) fprintf(stderr, dgettext(TEXT_DOMAIN,
181 "%s: Invalid format error message"), errstr);
182 }
183 Free(command);
184 Free(node);
185 Free(message);
186 }
187 }
188
189 /*
190 * FUNCTION: meta_mn_send_command()
191 * INPUT: sp - the set name
192 * argc - number of arguments
193 * argv - arg list
194 * flags - some controlling flags
195 * initall_context - context string for metainit -a
196 * OUTPUT: ep - return error pointer
197 * RETURNS: return exitval from mdmn_send_message
198 * PURPOSE: sends the command to the master node for execution
199 */
200 int
meta_mn_send_command(mdsetname_t * sp,int argc,char * argv[],int flags,char * initall_context,md_error_t * ep)201 meta_mn_send_command(
202 mdsetname_t *sp,
203 int argc,
204 char *argv[],
205 int flags,
206 char *initall_context,
207 md_error_t *ep
208 )
209 {
210 int a;
211 int err;
212 int retval;
213 int send_message_flags = MD_MSGF_DEFAULT_FLAGS;
214 int send_message_type;
215 char *cmd;
216 md_mn_result_t *resp = NULL;
217
218 cmd = Malloc(1024);
219 (void) strlcpy(cmd, argv[0], 1024);
220 for (a = 1; a < argc; a++) {
221 /* don't copy empty arguments */
222 if (*argv[a] == '\0') {
223 continue;
224 }
225 (void) strcat(cmd, " ");
226 (void) strcat(cmd, argv[a]);
227 }
228 /*
229 * in dryrun mode stop on the first error
230 * use the CMD_RETRY message type if RETRY_BUSY flag set
231 */
232 if (flags & MD_DRYRUN)
233 send_message_flags |= MD_MSGF_STOP_ON_ERROR;
234 if (flags & MD_NOLOG)
235 send_message_flags |= MD_MSGF_NO_LOG;
236 if (flags & MD_PANIC_WHEN_INCONSISTENT)
237 send_message_flags |= MD_MSGF_PANIC_WHEN_INCONSISTENT;
238 if (flags & MD_RETRY_BUSY) {
239 send_message_type = MD_MN_MSG_BC_CMD_RETRY;
240 } else {
241 send_message_type = MD_MN_MSG_BC_CMD;
242 }
243 err = mdmn_send_message(
244 sp->setno, send_message_type, send_message_flags, 0,
245 cmd, 1024, &resp, ep);
246
247 free(cmd);
248
249 if (err == 0) {
250 /*
251 * stderr may be turned off by IGNORE_STDERR
252 * In dryrun we only print stderr if the exit_val is non-zero
253 */
254 if ((resp->mmr_err_size != 0) &&
255 ((flags & MD_IGNORE_STDERR) == 0)) {
256 if (((flags & MD_DRYRUN) == 0) ||
257 (resp->mmr_exitval != 0)) {
258 print_stderr(resp->mmr_err, initall_context);
259 }
260 }
261
262 /*
263 * If dryrun is set, we don't display stdout,
264 * because the real run has yet to follow.
265 */
266 if (((flags & MD_DRYRUN) == 0) && (resp->mmr_out_size != 0)) {
267 (void) printf("%s", resp->mmr_out);
268 }
269 retval = resp->mmr_exitval;
270 free_result(resp);
271 return (retval);
272 }
273 if (resp != NULL) {
274 if (resp->mmr_comm_state == MDMNE_CLASS_BUSY) {
275 (void) fprintf(stderr, dgettext(TEXT_DOMAIN,
276 "rpc.mdcommd currently busy. "
277 "Retry operation later.\n"));
278 } else if (resp->mmr_comm_state == MDMNE_NOT_JOINED) {
279 (void) fprintf(stderr, dgettext(TEXT_DOMAIN,
280 "Node %s must join the %s multi-owner diskset to "
281 "issue commands.\n"
282 "To join, use: metaset -s %s -j\n"),
283 mynode(), sp->setname, sp->setname);
284 } else if (resp->mmr_comm_state == MDMNE_LOG_FAIL) {
285 mddb_config_t c;
286
287 (void) memset(&c, 0, sizeof (c));
288 c.c_setno = sp->setno;
289 (void) metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL);
290 (void) fprintf(stderr, dgettext(TEXT_DOMAIN,
291 "Command not attempted: Unable to log message "
292 "in set %s\n"), sp->setname);
293 if (c.c_flags & MDDB_C_STALE) {
294 (void) mdmddberror(ep, MDE_DB_STALE,
295 (minor_t)NODEV64, sp->setno, 0, NULL);
296 mde_perror(ep, "");
297 }
298 } else {
299 (void) fprintf(stderr, dgettext(TEXT_DOMAIN,
300 "Command failed: Commd State %d "
301 "encountered.\n"), resp->mmr_comm_state);
302 }
303 free_result(resp);
304 } else {
305 (void) fprintf(stderr, dgettext(TEXT_DOMAIN,
306 "Command failed: mdmn_send_message returned %d.\n"),
307 err);
308 }
309
310
311 return (1);
312 }
313
314 /*
315 * FUNCTION: meta_mn_send_suspend_writes()
316 * INPUT: mnum - minor num of mirror
317 * OUTPUT: ep - return error pointer
318 * RETURNS: return value from mdmn_send_message()
319 * PURPOSE: sends message to all nodes to suspend writes to the mirror.
320 */
321 int
meta_mn_send_suspend_writes(minor_t mnum,md_error_t * ep)322 meta_mn_send_suspend_writes(
323 minor_t mnum,
324 md_error_t *ep
325 )
326 {
327 int result;
328 md_mn_msg_suspwr_t suspwrmsg;
329 md_mn_result_t *resp = NULL;
330
331 suspwrmsg.msg_suspwr_mnum = mnum;
332 /*
333 * This message is never directly issued.
334 * So we launch it with a suspend override flag.
335 * If the commd is suspended, and this message comes
336 * along it must be sent due to replaying a command or similar.
337 * In that case we don't want this message to be blocked.
338 * If the commd is not suspended, the flag does no harm.
339 */
340 result = mdmn_send_message(MD_MIN2SET(mnum),
341 MD_MN_MSG_SUSPEND_WRITES,
342 MD_MSGF_NO_LOG | MD_MSGF_OVERRIDE_SUSPEND, 0,
343 (char *)&suspwrmsg, sizeof (suspwrmsg), &resp, ep);
344 if (resp != NULL) {
345 free_result(resp);
346 }
347 return (result);
348 }
349
350 /*
351 * Parse the multi-node list file
352 *
353 * Return Values: Zero - Success
354 * Non Zero - Failure
355 *
356 * File content: The content of the nodelist file should consist of
357 * triplets of nodeid, nodename and private interconnect
358 * address seperated by one or more white space.
359 * e.g.
360 * 1 node_a 192.168.111.3
361 * 2 node_b 192.168.111.5
362 *
363 * Any missing fields will result in an error.
364 */
365 int
meta_read_nodelist(int * nodecnt,mndiskset_membershiplist_t ** nl,md_error_t * ep)366 meta_read_nodelist(
367 int *nodecnt,
368 mndiskset_membershiplist_t **nl,
369 md_error_t *ep
370 )
371 {
372 FILE *fp = NULL;
373 char line[MAX_LINE_SIZE];
374 char *buf;
375 uint_t i;
376 int sz;
377 mndiskset_membershiplist_t **tailp = nl;
378
379 /* open file */
380 if ((fp = fopen(META_MNSET_NODELIST, "r")) == NULL) {
381 mndiskset_membershiplist_t *nlp;
382 struct hostent *hp;
383 int err = 0;
384
385 /* return this node with id of 1 */
386 nlp = *tailp = Zalloc(sizeof (*nlp));
387 tailp = &nlp->next;
388
389 *nodecnt = 1;
390 nlp->msl_node_id = 1;
391 buf = mynode();
392 sz = min(strlen(buf), sizeof (nlp->msl_node_name) - 1);
393 (void) strncpy(nlp->msl_node_name, buf, sz);
394 nlp->msl_node_name[sz] = '\0';
395
396 /* retrieve info about our host */
397 if ((hp = gethostbyname(buf)) == NULL) {
398 err = EADDRNOTAVAIL;
399 } else if (hp->h_addrtype != AF_INET) {
400 /* We only do IPv4 addresses, for now */
401 err = EPFNOSUPPORT;
402 } else if (*hp->h_addr_list == NULL) {
403 /* No addresses in the list */
404 err = EADDRNOTAVAIL;
405 } else {
406 /* We take the first address only */
407 struct in_addr in;
408
409 (void) memcpy(&in.s_addr, *hp->h_addr_list,
410 sizeof (struct in_addr));
411 (void) strncpy(nlp->msl_node_addr,
412 inet_ntoa(in), MD_MAX_NODENAME);
413 }
414
415 if (err) {
416 meta_free_nodelist(*nl);
417 return (mdsyserror(ep, err, buf));
418 }
419 return (0);
420 }
421
422 *nl = NULL;
423 *nodecnt = 0;
424
425 while ((fp != NULL) && ((buf = fgets(line, sizeof (line) - 1, fp)) !=
426 NULL)) {
427 mndiskset_membershiplist_t *nlp;
428
429 /* skip leading spaces */
430 while ((*buf != '\0') && (i = strcspn(buf, " \t\n")) == 0)
431 buf++;
432
433 /* skip comments and blank lines */
434 if (*buf == '\0' || *buf == '#')
435 continue;
436
437 /* allocate memory and set tail pointer */
438 nlp = *tailp = Zalloc(sizeof (*nlp));
439 tailp = &nlp->next;
440
441 /* parse node id */
442 nlp->msl_node_id = strtoul(buf, NULL, 0);
443 buf += i;
444
445 /* skip leading spaces */
446 while ((*buf != '\0') && (i = strcspn(buf, " \t\n")) == 0)
447 buf++;
448
449 /* fields missing, return error */
450 if (*buf == '\0' || *buf == '#') {
451 meta_free_nodelist(*nl);
452 *nl = NULL;
453 *nodecnt = 0;
454
455 /* close file and return */
456 if ((fp) && (fclose(fp) != 0))
457 return (mdsyserror(ep, errno,
458 META_MNSET_NODELIST));
459
460 return (mdsyserror(ep, EINVAL, META_MNSET_NODELIST));
461 }
462
463 /* parse node name */
464 sz = min(i, sizeof (nlp->msl_node_name) - 1);
465 (void) strncpy(nlp->msl_node_name, buf, sz);
466 nlp->msl_node_name[sz] = '\0';
467 buf += i;
468
469 /* skip leading spaces */
470 while ((*buf != '\0') && (i = strcspn(buf, " \t\n")) == 0)
471 buf++;
472
473 /* fields missing, return error */
474 if (*buf == '\0' || *buf == '#') {
475 meta_free_nodelist(*nl);
476 *nl = NULL;
477 *nodecnt = 0;
478
479 /* close file and return */
480 if ((fp) && (fclose(fp) != 0))
481 return (mdsyserror(ep, errno,
482 META_MNSET_NODELIST));
483
484 return (mdsyserror(ep, EINVAL, META_MNSET_NODELIST));
485 }
486
487 /* parse node address */
488 sz = min(i, sizeof (nlp->msl_node_addr) - 1);
489 (void) strncpy(nlp->msl_node_addr, buf, sz);
490 nlp->msl_node_addr[sz] = '\0';
491
492 ++*nodecnt;
493 }
494
495 /* close file */
496 if ((fp) && (fclose(fp) != 0)) {
497 meta_free_nodelist(*nl);
498 return (mdsyserror(ep, errno, META_MNSET_NODELIST));
499 }
500 return (0);
501 }
502
503 /*
504 * Populate the multi-node list file from a given list of node id's
505 * The nids must have only one node id in each cell. Range of node
506 * id's in the form 1-n are not allowed.
507 *
508 * Return Values: Zero - Success
509 * Non Zero - Failure
510 */
511 int
meta_write_nodelist(int nodecnt,char ** nids,md_error_t * ep)512 meta_write_nodelist(
513 int nodecnt,
514 char **nids,
515 md_error_t *ep
516 )
517 {
518 FILE *fp = NULL;
519 char name[MAX_LINE_SIZE], addr[MAX_LINE_SIZE];
520 uint_t i, nid;
521 struct in_addr ipaddr;
522 int err = 0;
523
524 /* check if we are running on clustering */
525 if ((err = sdssc_bind_library()) != SDSSC_OKAY) {
526 return (mdsyserror(ep, err, META_MNSET_NODELIST));
527 }
528
529 /* open file for writing */
530 if ((fp = fopen(META_MNSET_NODELIST, "w")) == NULL) {
531 return (mdsyserror(ep, errno, META_MNSET_NODELIST));
532 }
533
534 for (i = 0; i < nodecnt; i++) {
535 /* extract the node id */
536 errno = 0;
537 nid = strtoul(nids[i], NULL, 0);
538 if (errno != 0) {
539 if ((fp) && (fclose(fp) != 0))
540 return (mdsyserror(ep, errno,
541 META_MNSET_NODELIST));
542
543 return (mdsyserror(ep, EINVAL, META_MNSET_NODELIST));
544 }
545
546 /* get node name */
547 (void) snprintf(name, sizeof (name), "%d", nid);
548 sdssc_cm_nid2nm(name);
549
550 /* finally get the private ip address */
551 (void) snprintf(addr, sizeof (addr), "%s", name);
552 if (sdssc_get_priv_ipaddr(addr, &ipaddr) != SDSSC_OKAY) {
553 if ((fp) && (fclose(fp) != 0))
554 return (mdsyserror(ep, errno,
555 META_MNSET_NODELIST));
556
557 return (mdsyserror(ep, EINVAL, META_MNSET_NODELIST));
558 }
559
560 (void) fprintf(fp, "%d\t%s\t%s\n", nid, name,
561 inet_ntoa(ipaddr));
562 }
563
564 /* close file */
565 if ((fp) && (fclose(fp) != 0))
566 return (mdsyserror(ep, errno, META_MNSET_NODELIST));
567
568 return (0);
569 }
570
571 /*
572 * Free node list
573 */
574 void
meta_free_nodelist(mndiskset_membershiplist_t * nl)575 meta_free_nodelist(
576 mndiskset_membershiplist_t *nl
577 )
578 {
579 mndiskset_membershiplist_t *next = NULL;
580
581 for (/* void */; (nl != NULL); nl = next) {
582 next = nl->next;
583 Free(nl);
584 }
585 }
586
587 /*
588 * FUNCTION: meta_mn_send_setsync()
589 * INPUT: sp - setname
590 * mirnp - mirror name
591 * size - buffer size, 0 if none
592 * OUTPUT: ep - return error pointer
593 * RETURNS: return value from meta_mn_send_command()
594 * PURPOSE: Send a setsync command to all nodes to set resync status
595 */
596
597 int
meta_mn_send_setsync(mdsetname_t * sp,mdname_t * mirnp,daddr_t size,md_error_t * ep)598 meta_mn_send_setsync(
599 mdsetname_t *sp,
600 mdname_t *mirnp,
601 daddr_t size,
602 md_error_t *ep
603 )
604 {
605 md_mn_msg_setsync_t setsyncmsg;
606 int ret;
607 md_mn_result_t *resp = NULL;
608
609 setsyncmsg.setsync_mnum = meta_getminor(mirnp->dev);
610 setsyncmsg.setsync_copysize = size;
611 setsyncmsg.setsync_flags = 0;
612
613 /*
614 * We do not log the metasync command as it will have no effect on the
615 * underlying metadb state. If we have a master change the
616 * reconfiguration process will issue a new 'metasync' to all affected
617 * mirrors, so we would actually end up sending the message twice.
618 * Removing the logging of the message helps reduce the processing
619 * time required.
620 */
621 ret = mdmn_send_message(sp->setno, MD_MN_MSG_SETSYNC,
622 MD_MSGF_NO_LOG | MD_MSGF_OVERRIDE_SUSPEND, 0,
623 (char *)&setsyncmsg, sizeof (setsyncmsg), &resp, ep);
624 if (resp != NULL) {
625 free_result(resp);
626 }
627
628 /*
629 * Unlike non-MN sets, the metasync command does not actually
630 * start a resync, it simply updates the state on all of the
631 * nodes. Therefore, to start a resync we send a resync starting
632 * message for the metadevice
633 */
634 if (ret == 0)
635 ret = meta_mn_send_resync_starting(mirnp, ep);
636 return (ret);
637 }
638
639 /*
640 * FUNCTION: meta_mn_send_metaclear_command()
641 * INPUT: sp - setname
642 * name - metadevice name
643 * options - command options
644 * pflag - clear all soft partitions for a given device
645 * OUTPUT: ep - return error pointer
646 * RETURNS: return value from meta_mn_send_command()
647 * PURPOSE: Send a metaclear command to all nodes with force(-f) and
648 * recurse(-r) options set if required. For hotspare pool and
649 * metadevices, the metadevice name is of the form setname/dxx or
650 * setname/hspxxx so a '-s' argument isn't required. If pflag is set
651 * the name refers to a metadevice or component and in the is case
652 * a '-s' argument is required to define the set.
653 */
654
655 int
meta_mn_send_metaclear_command(mdsetname_t * sp,char * name,mdcmdopts_t options,int pflag,md_error_t * ep)656 meta_mn_send_metaclear_command(
657 mdsetname_t *sp,
658 char *name,
659 mdcmdopts_t options,
660 int pflag,
661 md_error_t *ep
662 )
663 {
664 int newargc;
665 char **newargv;
666 int ret;
667
668 /*
669 * Allocate an array large enough to hold all of the possible
670 * metaclear arguments
671 */
672 newargv = Calloc(7, sizeof (char *));
673 newargv[0] = "metaclear";
674 newargc = 1;
675 if (pflag) {
676 newargv[newargc] = "-s";
677 newargc++;
678 newargv[newargc] = sp->setname;
679 newargc++;
680 }
681 if (options & MDCMD_FORCE) {
682 newargv[newargc] = "-f";
683 newargc++;
684 }
685 if (options & MDCMD_RECURSE) {
686 newargv[newargc] = "-r";
687 newargc++;
688 }
689 if (pflag) {
690 newargv[newargc] = "-p";
691 newargc++;
692 }
693 newargv[newargc] = name;
694 newargc++;
695
696 ret = meta_mn_send_command(sp, newargc, newargv,
697 MD_DISP_STDERR, NO_CONTEXT_STRING, ep);
698
699 free(newargv);
700 return (ret);
701 }
702
703 /*
704 * FUNCTION: meta_mn_send_resync_starting()
705 * INPUT: sp - setname
706 * mirnp - mirror name
707 * OUTPUT: ep - return error pointer
708 * RETURNS: return value from mdmn_send_message()
709 * PURPOSE: Send a resync starting message to all nodes.
710 */
711
712 int
meta_mn_send_resync_starting(mdname_t * mirnp,md_error_t * ep)713 meta_mn_send_resync_starting(
714 mdname_t *mirnp,
715 md_error_t *ep
716 )
717 {
718 int result;
719 md_mn_msg_resync_t resyncmsg;
720 md_mn_result_t *resp = NULL;
721 minor_t mnum = meta_getminor(mirnp->dev);
722
723 /*
724 * This message is never directly issued.
725 * So we launch it with a suspend override flag.
726 * If the commd is suspended, and this message comes
727 * along it must be sent due to replaying a command or similar.
728 * In that case we don't want this message to be blocked.
729 * If the commd is not suspended, the flag does no harm.
730 */
731 resyncmsg.msg_resync_mnum = mnum;
732 result = mdmn_send_message(MD_MIN2SET(mnum),
733 MD_MN_MSG_RESYNC_STARTING,
734 MD_MSGF_NO_LOG | MD_MSGF_OVERRIDE_SUSPEND, 0,
735 (char *)&resyncmsg, sizeof (resyncmsg), &resp, ep);
736
737 if (resp != NULL) {
738 free_result(resp);
739 }
740 return (result);
741 }
742
/*
 * FUNCTION:	meta_mn_change_owner()
 * INPUT:	opp	- pointer to parameter block; if *opp is NULL a new
 *			  block is allocated, otherwise the supplied block is
 *			  cleared and reused
 *		setno	- set number of mirror metadevice
 *		mnum	- minor number of mirror metadevice
 *		owner	- node ID of mirror owner
 *		flags	- flag field for ioctl
 * OUTPUT:	opp	- parameter block used to send ioctl; *opp is always
 *			  set on return, including on failure
 * RETURNS:	int	- 0 success, -1 error
 * PURPOSE:	issue an ioctl to change the ownership of the specified mirror
 *		to our node ID. We need to be the owner before any watermarks
 *		are committed to the device otherwise we'll enter a deadly
 *		embrace when attempting to write the watermark.
 *		This function can also be used so set the owner on a node to
 *		NULL. In this case the change is only made on the local node.
 *		In addition by setting the MD_MN_MM_CHOOSE_OWNER flag, the
 *		function can also be used to choose a mirror resync owner. This
 *		function should only be called on the master and it will
 *		select the owner and request it to become the owner.
 */
int
meta_mn_change_owner(
	md_set_mmown_params_t 	**opp,	/* Returned parameter block */
	set_t			setno,	/* Mirror set number */
	uint_t 			mnum,	/* Minor number */
	uint_t			owner,	/* Node ID of mirror owner */
	uint_t			flags	/* Flags */
)
{
	md_set_mmown_params_t	*ownpar = *opp;
	md_mn_own_status_t	*ownstat = NULL;
	struct timeval		tvs, tve;
	int			n = 0;
	int			rval;

	/* Reuse the caller's parameter block if given, else allocate one */
	if (ownpar != NULL) {
		(void) memset(ownpar, 0, sizeof (*ownpar));
	} else {
		ownpar = Zalloc(sizeof (*ownpar));
	}
	/* Status block is private to this call and freed before returning */
	ownstat = Zalloc(sizeof (*ownstat));

	ownpar->d.mnum = mnum;
	ownpar->d.owner = owner;
	ownpar->d.flags = flags;
	MD_SETDRIVERNAME(ownpar, MD_MIRROR, setno);
	MD_SETDRIVERNAME(ownstat, MD_MIRROR, setno);

	/*
	 * Attempt to change the ownership to the specified node. We retry this
	 * up to 10 times if we receive EAGAIN from the metadevice. This only
	 * happens if the underlying metadevice is busy with outstanding i/o
	 * that requires ownership change.
	 * Any other error, or running out of retries, leaves the loop with
	 * rval non-zero.
	 */
	while ((rval = metaioctl(MD_MN_SET_MM_OWNER, ownpar, &ownpar->mde,
	    NULL)) != 0) {
		md_sys_error_t	*ip =
		    &ownpar->mde.info.md_error_info_t_u.sys_error;
		if (ip->errnum != EAGAIN)
			break;
		if (n++ >= 10)
			break;
		(void) sleep(1);
	}

	/*
	 * There is no need to wait for the ioctl completion if we are setting
	 * the owner to NULL or requesting the master to choose the owner
	 *
	 * NOTE(review): this path returns 0 even when the ioctl above
	 * failed (rval != 0) -- confirm that callers rely on this.
	 */
	if ((owner == 0) || (flags & MD_MN_MM_CHOOSE_OWNER)) {
		Free(ownstat);
		*opp = ownpar;
		return (0);
	}

	/*
	 * Wait for ioctl completion or a timeout to occur. If we
	 * timeout we fail the i/o request.
	 *
	 * NOTE(review): the elapsed time is only checked when the status
	 * ioctl itself fails.  When it succeeds without MD_MN_MM_RESULT
	 * set, the outer loop polls again immediately with no sleep and no
	 * timeout check -- confirm that this is intended.
	 */
	ownstat->mnum = ownpar->d.mnum;
	(void) gettimeofday(&tvs, NULL);

	while ((rval == 0) && !(ownstat->flags & MD_MN_MM_RESULT)) {
		while ((rval = metaioctl(MD_MN_MM_OWNER_STATUS, ownstat,
		    &ownstat->mde, NULL)) != 0) {
			(void) gettimeofday(&tve, NULL);
			if ((tve.tv_sec - tvs.tv_sec) > OWNER_TIMEOUT) {
				rval = -1;
				break;
			}
			(void) sleep(1);
		}
	}

	/*
	 * No timeout and no ioctl error occurred: derive the return value
	 * from the result flags reported in the status block.
	 */

	if (rval == 0) {
		rval = (ownstat->flags & MD_MN_MM_RES_FAIL) ? -1 : 0;
	}

	Free(ownstat);
	*opp = ownpar;
	return (rval);
}
847 /*
848 * special handling is required when running on a single node
849 * non-SC3.x environment. This function determines tests
850 * for that case.
851 *
852 * Return values:
853 * 0 - no nodes or joined or in a SC3.x env
854 * 1 - 1 node and not in SC3.x env
855 */
856
857 int
meta_mn_singlenode()858 meta_mn_singlenode()
859 {
860 md_error_t xep = mdnullerror;
861 int nodecnt;
862 int mnset_single_node = 0;
863 mndiskset_membershiplist_t *nl;
864
865 /*
866 * If running on SunCluster, then don't validate MN sets,
867 * this is done during a reconfig cycle since all nodes must
868 * take the same action.
869 *
870 * Only cleanup in case of a single node situation
871 * when not running on SunCluster. This single node
872 * situation occurs when the nodelist only contains
873 * this node and the MN setrecords only contain this
874 * node.
875 */
876 if (meta_read_nodelist(&nodecnt, &nl, &xep) == -1) {
877 nodecnt = 0; /* no nodes are alive */
878 nl = NULL;
879 mdclrerror(&xep);
880 } else {
881 /*
882 * If only 1 node in nodelist and not running
883 * on SunCluster, set single_node flag.
884 */
885 if ((nodecnt == 1) &&
886 (strcmp(nl->msl_node_name, mynode()) == 0) &&
887 ((sdssc_bind_library()) != SDSSC_OKAY)) {
888 mnset_single_node = 1;
889 }
890 meta_free_nodelist(nl);
891 }
892 return (mnset_single_node);
893 }
894
895 /*
896 * FUNCTION: meta_mn_send_get_tstate()
897 * INPUT: dev - dev_t of device
898 * OUTPUT: tstatep - tstate value
899 * ep - return error pointer
900 * RETURNS: return value from mdmn_send_message()
901 * PURPOSE: Send a message to the master to get ui_tstate for a given device.
902 */
903
904 int
meta_mn_send_get_tstate(md_dev64_t dev,uint_t * tstatep,md_error_t * ep)905 meta_mn_send_get_tstate(
906 md_dev64_t dev,
907 uint_t *tstatep,
908 md_error_t *ep
909 )
910 {
911 int result;
912 md_mn_msg_gettstate_t tstatemsg;
913 md_mn_result_t *resp = NULL;
914 minor_t mnum = meta_getminor(dev);
915
916 tstatemsg.gettstate_dev = dev;
917 result = mdmn_send_message(MD_MIN2SET(mnum),
918 MD_MN_MSG_GET_TSTATE,
919 MD_MSGF_NO_LOG | MD_MSGF_NO_BCAST, 0,
920 (char *)&tstatemsg, sizeof (tstatemsg), &resp, ep);
921
922 if (result == 0)
923 *tstatep = resp->mmr_exitval;
924 else
925 /* If some error occurred set tstate to 0 */
926 *tstatep = 0;
927
928 if (resp != NULL) {
929 free_result(resp);
930 }
931 return (result);
932 }
933