xref: /titanic_41/usr/src/lib/lvm/libmeta/common/meta_mn_subr.c (revision 6a634c9dca3093f3922e4b7ab826d7bdf17bf78e)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
23  */
24 
25 
26 /*
27  * Just in case we're not in a build environment, make sure that
28  * TEXT_DOMAIN gets set to something.
29  */
30 #if !defined(TEXT_DOMAIN)
31 #define	TEXT_DOMAIN "SYS_TEST"
32 #endif
33 
34 #include <meta.h>
35 #include <sdssc.h>
36 #include <arpa/inet.h>
37 #include <sys/lvm/md_mddb.h>
38 
/*
 * Size of the fixed buffers used for nodelist handling in this file: the
 * line buffer in meta_read_nodelist() and the node name/address buffers in
 * meta_write_nodelist().
 */
#define	MAX_LINE_SIZE 1024

/*
 * Maximum amount of time, in seconds, to spend waiting for an ownership
 * change to complete. It is compared against a difference of tv_sec values
 * in meta_mn_change_owner().
 */
static const int OWNER_TIMEOUT = 3;
45 
46 /*
47  * FUNCTION:	meta_is_mn_set()
48  * INPUT:       sp      - the set name
49  * OUTPUT:	ep	- return error pointer
50  * RETURNS:	int	- 1 if MultiNode set else 0
51  * PURPOSE:	checks if the set is a MultiNode set
52  */
53 int
meta_is_mn_set(mdsetname_t * sp,md_error_t * ep)54 meta_is_mn_set(
55 	mdsetname_t	*sp,
56 	md_error_t	*ep
57 )
58 {
59 	md_set_desc	*sd;
60 
61 	/* Local set cannot be MultiNode */
62 	if ((sp == NULL) || (sp->setname == NULL) ||
63 	    (strcmp(sp->setname, MD_LOCAL_NAME) == 0))
64 		return (0);
65 	sd = metaget_setdesc(sp, ep);
66 
67 	/*
68 	 * sd can be NULL if there is a difference between
69 	 * the setrecords and the setlistp caches. This can happen
70 	 * if this function is called while a set is being
71 	 * removed during a cluster reconfiguration.
72 	 */
73 	if (sd == NULL)
74 		return (0);
75 	if (sd->sd_flags & MD_SR_MN)
76 		return (1);
77 	return (0);
78 }
79 
80 /*
81  * FUNCTION:	meta_is_mn_name()
82  * INPUT:       spp     - ptr to the set name, if NULL the setname is derived
83  *			  from the metadevice name (eg set/d10 )
84  *		name	- the metadevice/hsp name
85  * OUTPUT:	ep	- return error pointer
86  * RETURNS:	int	- 1 if MultiNode set else 0
87  * PURPOSE:	checks if the metadevice is in a MultiNode set
88  */
89 int
meta_is_mn_name(mdsetname_t ** spp,char * name,md_error_t * ep)90 meta_is_mn_name(
91 	mdsetname_t	**spp,
92 	char		*name,
93 	md_error_t	*ep
94 )
95 {
96 	if (*spp == NULL) {
97 		char		*cname;
98 
99 		/*
100 		 * if the setname is specified in uname and *spp is
101 		 * not set, then it is setup using that set name value.
102 		 * If *spp is set and a setname specified in uname and
103 		 * the set names don't agree then cname will be
104 		 * returned as NULL
105 		 */
106 		cname = meta_canonicalize_check_set(spp, name, ep);
107 		if (cname == NULL) {
108 			mdclrerror(ep);
109 			return (0);
110 		}
111 
112 		Free(cname);
113 	}
114 
115 	if ((strcmp((*spp)->setname, MD_LOCAL_NAME) != 0) &&
116 	    (metaget_setdesc(*spp, ep) != NULL) &&
117 	    ((*spp)->setdesc->sd_flags & MD_SR_MN)) {
118 		return (1);
119 	}
120 	return (0);
121 }
122 
123 /*
124  * meta_ping_mnset(set_t setno)
125  * Send a test message for this set in order to make commd do some init stuff
126  * Don't bother changelog.
127  * If set is suspended, fail immediately.
128  */
129 void
meta_ping_mnset(set_t setno)130 meta_ping_mnset(set_t setno)
131 {
132 	char		*data = "test";
133 	md_error_t	mde = mdnullerror;
134 	md_mn_result_t	*resp = NULL;
135 
136 	(void) mdmn_send_message(setno, MD_MN_MSG_TEST2,
137 	    MD_MSGF_NO_LOG | MD_MSGF_FAIL_ON_SUSPEND, 0, data,
138 	    sizeof (data), &resp, &mde);
139 
140 	if (resp != (md_mn_result_t *)NULL) {
141 		free_result(resp);
142 	}
143 }
144 
145 /*
146  *
147  * FUNCTION:	print_stderr
148  * INPUT:	errstr	- the error message returned by the command
149  *		context	- the context string from metainit -a
150  * PURPOSE:	called from meta_mn_send_command to print the error message
151  *		to stderr. When context is NO_CONTEXT_STRING, the errstr string
152  *		is output unchanged. When context is a string, it is the context
153  *		string for the metainit -a command and in this case the errstr
154  *		string has to be parsed to extract the command and node name
155  *		and to send a message to stderr in the format
156  *		command: node: context: error message
157  */
158 static void
print_stderr(char * errstr,char * context)159 print_stderr(
160 	char	*errstr,
161 	char	*context
162 )
163 {
164 	char	*command;
165 	char	*node;
166 	char	*message;
167 	int	length = strlen(errstr + 1);
168 
169 	if (context == NO_CONTEXT_STRING) {
170 		(void) fprintf(stderr, "%s", errstr);
171 	} else {
172 		command = Malloc(length);
173 		node = Malloc(length);
174 		message = Malloc(length);
175 		if (sscanf(errstr, "%[^:]: %[^:]: %[^\n]", command, node,
176 		    message) == 3) {
177 			(void) fprintf(stderr, "%s: %s: %s: %s\n", command,
178 			    node, context, message);
179 		} else {
180 			(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
181 			    "%s: Invalid format error message"), errstr);
182 		}
183 		Free(command);
184 		Free(node);
185 		Free(message);
186 	}
187 }
188 
189 /*
190  * FUNCTION:	meta_mn_send_command()
191  * INPUT:	sp	- the set name
192  *		argc	- number of arguments
193  *		argv	- arg list
194  *		flags	- some controlling flags
195  *		initall_context	- context string for metainit -a
196  * OUTPUT:	ep	- return error pointer
197  * RETURNS:	return exitval from mdmn_send_message
198  * PURPOSE:	sends the command to the master node for execution
199  */
200 int
meta_mn_send_command(mdsetname_t * sp,int argc,char * argv[],int flags,char * initall_context,md_error_t * ep)201 meta_mn_send_command(
202 	mdsetname_t	*sp,
203 	int		argc,
204 	char		*argv[],
205 	int		flags,
206 	char		*initall_context,
207 	md_error_t	*ep
208 )
209 {
210 	int		a;
211 	int		err;
212 	int		retval;
213 	int		send_message_flags = MD_MSGF_DEFAULT_FLAGS;
214 	int		send_message_type;
215 	char		*cmd;
216 	md_mn_result_t	*resp = NULL;
217 
218 	cmd = Malloc(1024);
219 	(void) strlcpy(cmd, argv[0], 1024);
220 	for (a = 1; a < argc; a++) {
221 		/* don't copy empty arguments */
222 		if (*argv[a] == '\0') {
223 			continue;
224 		}
225 		(void) strcat(cmd, " ");
226 		(void) strcat(cmd, argv[a]);
227 	}
228 	/*
229 	 * in dryrun mode stop on the first error
230 	 * use the CMD_RETRY message type if RETRY_BUSY flag set
231 	 */
232 	if (flags & MD_DRYRUN)
233 		send_message_flags |= MD_MSGF_STOP_ON_ERROR;
234 	if (flags & MD_NOLOG)
235 		send_message_flags |= MD_MSGF_NO_LOG;
236 	if (flags & MD_PANIC_WHEN_INCONSISTENT)
237 		send_message_flags |= MD_MSGF_PANIC_WHEN_INCONSISTENT;
238 	if (flags & MD_RETRY_BUSY)  {
239 		send_message_type = MD_MN_MSG_BC_CMD_RETRY;
240 	} else {
241 		send_message_type = MD_MN_MSG_BC_CMD;
242 	}
243 	err = mdmn_send_message(
244 	    sp->setno, send_message_type, send_message_flags, 0,
245 	    cmd, 1024, &resp, ep);
246 
247 	free(cmd);
248 
249 	if (err == 0) {
250 		/*
251 		 * stderr may be turned off by IGNORE_STDERR
252 		 * In dryrun we only print stderr if the exit_val is non-zero
253 		 */
254 		if ((resp->mmr_err_size != 0) &&
255 		    ((flags & MD_IGNORE_STDERR) == 0)) {
256 			if (((flags & MD_DRYRUN) == 0) ||
257 			    (resp->mmr_exitval != 0)) {
258 				print_stderr(resp->mmr_err, initall_context);
259 			}
260 		}
261 
262 		/*
263 		 * If dryrun is set, we don't display stdout,
264 		 * because the real run has yet to follow.
265 		 */
266 		if (((flags & MD_DRYRUN) == 0) && (resp->mmr_out_size != 0)) {
267 			(void) printf("%s", resp->mmr_out);
268 		}
269 		retval = resp->mmr_exitval;
270 		free_result(resp);
271 		return (retval);
272 	}
273 	if (resp != NULL) {
274 		if (resp->mmr_comm_state == MDMNE_CLASS_BUSY) {
275 			(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
276 			    "rpc.mdcommd currently busy. "
277 			    "Retry operation later.\n"));
278 		} else if (resp->mmr_comm_state == MDMNE_NOT_JOINED) {
279 			(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
280 			    "Node %s must join the %s multi-owner diskset to "
281 			    "issue commands.\n"
282 			    "To join, use: metaset -s %s -j\n"),
283 			    mynode(), sp->setname, sp->setname);
284 		} else if (resp->mmr_comm_state == MDMNE_LOG_FAIL) {
285 			mddb_config_t	c;
286 
287 			(void) memset(&c, 0, sizeof (c));
288 			c.c_setno = sp->setno;
289 			(void) metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL);
290 			(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
291 			    "Command not attempted: Unable to log message "
292 			    "in set %s\n"), sp->setname);
293 			if (c.c_flags & MDDB_C_STALE) {
294 				(void) mdmddberror(ep, MDE_DB_STALE,
295 				    (minor_t)NODEV64, sp->setno, 0, NULL);
296 				mde_perror(ep, "");
297 			}
298 		} else {
299 			(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
300 			    "Command failed: Commd State %d "
301 			    "encountered.\n"), resp->mmr_comm_state);
302 		}
303 		free_result(resp);
304 	} else {
305 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
306 		    "Command failed: mdmn_send_message returned %d.\n"),
307 		    err);
308 	}
309 
310 
311 	return (1);
312 }
313 
314 /*
315  * FUNCTION:	meta_mn_send_suspend_writes()
316  * INPUT:	mnum	- minor num of mirror
317  * OUTPUT:	ep	- return error pointer
318  * RETURNS:	return value from mdmn_send_message()
319  * PURPOSE:	sends message to all nodes to suspend writes to the mirror.
320  */
321 int
meta_mn_send_suspend_writes(minor_t mnum,md_error_t * ep)322 meta_mn_send_suspend_writes(
323 	minor_t		mnum,
324 	md_error_t	*ep
325 )
326 {
327 	int			result;
328 	md_mn_msg_suspwr_t	suspwrmsg;
329 	md_mn_result_t		*resp = NULL;
330 
331 	suspwrmsg.msg_suspwr_mnum =  mnum;
332 	/*
333 	 * This message is never directly issued.
334 	 * So we launch it with a suspend override flag.
335 	 * If the commd is suspended, and this message comes
336 	 * along it must be sent due to replaying a command or similar.
337 	 * In that case we don't want this message to be blocked.
338 	 * If the commd is not suspended, the flag does no harm.
339 	 */
340 	result = mdmn_send_message(MD_MIN2SET(mnum),
341 	    MD_MN_MSG_SUSPEND_WRITES,
342 	    MD_MSGF_NO_LOG | MD_MSGF_OVERRIDE_SUSPEND, 0,
343 	    (char *)&suspwrmsg, sizeof (suspwrmsg), &resp, ep);
344 	if (resp != NULL) {
345 		free_result(resp);
346 	}
347 	return (result);
348 }
349 
350 /*
351  * Parse the multi-node list file
352  *
353  * Return Values:	Zero	 - Success
354  *			Non Zero - Failure
355  *
356  * File content:	The content of the nodelist file should consist of
357  *			triplets of nodeid, nodename and private interconnect
358  *			address seperated by one or more white space.
359  * e.g.
360  *			1 node_a 192.168.111.3
361  *			2 node_b 192.168.111.5
362  *
363  *			Any missing fields will result in an error.
364  */
365 int
meta_read_nodelist(int * nodecnt,mndiskset_membershiplist_t ** nl,md_error_t * ep)366 meta_read_nodelist(
367 	int				*nodecnt,
368 	mndiskset_membershiplist_t	**nl,
369 	md_error_t			*ep
370 )
371 {
372 	FILE				*fp = NULL;
373 	char				line[MAX_LINE_SIZE];
374 	char				*buf;
375 	uint_t				i;
376 	int				sz;
377 	mndiskset_membershiplist_t	**tailp = nl;
378 
379 	/* open file */
380 	if ((fp = fopen(META_MNSET_NODELIST, "r")) == NULL) {
381 		mndiskset_membershiplist_t	*nlp;
382 		struct hostent *hp;
383 		int err = 0;
384 
385 		/* return this node with id of 1 */
386 		nlp = *tailp = Zalloc(sizeof (*nlp));
387 		tailp = &nlp->next;
388 
389 		*nodecnt = 1;
390 		nlp->msl_node_id = 1;
391 		buf = mynode();
392 		sz = min(strlen(buf), sizeof (nlp->msl_node_name) - 1);
393 		(void) strncpy(nlp->msl_node_name, buf, sz);
394 		nlp->msl_node_name[sz] = '\0';
395 
396 		/* retrieve info about our host */
397 		if ((hp = gethostbyname(buf)) == NULL) {
398 			err = EADDRNOTAVAIL;
399 		} else if (hp->h_addrtype != AF_INET) {
400 			/* We only do IPv4 addresses, for now */
401 			err = EPFNOSUPPORT;
402 		} else if (*hp->h_addr_list == NULL) {
403 			/* No addresses in the list */
404 			err = EADDRNOTAVAIL;
405 		} else {
406 			/* We take the first address only */
407 			struct in_addr in;
408 
409 			(void) memcpy(&in.s_addr, *hp->h_addr_list,
410 			    sizeof (struct in_addr));
411 			(void) strncpy(nlp->msl_node_addr,
412 			    inet_ntoa(in), MD_MAX_NODENAME);
413 		}
414 
415 		if (err) {
416 			meta_free_nodelist(*nl);
417 			return (mdsyserror(ep, err, buf));
418 		}
419 		return (0);
420 	}
421 
422 	*nl = NULL;
423 	*nodecnt = 0;
424 
425 	while ((fp != NULL) && ((buf = fgets(line, sizeof (line) - 1, fp)) !=
426 	    NULL)) {
427 		mndiskset_membershiplist_t	*nlp;
428 
429 		/* skip leading spaces */
430 		while ((*buf != '\0') && (i = strcspn(buf, " \t\n")) == 0)
431 			buf++;
432 
433 		/* skip comments and blank lines */
434 		if (*buf == '\0' || *buf == '#')
435 			continue;
436 
437 		/* allocate memory and set tail pointer */
438 		nlp = *tailp = Zalloc(sizeof (*nlp));
439 		tailp = &nlp->next;
440 
441 		/* parse node id */
442 		nlp->msl_node_id = strtoul(buf, NULL, 0);
443 		buf += i;
444 
445 		/* skip leading spaces */
446 		while ((*buf != '\0') && (i = strcspn(buf, " \t\n")) == 0)
447 			buf++;
448 
449 		/* fields missing, return error */
450 		if (*buf == '\0' || *buf == '#') {
451 			meta_free_nodelist(*nl);
452 			*nl = NULL;
453 			*nodecnt = 0;
454 
455 			/* close file and return */
456 			if ((fp) && (fclose(fp) != 0))
457 				return (mdsyserror(ep, errno,
458 				    META_MNSET_NODELIST));
459 
460 			return (mdsyserror(ep, EINVAL, META_MNSET_NODELIST));
461 		}
462 
463 		/* parse node name */
464 		sz = min(i, sizeof (nlp->msl_node_name) - 1);
465 		(void) strncpy(nlp->msl_node_name, buf, sz);
466 		nlp->msl_node_name[sz] = '\0';
467 		buf += i;
468 
469 		/* skip leading spaces */
470 		while ((*buf != '\0') && (i = strcspn(buf, " \t\n")) == 0)
471 			buf++;
472 
473 		/* fields missing, return error */
474 		if (*buf == '\0' || *buf == '#') {
475 			meta_free_nodelist(*nl);
476 			*nl = NULL;
477 			*nodecnt = 0;
478 
479 			/* close file and return */
480 			if ((fp) && (fclose(fp) != 0))
481 				return (mdsyserror(ep, errno,
482 				    META_MNSET_NODELIST));
483 
484 			return (mdsyserror(ep, EINVAL, META_MNSET_NODELIST));
485 		}
486 
487 		/* parse node address */
488 		sz = min(i, sizeof (nlp->msl_node_addr) - 1);
489 		(void) strncpy(nlp->msl_node_addr, buf, sz);
490 		nlp->msl_node_addr[sz] = '\0';
491 
492 		++*nodecnt;
493 	}
494 
495 	/* close file */
496 	if ((fp) && (fclose(fp) != 0)) {
497 		meta_free_nodelist(*nl);
498 		return (mdsyserror(ep, errno, META_MNSET_NODELIST));
499 	}
500 	return (0);
501 }
502 
503 /*
504  * Populate the multi-node list file from a given list of node id's
505  * The nids must have only one node id in each cell. Range of node
506  * id's in the form 1-n are not allowed.
507  *
508  * Return Values:	Zero	 - Success
509  *			Non Zero - Failure
510  */
511 int
meta_write_nodelist(int nodecnt,char ** nids,md_error_t * ep)512 meta_write_nodelist(
513 	int		nodecnt,
514 	char		**nids,
515 	md_error_t	*ep
516 )
517 {
518 	FILE		*fp = NULL;
519 	char		name[MAX_LINE_SIZE], addr[MAX_LINE_SIZE];
520 	uint_t		i, nid;
521 	struct in_addr	ipaddr;
522 	int		err = 0;
523 
524 	/* check if we are running on clustering */
525 	if ((err = sdssc_bind_library()) != SDSSC_OKAY) {
526 		return (mdsyserror(ep, err, META_MNSET_NODELIST));
527 	}
528 
529 	/* open file for writing */
530 	if ((fp = fopen(META_MNSET_NODELIST, "w")) == NULL) {
531 		return (mdsyserror(ep, errno, META_MNSET_NODELIST));
532 	}
533 
534 	for (i = 0; i < nodecnt; i++) {
535 		/* extract the node id */
536 		errno = 0;
537 		nid = strtoul(nids[i], NULL, 0);
538 		if (errno != 0) {
539 			if ((fp) && (fclose(fp) != 0))
540 				return (mdsyserror(ep, errno,
541 				    META_MNSET_NODELIST));
542 
543 			return (mdsyserror(ep, EINVAL, META_MNSET_NODELIST));
544 		}
545 
546 		/* get node name */
547 		(void) snprintf(name, sizeof (name), "%d", nid);
548 		sdssc_cm_nid2nm(name);
549 
550 		/* finally get the private ip address */
551 		(void) snprintf(addr, sizeof (addr), "%s", name);
552 		if (sdssc_get_priv_ipaddr(addr, &ipaddr) != SDSSC_OKAY) {
553 			if ((fp) && (fclose(fp) != 0))
554 				return (mdsyserror(ep, errno,
555 				    META_MNSET_NODELIST));
556 
557 			return (mdsyserror(ep, EINVAL, META_MNSET_NODELIST));
558 		}
559 
560 		(void) fprintf(fp, "%d\t%s\t%s\n", nid, name,
561 		    inet_ntoa(ipaddr));
562 	}
563 
564 	/* close file */
565 	if ((fp) && (fclose(fp) != 0))
566 		return (mdsyserror(ep, errno, META_MNSET_NODELIST));
567 
568 	return (0);
569 }
570 
571 /*
572  * Free node list
573  */
574 void
meta_free_nodelist(mndiskset_membershiplist_t * nl)575 meta_free_nodelist(
576 	mndiskset_membershiplist_t	*nl
577 )
578 {
579 	mndiskset_membershiplist_t	*next = NULL;
580 
581 	for (/* void */; (nl != NULL); nl = next) {
582 		next = nl->next;
583 		Free(nl);
584 	}
585 }
586 
587 /*
588  * FUNCTION:	meta_mn_send_setsync()
589  * INPUT:	sp	- setname
590  *		mirnp	- mirror name
591  *		size	- buffer size, 0 if none
592  * OUTPUT:	ep	- return error pointer
593  * RETURNS:	return value from meta_mn_send_command()
594  * PURPOSE:  Send a setsync command to all nodes to set resync status
595  */
596 
597 int
meta_mn_send_setsync(mdsetname_t * sp,mdname_t * mirnp,daddr_t size,md_error_t * ep)598 meta_mn_send_setsync(
599 	mdsetname_t		*sp,
600 	mdname_t		*mirnp,
601 	daddr_t			size,
602 	md_error_t		*ep
603 )
604 {
605 	md_mn_msg_setsync_t	setsyncmsg;
606 	int			ret;
607 	md_mn_result_t		*resp = NULL;
608 
609 	setsyncmsg.setsync_mnum = meta_getminor(mirnp->dev);
610 	setsyncmsg.setsync_copysize = size;
611 	setsyncmsg.setsync_flags = 0;
612 
613 	/*
614 	 * We do not log the metasync command as it will have no effect on the
615 	 * underlying metadb state. If we have a master change the
616 	 * reconfiguration process will issue a new 'metasync' to all affected
617 	 * mirrors, so we would actually end up sending the message twice.
618 	 * Removing the logging of the message helps reduce the processing
619 	 * time required.
620 	 */
621 	ret = mdmn_send_message(sp->setno, MD_MN_MSG_SETSYNC,
622 	    MD_MSGF_NO_LOG | MD_MSGF_OVERRIDE_SUSPEND, 0,
623 	    (char *)&setsyncmsg, sizeof (setsyncmsg), &resp, ep);
624 	if (resp != NULL) {
625 		free_result(resp);
626 	}
627 
628 	/*
629 	 * Unlike non-MN sets, the metasync command does not actually
630 	 * start a resync, it simply updates the state on all of the
631 	 * nodes. Therefore, to start a resync we send a resync starting
632 	 * message for the metadevice
633 	 */
634 	if (ret == 0)
635 		ret = meta_mn_send_resync_starting(mirnp, ep);
636 	return (ret);
637 }
638 
639 /*
640  * FUNCTION:	meta_mn_send_metaclear_command()
641  * INPUT:	sp	- setname
642  *		name	- metadevice name
643  *		options - command options
644  *		pflag	- clear all soft partitions for a given device
645  * OUTPUT:	ep	- return error pointer
646  * RETURNS:	return value from meta_mn_send_command()
647  * PURPOSE:  Send a metaclear command to all nodes with force(-f) and
648  *	     recurse(-r) options set if required. For hotspare pool and
649  *	     metadevices, the metadevice name is of the form setname/dxx or
650  *	     setname/hspxxx so a '-s' argument isn't required. If pflag is set
651  *	     the name refers to a metadevice or component and in the is case
652  *	     a '-s' argument is required to define the set.
653  */
654 
655 int
meta_mn_send_metaclear_command(mdsetname_t * sp,char * name,mdcmdopts_t options,int pflag,md_error_t * ep)656 meta_mn_send_metaclear_command(
657 	mdsetname_t		*sp,
658 	char			*name,
659 	mdcmdopts_t		options,
660 	int			pflag,
661 	md_error_t		*ep
662 )
663 {
664 	int	newargc;
665 	char	**newargv;
666 	int	ret;
667 
668 	/*
669 	 * Allocate an array large enough to hold all of the possible
670 	 * metaclear arguments
671 	 */
672 	newargv = Calloc(7, sizeof (char *));
673 	newargv[0] = "metaclear";
674 	newargc = 1;
675 	if (pflag) {
676 		newargv[newargc] = "-s";
677 		newargc++;
678 		newargv[newargc] = sp->setname;
679 		newargc++;
680 	}
681 	if (options & MDCMD_FORCE) {
682 		newargv[newargc] = "-f";
683 		newargc++;
684 	}
685 	if (options & MDCMD_RECURSE) {
686 		newargv[newargc] = "-r";
687 		newargc++;
688 	}
689 	if (pflag) {
690 		newargv[newargc] = "-p";
691 		newargc++;
692 	}
693 	newargv[newargc] = name;
694 	newargc++;
695 
696 	ret = meta_mn_send_command(sp, newargc, newargv,
697 	    MD_DISP_STDERR, NO_CONTEXT_STRING, ep);
698 
699 	free(newargv);
700 	return (ret);
701 }
702 
703 /*
704  * FUNCTION:	meta_mn_send_resync_starting()
705  * INPUT:	sp	- setname
706  *		mirnp	- mirror name
707  * OUTPUT:	ep	- return error pointer
708  * RETURNS:	return value from mdmn_send_message()
709  * PURPOSE:  Send a resync starting message to all nodes.
710  */
711 
712 int
meta_mn_send_resync_starting(mdname_t * mirnp,md_error_t * ep)713 meta_mn_send_resync_starting(
714 	mdname_t		*mirnp,
715 	md_error_t		*ep
716 )
717 {
718 	int			result;
719 	md_mn_msg_resync_t	resyncmsg;
720 	md_mn_result_t		*resp = NULL;
721 	minor_t			mnum = meta_getminor(mirnp->dev);
722 
723 	/*
724 	 * This message is never directly issued.
725 	 * So we launch it with a suspend override flag.
726 	 * If the commd is suspended, and this message comes
727 	 * along it must be sent due to replaying a command or similar.
728 	 * In that case we don't want this message to be blocked.
729 	 * If the commd is not suspended, the flag does no harm.
730 	 */
731 	resyncmsg.msg_resync_mnum =  mnum;
732 	result = mdmn_send_message(MD_MIN2SET(mnum),
733 	    MD_MN_MSG_RESYNC_STARTING,
734 	    MD_MSGF_NO_LOG | MD_MSGF_OVERRIDE_SUSPEND, 0,
735 	    (char *)&resyncmsg, sizeof (resyncmsg), &resp, ep);
736 
737 	if (resp != NULL) {
738 		free_result(resp);
739 	}
740 	return (result);
741 }
742 
743 /*
744  * FUNCTION:	meta_mn_change_owner()
745  * INPUT:	opp	- pointer to parameter block
746  *		setno	- set number of mirror metadevice
747  *		mnum	- minor number of mirror metadevice
748  *		owner	- node ID of mirror owner
749  *		flags	- flag field for ioctl
750  * OUTPUT:	opp	- parameter block used to send ioctl
751  * RETURNS:	int	- 0 success, -1 error
752  * PURPOSE:	issue an ioctl to change the ownership of the specified mirror
753  *		to our node ID. We need to be the owner before any watermarks
754  *		are committed to the device otherwise we'll enter a deadly
755  *		embrace when attempting to write the watermark.
756  *		This function can also be used so set the owner on a node to
757  *		NULL. In this case the change is only made on the local node.
758  *		In addition by setting the MD_MN_MM_CHOOSE_OWNER flag, the
759  *		function can also be used to choose a mirror resync owner. This
760  *		function should only be called on the master and it will
761  *		select the owner and request it to become the owner.
762  */
763 int
meta_mn_change_owner(md_set_mmown_params_t ** opp,set_t setno,uint_t mnum,uint_t owner,uint_t flags)764 meta_mn_change_owner(
765 	md_set_mmown_params_t 	**opp,	/* Returned parameter block */
766 	set_t			setno,	/* Mirror set number */
767 	uint_t 			mnum,	/* Minor number */
768 	uint_t			owner,	/* Node ID of mirror owner */
769 	uint_t			flags	/* Flags */
770 )
771 {
772 	md_set_mmown_params_t	*ownpar = *opp;
773 	md_mn_own_status_t	*ownstat = NULL;
774 	struct timeval tvs, tve;
775 	int			n = 0;
776 	int			rval;
777 
778 	if (ownpar != NULL) {
779 		(void) memset(ownpar, 0, sizeof (*ownpar));
780 	} else {
781 		ownpar = Zalloc(sizeof (*ownpar));
782 	}
783 	ownstat = Zalloc(sizeof (*ownstat));
784 
785 	ownpar->d.mnum = mnum;
786 	ownpar->d.owner = owner;
787 	ownpar->d.flags = flags;
788 	MD_SETDRIVERNAME(ownpar, MD_MIRROR, setno);
789 	MD_SETDRIVERNAME(ownstat, MD_MIRROR, setno);
790 
791 	/*
792 	 * Attempt to change the ownership to the specified node. We retry this
793 	 * up to 10 times if we receive EAGAIN from the metadevice. This only
794 	 * happens if the underlying metadevice is busy with outstanding i/o
795 	 * that requires ownership change.
796 	 */
797 	while ((rval = metaioctl(MD_MN_SET_MM_OWNER, ownpar, &ownpar->mde,
798 	    NULL)) != 0) {
799 		md_sys_error_t	*ip =
800 		    &ownpar->mde.info.md_error_info_t_u.sys_error;
801 		if (ip->errnum != EAGAIN)
802 			break;
803 		if (n++ >= 10)
804 			break;
805 		(void) sleep(1);
806 	}
807 
808 	/*
809 	 * There is no need to wait for the ioctl completion if we are setting
810 	 * the owner to NULL or requesting the master to choose the owner
811 	 */
812 	if ((owner == 0) || (flags & MD_MN_MM_CHOOSE_OWNER)) {
813 		Free(ownstat);
814 		*opp = ownpar;
815 		return (0);
816 	}
817 
818 	/*
819 	 * Wait for ioctl completion or a timeout to occur. If we
820 	 * timeout we fail the i/o request.
821 	 */
822 	ownstat->mnum = ownpar->d.mnum;
823 	(void) gettimeofday(&tvs, NULL);
824 
825 	while ((rval == 0) && !(ownstat->flags & MD_MN_MM_RESULT)) {
826 		while ((rval = metaioctl(MD_MN_MM_OWNER_STATUS, ownstat,
827 		    &ownstat->mde, NULL)) != 0) {
828 			(void) gettimeofday(&tve, NULL);
829 			if ((tve.tv_sec - tvs.tv_sec) > OWNER_TIMEOUT) {
830 				rval = -1;
831 				break;
832 			}
833 			(void) sleep(1);
834 		}
835 	}
836 
837 	/* we did not not timeout but ioctl failed set rval */
838 
839 	if (rval == 0) {
840 		rval = (ownstat->flags & MD_MN_MM_RES_FAIL) ? -1 : 0;
841 	}
842 
843 	Free(ownstat);
844 	*opp = ownpar;
845 	return (rval);
846 }
847 /*
848  * special handling is required when running on a single node
849  * non-SC3.x environment.  This function determines tests
850  * for that case.
851  *
852  * Return values:
853  *	0 - no nodes or joined or in a SC3.x env
854  *	1 - 1 node and not in SC3.x env
855  */
856 
857 int
meta_mn_singlenode()858 meta_mn_singlenode()
859 {
860 	md_error_t			xep = mdnullerror;
861 	int				nodecnt;
862 	int				mnset_single_node = 0;
863 	mndiskset_membershiplist_t	*nl;
864 
865 	/*
866 	 * If running on SunCluster, then don't validate MN sets,
867 	 * this is done during a reconfig cycle since all nodes must
868 	 * take the same action.
869 	 *
870 	 * Only cleanup in case of a single node situation
871 	 * when not running on SunCluster.  This single node
872 	 * situation occurs when the nodelist only contains
873 	 * this node and the MN setrecords only contain this
874 	 * node.
875 	 */
876 	if (meta_read_nodelist(&nodecnt, &nl, &xep) == -1) {
877 		nodecnt = 0;  /* no nodes are alive */
878 		nl = NULL;
879 		mdclrerror(&xep);
880 	} else {
881 		/*
882 		 * If only 1 node in nodelist and not running
883 		 * on SunCluster, set single_node flag.
884 		 */
885 		if ((nodecnt == 1) &&
886 		    (strcmp(nl->msl_node_name, mynode()) == 0) &&
887 		    ((sdssc_bind_library()) != SDSSC_OKAY)) {
888 			mnset_single_node = 1;
889 		}
890 		meta_free_nodelist(nl);
891 	}
892 	return (mnset_single_node);
893 }
894 
895 /*
896  * FUNCTION:	meta_mn_send_get_tstate()
897  * INPUT:	dev	- dev_t of device
898  * OUTPUT:	tstatep - tstate value
899  *		ep	- return error pointer
900  * RETURNS:	return value from mdmn_send_message()
901  * PURPOSE:  Send a message to the master to get ui_tstate for a given device.
902  */
903 
904 int
meta_mn_send_get_tstate(md_dev64_t dev,uint_t * tstatep,md_error_t * ep)905 meta_mn_send_get_tstate(
906 	md_dev64_t		dev,
907 	uint_t			*tstatep,
908 	md_error_t		*ep
909 )
910 {
911 	int			result;
912 	md_mn_msg_gettstate_t	tstatemsg;
913 	md_mn_result_t		*resp = NULL;
914 	minor_t			mnum = meta_getminor(dev);
915 
916 	tstatemsg.gettstate_dev = dev;
917 	result = mdmn_send_message(MD_MIN2SET(mnum),
918 	    MD_MN_MSG_GET_TSTATE,
919 	    MD_MSGF_NO_LOG | MD_MSGF_NO_BCAST, 0,
920 	    (char *)&tstatemsg, sizeof (tstatemsg), &resp, ep);
921 
922 	if (result == 0)
923 		*tstatep = resp->mmr_exitval;
924 	else
925 		/* If some error occurred set tstate to 0 */
926 		*tstatep = 0;
927 
928 	if (resp != NULL) {
929 		free_result(resp);
930 	}
931 	return (result);
932 }
933