xref: /titanic_41/usr/src/lib/lvm/libmeta/common/meta_set_med.c (revision 7c478bd95313f5f23a4c958a745db2134aa03244)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 /*
30  * Metadevice diskset interfaces
31  */
32 
33 #include "meta_set_prv.h"
34 #include <sys/lvm/md_crc.h>
35 #include <sys/lvm/mdmed.h>
36 
37 #include <sys/sysevent/eventdefs.h>
38 #include <sys/sysevent/svm.h>
39 
40 #define	MALSIZ	32
41 
42 static int
add_lst(char *** listp,char * item)43 add_lst(char ***listp, char *item)
44 {
45 	int	i, j;
46 
47 	if (*listp) {
48 		for (i = 0; (*listp)[i]; i++)
49 			/* void */;
50 	} else {
51 		*listp = (char **)Zalloc(MALSIZ * sizeof (char *));
52 		i = 0;
53 	}
54 
55 	(*listp)[i] = Strdup(item);
56 
57 	if ((++i % MALSIZ) == 0) {
58 		*listp = (char **)Realloc((void *)*listp,
59 			(i + MALSIZ) * sizeof (char *));
60 		for (j = i; j < (i + MALSIZ); j++)
61 			(*listp)[j] = (char *)NULL;
62 	}
63 	return (i);
64 }
65 
/*
 * Free every string in the NULL-terminated vector *listp, free the vector
 * itself and reset *listp to NULL.
 *
 * Returns 1 if a list was released, 0 if *listp was already NULL.
 */
static int
del_lst(char ***listp)
{
	char	**entry;

	/* Nothing to release */
	if (*listp == NULL)
		return (0);

	/* Walk up to (not including) the NULL terminator */
	for (entry = *listp; *entry != NULL; entry++)
		free(*entry);

	free(*listp);
	*listp = NULL;

	return (1);
}
80 
81 
82 static int
validate_med_nodes(mdsetname_t * sp,md_h_arr_t * mhp,md_error_t * ep)83 validate_med_nodes(
84 	mdsetname_t	*sp,
85 	md_h_arr_t	*mhp,
86 	md_error_t	*ep
87 )
88 {
89 	char		*hostname;
90 	char		*nodename;
91 	char		*nm;
92 	char		*cp;
93 	int		i, j;
94 
95 
96 	for (i = 0; i < MED_MAX_HOSTS; i++) {
97 		if (mhp->n_lst[i].a_cnt == 0)
98 			continue;
99 
100 		for (j = 0; j < mhp->n_lst[i].a_cnt; j++) {
101 			nm = mhp->n_lst[i].a_nm[j];
102 
103 			for (cp = nm; *cp; cp++)
104 				if (!isprint(*cp) ||
105 				    strchr(INVALID_IN_NAMES, *cp) != NULL)
106 					return (mddserror(ep,
107 					    MDE_DS_INVALIDMEDNAME,
108 					    sp->setno, nm, NULL, sp->setname));
109 
110 			if (clnt_med_hostname(nm, &hostname, ep))
111 				return (-1);
112 
113 			if (j == 0) {
114 				if (strcmp(nm, hostname) != 0) {
115 					Free(hostname);
116 					return (mddserror(ep,
117 					    MDE_DS_NOTNODENAME, sp->setno, nm,
118 					    NULL, sp->setname));
119 				}
120 				nodename = nm;
121 			} else {
122 				if (strcmp(nodename, hostname) != 0) {
123 					Free(hostname);
124 					return (mddserror(ep,
125 					    MDE_DS_ALIASNOMATCH, sp->setno, nm,
126 					    nodename, sp->setname));
127 				}
128 			}
129 			Free(hostname);
130 		}
131 	}
132 	return (0);
133 }
134 
135 /*
136  * Exported Entry Points
137  */
138 
139 int
meta_set_addmeds(mdsetname_t * sp,int node_c,char ** node_v,md_error_t * ep)140 meta_set_addmeds(
141 	mdsetname_t		*sp,
142 	int			node_c,
143 	char			**node_v,
144 	md_error_t		*ep
145 )
146 {
147 	md_set_desc		*sd = NULL;
148 	md_drive_desc		*dd = NULL;
149 	mddb_med_parm_t		mp;
150 	mddb_med_upd_parm_t	mup;
151 	md_h_arr_t		t;
152 	md_h_arr_t		rb_t;
153 	med_rec_t		medr;
154 	med_rec_t		rb_medr;
155 	char			*cp;
156 	char			**n_l = NULL;
157 	int			n_c = 0;
158 	int			i, j;
159 	sigset_t		oldsigs;
160 	md_setkey_t		*cl_sk;
161 	int			rb_level = 0;
162 	md_error_t		xep = mdnullerror;
163 	int			rval = 0;
164 	int			max_meds;
165 	md_mnnode_desc		*nd;
166 	int			suspend1_flag = 0;
167 	int			lock_flag = 0;
168 
169 	/* Initialize */
170 	(void) memset(&t, '\0', sizeof (t));
171 	t.n_cnt = node_c;
172 	mdclrerror(ep);
173 
174 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
175 		return (-1);
176 
177 	/* Make sure we own the set */
178 	if (meta_check_ownership(sp, ep) != 0)
179 		return (-1);
180 
181 	if ((max_meds = get_max_meds(ep)) == 0)
182 		return (-1);
183 
184 	/*
185 	 * The mediator information (which is part of the set record) is
186 	 * stored in the local mddbs of each node in the diskset.
187 	 * Each node's rpc.metad daemon reads in the set
188 	 * records from that node's local mddb and caches them
189 	 * internally. Any process needing diskset information contacts its
190 	 * local rpc.metad to get this information.  Since each node in the
191 	 * diskset is independently reading the set information from its local
192 	 * mddb, the set records in the local mddbs must stay
193 	 * in-sync, so that all nodes have a consistent view of the diskset.
194 	 *
195 	 * For a multinode diskset, explicitly verify that all nodes in the
196 	 * diskset are ALIVE (i.e. are in the API membership list).  Otherwise,
197 	 * fail this operation since all nodes must be ALIVE in order to add
198 	 * the mediator information to the set record in their local mddb.
199 	 * If a panic of this node leaves the local mddbs set records
200 	 * out-of-sync, the reconfig cycle will fix the local mddbs and
201 	 * force them back into synchronization.
202 	 */
203 	if (MD_MNSET_DESC(sd)) {
204 		nd = sd->sd_nodelist;
205 		while (nd) {
206 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
207 				(void) mddserror(ep, MDE_DS_NOTINMEMBERLIST,
208 					sp->setno,
209 					nd->nd_nodename, NULL, sp->setname);
210 				return (-1);
211 			}
212 			nd = nd->nd_next;
213 		}
214 	}
215 
216 	/* Parse the command line into a the md_h_arr_t structure */
217 	for (i = 0; i < t.n_cnt; i++) {
218 		cp = strtok(node_v[i], ",");
219 		j = 0;
220 		while (cp) {
221 			if (strlen(cp) > (size_t)MD_MAX_NODENAME)
222 				return (mddserror(ep, MDE_DS_NODENAMETOOLONG,
223 				    sp->setno, cp, NULL, sp->setname));
224 			if (j >= MAX_HOST_ADDRS)
225 				return (mddserror(ep, MDE_DS_TOOMANYALIAS,
226 				    sp->setno, cp, NULL, sp->setname));
227 
228 			(void) strcpy(t.n_lst[i].a_nm[j], cp);
229 
230 			j++;
231 
232 			cp = strtok(NULL, ",");
233 		}
234 		t.n_lst[i].a_cnt = j;
235 	}
236 
237 	/* Make a list of nodes to check */
238 	for (i = 0; i < t.n_cnt; i++)
239 		for (j = 0; j < t.n_lst[i].a_cnt; j++)
240 			n_c = add_lst(&n_l, t.n_lst[i].a_nm[j]);
241 
242 	/* Make sure that there are no redundant nodes */
243 	rval = nodesuniq(sp, n_c, n_l, ep);
244 
245 	(void) del_lst(&n_l);
246 
247 	if (rval != 0)
248 		return (rval);
249 
250 	/*
251 	 * Lock the set on current set members.
252 	 * Set locking done much earlier for MN diskset than for traditional
253 	 * diskset since lock_set and SUSPEND are used to protect against
254 	 * other metaset commands running on the other nodes.
255 	 */
256 	if (MD_MNSET_DESC(sd)) {
257 		/* Make sure we are blocking all signals */
258 		if (procsigs(TRUE, &oldsigs, &xep) < 0)
259 			mdclrerror(&xep);
260 		nd = sd->sd_nodelist;
261 		/* All nodes are guaranteed to be ALIVE */
262 		while (nd) {
263 			if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
264 				rval = -1;
265 				goto out;
266 			}
267 			lock_flag = 1;
268 			nd = nd->nd_next;
269 		}
270 		/*
271 		 * Lock out other meta* commands by suspending
272 		 * class 1 messages across the diskset.
273 		 */
274 		nd = sd->sd_nodelist;
275 		/* All nodes are guaranteed to be ALIVE */
276 		while (nd) {
277 			if (clnt_mdcommdctl(nd->nd_nodename,
278 			    COMMDCTL_SUSPEND, sp, MD_MSG_CLASS1,
279 			    MD_MSCF_NO_FLAGS, ep)) {
280 				rval = -1;
281 				goto out;
282 			}
283 			suspend1_flag = 1;
284 			nd = nd->nd_next;
285 		}
286 	}
287 
288 	if (validate_med_nodes(sp, &t, ep)) {
289 		rval = -1;
290 		goto out;
291 	}
292 
293 	/* Check existing mediators against new, if any */
294 	if (sd->sd_med.n_cnt > 0) {
295 		for (i = 0; i < max_meds; i++)
296 			if (sd->sd_med.n_lst[i].a_cnt > 0)
297 				n_c = add_lst(&n_l,
298 				    sd->sd_med.n_lst[i].a_nm[0]);
299 
300 		for (i = 0; i < t.n_cnt; i++) {
301 			if (strinlst(t.n_lst[i].a_nm[0], n_c, n_l)) {
302 				(void) del_lst(&n_l);
303 				(void) mddserror(ep, MDE_DS_ISMED, sp->setno,
304 				    t.n_lst[i].a_nm[0], NULL,
305 				    sp->setname);
306 				rval = -1;
307 				goto out;
308 			}
309 		}
310 		(void) del_lst(&n_l);
311 	}
312 
313 	if ((t.n_cnt + sd->sd_med.n_cnt) > max_meds) {
314 		(void) mderror(ep, MDE_TOOMANYMED, NULL);
315 		rval = -1;
316 		goto out;
317 	}
318 
319 	/* Copy the current mediator list for rollback */
320 	rb_t = sd->sd_med;			/* structure assignment */
321 
322 	/* Setup the mediator record roll-back structure */
323 	(void) memset(&rb_medr, '\0', sizeof (med_rec_t));
324 	rb_medr.med_rec_mag = MED_REC_MAGIC;
325 	rb_medr.med_rec_rev = MED_REC_REV;
326 	rb_medr.med_rec_fl  = 0;
327 	rb_medr.med_rec_sn  = sp->setno;
328 	(void) strcpy(rb_medr.med_rec_snm, sp->setname);
329 	if (MD_MNSET_DESC(sd)) {
330 		/*
331 		 * For a MN diskset the mediator is not given a list of
332 		 * hosts in the set.  Instead a generic name (multiowner) is
333 		 * given to the mediator which will allow any node to access
334 		 * the mediator data as long as it provides the correct
335 		 * setname and set number.  In a MN diskset, the mediator
336 		 * data is only used when a first node joins the diskset
337 		 * and becomes the master of the MN diskset.
338 		 *
339 		 * The traditional diskset code keeps the host list in
340 		 * the mediator record up to date with respect to the host
341 		 * list in the traditional diskset.  This keeps an unauthorized
342 		 * node in the traditional diskset from accessing the data
343 		 * in the mediator record and being able to 'take' the
344 		 * diskset.
345 		 *
346 		 * This additional check is needed in the traditional diskset
347 		 * since a panic during the metaset command can leave
348 		 * the diskset with some nodes thinking that an
349 		 * action has occurred and other nodes thinking the opposite.
350 		 * A node may have really been removed from a diskset, but
351 		 * that node doesn't realize this so this node must be
352 		 * blocked from using the mediator data when attempting
353 		 * to 'take' the diskset.
354 		 * (Traditional diskset code has each node's rpc.metad
355 		 * cleaning up from an inconsistent state without any
356 		 * knowledge from the other nodes in the diskset).
357 		 *
358 		 * In the MN diskset, the reconfig steps force a consistent
359 		 * state across all nodes in the diskset, so no node
360 		 * needs to be blocked from accessing the mediator data.
361 		 * This allow the MN diskset to use a common 'nodename'
362 		 * in the mediator record.  This allows the mediator
363 		 * daemon to remain unchanged even though a large number of
364 		 * nodes are supported by the MN diskset.
365 		 */
366 		(void) strlcpy(rb_medr.med_rec_nodes[0], MED_MN_CALLER,
367 		    MD_MAX_NODENAME_PLUS_1);
368 	} else {
369 		for (i = 0; i < MD_MAXSIDES; i++)
370 			(void) strcpy(rb_medr.med_rec_nodes[i],
371 				sd->sd_nodes[i]);
372 	}
373 	rb_medr.med_rec_meds = sd->sd_med;	/* structure assigment */
374 	(void) memset(&rb_medr.med_rec_data, '\0', sizeof (med_data_t));
375 	rb_medr.med_rec_foff = 0;
376 	crcgen(&rb_medr, &rb_medr.med_rec_cks, sizeof (med_rec_t), NULL);
377 
378 	/* Merge new mediators into the set record */
379 	for (i = 0; i < t.n_cnt; i++) {
380 		for (j = 0; j < max_meds; j++) {
381 			if (sd->sd_med.n_lst[j].a_cnt > 0)
382 				continue;
383 			sd->sd_med.n_lst[j] = t.n_lst[i];
384 			SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_ADD, SVM_TAG_MEDIATOR,
385 			    sp->setno, j);
386 			sd->sd_med.n_cnt++;
387 			break;
388 		}
389 	}
390 
391 	/*
392 	 * Setup the kernel mediator list, which also validates that the
393 	 * hosts have valid IP addresses
394 	 */
395 	(void) memset(&mp, '\0', sizeof (mddb_med_parm_t));
396 	mp.med_setno = sp->setno;
397 
398 	/* Copy the hostnames */
399 	if (meta_h2hi(&sd->sd_med, &mp.med, ep)) {
400 		rval = -1;
401 		goto out;
402 	}
403 
404 	/* Resolve the IP addresses for the host list */
405 	if (meta_med_hnm2ip(&mp.med, ep)) {
406 		rval = -1;
407 		goto out;
408 	}
409 
410 	/* Bring the mediator record up to date with the set record */
411 	medr = rb_medr;				/* structure assignment */
412 	medr.med_rec_meds = sd->sd_med;		/* structure assigment */
413 	crcgen(&medr, &medr.med_rec_cks, sizeof (med_rec_t), NULL);
414 
415 	/* END CHECK CODE */
416 
417 	/* Lock the set on current set members */
418 	if (!(MD_MNSET_DESC(sd))) {
419 		/* all signals already blocked for MN disket */
420 		md_rb_sig_handling_on();
421 		for (i = 0; i < MD_MAXSIDES; i++) {
422 			/* Skip empty slots */
423 			if (sd->sd_nodes[i][0] == '\0')
424 				continue;
425 
426 			if (clnt_lock_set(sd->sd_nodes[i], sp, ep)) {
427 				rval = -1;
428 				goto out;
429 			}
430 			lock_flag = 1;
431 		}
432 	}
433 
434 	RB_TEST(1, "meta_set_addmeds", ep)
435 
436 	RB_PREEMPT;
437 	rb_level = 1;	/* level 1 */
438 
439 	RB_TEST(2, "meta_set_addmeds", ep)
440 
441 	/*
442 	 * Add the new mediator information to all hosts in the set.
443 	 * For MN diskset, each node sends mediator list to its kernel.
444 	 */
445 	if (MD_MNSET_DESC(sd)) {
446 		nd = sd->sd_nodelist;
447 		while (nd) {
448 			/* All nodes are guaranteed to be ALIVE */
449 			if (clnt_updmeds(nd->nd_nodename, sp, &sd->sd_med, ep))
450 				goto rollback;
451 			nd = nd->nd_next;
452 		}
453 	} else  {
454 		for (i = 0; i < MD_MAXSIDES; i++) {
455 			/* Skip empty slots */
456 			if (sd->sd_nodes[i][0] == '\0')
457 				continue;
458 
459 			if (clnt_updmeds(sd->sd_nodes[i], sp, &sd->sd_med, ep))
460 				goto rollback;
461 		}
462 	}
463 
464 	RB_TEST(3, "meta_set_addmeds", ep)
465 
466 	RB_PREEMPT;
467 	rb_level = 2;	/* level 2 */
468 
469 	RB_TEST(4, "meta_set_addmeds", ep)
470 
471 	if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST),
472 	    ep)) == NULL) {
473 		if (! mdisok(ep))
474 			goto rollback;
475 	}
476 
477 	RB_TEST(5, "meta_set_addmeds", ep)
478 
479 	RB_PREEMPT;
480 	rb_level = 3;	/* level 3 */
481 
482 	RB_TEST(6, "meta_set_addmeds", ep)
483 
484 	/* Inform the mediator hosts of the new information */
485 	for (i = 0; i < max_meds; i++) {
486 		if (sd->sd_med.n_lst[i].a_cnt == 0)
487 			continue;
488 
489 		/* medr contains new mediator node list */
490 		if (clnt_med_upd_rec(&sd->sd_med.n_lst[i], sp, &medr, ep))
491 			goto rollback;
492 	}
493 
494 	RB_TEST(7, "meta_set_addmeds", ep)
495 
496 	RB_PREEMPT;
497 	rb_level = 4;	/* level 4 */
498 
499 	RB_TEST(8, "meta_set_addmeds", ep)
500 
501 	/* In MN diskset, mediator list updated in clnt_updmeds call */
502 	if (dd != NULL) {
503 		if (!(MD_MNSET_DESC(sd))) {
504 			if (metaioctl(MD_MED_SET_LST, &mp, &mp.med_mde,
505 			    NULL) != 0) {
506 				(void) mdstealerror(ep, &mp.med_mde);
507 				goto rollback;
508 			}
509 		}
510 
511 		/*
512 		 * If only 50% mddbs available, mediator will be
513 		 * golden by this ioctl on a traditional diskset.
514 		 *
515 		 * On a MN disket, this only happens if the mediator
516 		 * add operation is executed on the master node.
517 		 * If a slave node is adding the mediator, the mediator
518 		 * won't be marked golden until the next mddb change.
519 		 */
520 		(void) memset(&mup, '\0', sizeof (mddb_med_upd_parm_t));
521 		mup.med_setno = sp->setno;
522 		if (metaioctl(MD_MED_UPD_MED, &mup, &mup.med_mde, NULL) != 0)
523 			mdclrerror(&mup.med_mde);
524 	}
525 
526 out:
527 	if (suspend1_flag) {
528 		/*
529 		 * Unlock diskset by resuming messages across the diskset.
530 		 * Just resume all classes so that resume is the same whether
531 		 * just one class was locked or all classes were locked.
532 		 */
533 		nd = sd->sd_nodelist;
534 		/* All nodes are guaranteed to be ALIVE */
535 		while (nd) {
536 			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
537 			    sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) {
538 				if (rval == 0)
539 					(void) mdstealerror(ep, &xep);
540 				rval = -1;
541 				mde_perror(ep, dgettext(TEXT_DOMAIN,
542 				    "Unable to resume rpc.mdcommd.\n"));
543 			}
544 			nd = nd->nd_next;
545 		}
546 		meta_ping_mnset(sp->setno);
547 	}
548 	if (lock_flag) {
549 		cl_sk = cl_get_setkey(sp->setno, sp->setname);
550 		if (MD_MNSET_DESC(sd)) {
551 			nd = sd->sd_nodelist;
552 			while (nd) {
553 				/* All nodes are guaranteed to be ALIVE */
554 				if (clnt_unlock_set(nd->nd_nodename,
555 				    cl_sk, &xep)) {
556 					if (rval == 0)
557 						(void) mdstealerror(ep, &xep);
558 					rval = -1;
559 				}
560 				nd = nd->nd_next;
561 			}
562 		} else  {
563 			for (i = 0; i < MD_MAXSIDES; i++) {
564 				/* Skip empty slots */
565 				if (sd->sd_nodes[i][0] == '\0')
566 					continue;
567 
568 				if (clnt_unlock_set(sd->sd_nodes[i],
569 				    cl_sk, &xep)) {
570 					if (rval == 0)
571 						(void) mdstealerror(ep, &xep);
572 					rval = -1;
573 				}
574 			}
575 		}
576 		cl_set_setkey(NULL);
577 	}
578 
579 	metafreedrivedesc(&dd);
580 
581 	if (MD_MNSET_DESC(sd)) {
582 		/* release signals back to what they were on entry */
583 		if (procsigs(FALSE, &oldsigs, &xep) < 0)
584 			mdclrerror(&xep);
585 	} else {
586 		md_rb_sig_handling_off(md_got_sig(), md_which_sig());
587 	}
588 
589 	return (rval);
590 
591 rollback:
592 	/* all signals already blocked for MN disket */
593 	if (!(MD_MNSET_DESC(sd))) {
594 		if (procsigs(TRUE, &oldsigs, &xep) < 0)
595 			mdclrerror(&xep);
596 	}
597 
598 	rval = -1;
599 
600 	/*
601 	 * level 4
602 	 * In MN diskset, mediator list updated in clnt_updmeds call
603 	 */
604 	if (rb_level > 3 && (dd != NULL) && (!(MD_MNSET_DESC(sd)))) {
605 		(void) memset(&mp, '\0', sizeof (mddb_med_parm_t));
606 		mp.med_setno = sp->setno;
607 		(void) meta_h2hi(&rb_t, &mp.med, &xep);
608 		mdclrerror(&xep);
609 		(void) meta_med_hnm2ip(&mp.med, &xep);
610 		mdclrerror(&xep);
611 		(void) metaioctl(MD_MED_SET_LST, &mp, &mp.med_mde, NULL);
612 	}
613 
614 	/* level 3 */
615 	if (rb_level > 2) {
616 		for (i = 0; i < max_meds; i++) {
617 			if (sd->sd_med.n_lst[i].a_cnt == 0)
618 				continue;
619 
620 			/*
621 			 * rb_medr contains the rollback mediator node list.
622 			 * Send the rollback mediator information to the
623 			 * new mediator node list.  If a node had this RPC
624 			 * called, but its node is not in the mediator node
625 			 * list, rpc.metamedd will delete the mediator
626 			 * record on that node.
627 			 */
628 			if (clnt_med_upd_rec(&sd->sd_med.n_lst[i], sp,
629 			    &rb_medr, &xep))
630 				mdclrerror(&xep);
631 		}
632 	}
633 
634 	/* level 2 */
635 	if (rb_level > 1) {
636 		metafreedrivedesc(&dd);
637 	}
638 
639 	/* level 1 */
640 	if (rb_level > 0) {
641 		/* Delete mediator information from all hosts in the set */
642 		if (MD_MNSET_DESC(sd)) {
643 			nd = sd->sd_nodelist;
644 			while (nd) {
645 				/* All nodes are guaranteed to be ALIVE */
646 				if (clnt_updmeds(nd->nd_nodename, sp, &rb_t,
647 				    &xep))
648 					mdclrerror(&xep);
649 				nd = nd->nd_next;
650 			}
651 		} else  {
652 			for (i = 0; i < MD_MAXSIDES; i++) {
653 				/* Skip empty slots */
654 				if (sd->sd_nodes[i][0] == '\0')
655 					continue;
656 
657 				if (clnt_updmeds(sd->sd_nodes[i], sp, &rb_t,
658 				    &xep))
659 					mdclrerror(&xep);
660 			}
661 		}
662 	}
663 
664 	/* level 0 */
665 	if (suspend1_flag) {
666 		/*
667 		 * Unlock diskset by resuming messages across the diskset.
668 		 * Just resume all classes so that resume is the same whether
669 		 * just one class was locked or all classes were locked.
670 		 */
671 		nd = sd->sd_nodelist;
672 		/* All nodes are guaranteed to be ALIVE */
673 		while (nd) {
674 			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
675 			    sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) {
676 				mdclrerror(&xep);
677 				mde_perror(ep, dgettext(TEXT_DOMAIN,
678 				    "Unable to resume rpc.mdcommd.\n"));
679 			}
680 			nd = nd->nd_next;
681 		}
682 		meta_ping_mnset(sp->setno);
683 	}
684 	if (lock_flag) {
685 		cl_sk = cl_get_setkey(sp->setno, sp->setname);
686 		if (MD_MNSET_DESC(sd)) {
687 			nd = sd->sd_nodelist;
688 			while (nd) {
689 				/* All nodes are guaranteed to be ALIVE */
690 				if (clnt_unlock_set(nd->nd_nodename,
691 				    cl_sk, &xep)) {
692 					mdclrerror(&xep);
693 				}
694 				nd = nd->nd_next;
695 			}
696 		} else  {
697 			for (i = 0; i < MD_MAXSIDES; i++) {
698 				/* Skip empty slots */
699 				if (sd->sd_nodes[i][0] == '\0')
700 					continue;
701 
702 				if (clnt_unlock_set(sd->sd_nodes[i],
703 				    cl_sk, &xep)) {
704 					mdclrerror(&xep);
705 				}
706 			}
707 		}
708 		cl_set_setkey(NULL);
709 	}
710 
711 	/* release signals back to what they were on entry */
712 	if (procsigs(FALSE, &oldsigs, &xep) < 0)
713 		mdclrerror(&xep);
714 
715 	if (!(MD_MNSET_DESC(sd))) {
716 		md_rb_sig_handling_off(md_got_sig(), md_which_sig());
717 	}
718 
719 	return (rval);
720 }
721 
722 int
meta_set_deletemeds(mdsetname_t * sp,int node_c,char ** node_v,int forceflg,md_error_t * ep)723 meta_set_deletemeds(
724 	mdsetname_t		*sp,
725 	int			node_c,
726 	char			**node_v,
727 	int			forceflg,
728 	md_error_t		*ep
729 )
730 {
731 	md_set_desc		*sd = NULL;
732 	md_drive_desc		*dd = NULL;
733 	mddb_med_parm_t		mp;
734 	md_h_arr_t		rb_t;
735 	med_rec_t		medr;
736 	med_rec_t		rb_medr;
737 	int			i, j;
738 	char			**n_l = NULL;
739 	int			n_c = 0;
740 	sigset_t		oldsigs;
741 	md_setkey_t		*cl_sk;
742 	int			rb_level = 0;
743 	md_error_t		xep = mdnullerror;
744 	int			rval = 0;
745 	int			max_meds;
746 	md_mnnode_desc		*nd;
747 	int			suspend1_flag = 0;
748 	int			lock_flag = 0;
749 
750 	mdclrerror(ep);
751 
752 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
753 		return (-1);
754 
755 	/* Make sure we own the set */
756 	if (meta_check_ownership(sp, ep) != 0)
757 		return (-1);
758 
759 	for (i = 0; i < node_c; i++)
760 		if (strchr(node_v[i], ',') != NULL)
761 			return (mderror(ep, MDE_ONLYNODENAME, node_v[i]));
762 
763 	if (nodesuniq(sp, node_c, node_v, ep))
764 		return (-1);
765 
766 	if ((max_meds = get_max_meds(ep)) == 0)
767 		return (-1);
768 
769 	/*
770 	 * The mediator information (which is part of the set record) is
771 	 * stored in the local mddbs of each node in the diskset.
772 	 * Each node's rpc.metad daemon reads in the set
773 	 * records from that node's local mddb and caches them
774 	 * internally. Any process needing diskset information contacts its
775 	 * local rpc.metad to get this information.  Since each node in the
776 	 * diskset is independently reading the set information from its local
777 	 * mddb, the set records in the local mddbs must stay
778 	 * in-sync, so that all nodes have a consistent view of the diskset.
779 	 *
780 	 * For a multinode diskset, explicitly verify that all nodes in the
781 	 * diskset are ALIVE (i.e. are in the API membership list).  Otherwise,
782 	 * fail this operation since all nodes must be ALIVE in order to delete
783 	 * the mediator information from the set record in their local mddb.
784 	 * If a panic of this node leaves the local mddbs set records
785 	 * out-of-sync, the reconfig cycle will fix the local mddbs and
786 	 * force them back into synchronization.
787 	 */
788 	if (MD_MNSET_DESC(sd)) {
789 		nd = sd->sd_nodelist;
790 		while (nd) {
791 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
792 				(void) mddserror(ep, MDE_DS_NOTINMEMBERLIST,
793 					sp->setno,
794 					nd->nd_nodename, NULL, sp->setname);
795 				return (-1);
796 			}
797 			nd = nd->nd_next;
798 		}
799 	}
800 
801 	if (sd->sd_med.n_cnt == 0)
802 		return (mderror(ep, MDE_NOMED, NULL));
803 
804 	/* Make a list of nodes to check */
805 	for (i = 0; i < max_meds; i++)
806 		if (sd->sd_med.n_lst[i].a_cnt > 0)
807 			n_c = add_lst(&n_l, sd->sd_med.n_lst[i].a_nm[0]);
808 
809 	for (i = 0; i < node_c; i++) {
810 		if (! strinlst(node_v[i], n_c, n_l)) {
811 			(void) del_lst(&n_l);
812 			return (mddserror(ep, MDE_DS_ISNOTMED, sp->setno,
813 			    node_v[i], NULL, sp->setname));
814 		}
815 	}
816 
817 	(void) del_lst(&n_l);
818 
819 	/* Save a copy of the current mediator information */
820 	rb_t = sd->sd_med;			/* structure assignment */
821 
822 	/* Setup the mediator record for rollback */
823 	(void) memset(&rb_medr, '\0', sizeof (med_rec_t));
824 	rb_medr.med_rec_mag = MED_REC_MAGIC;
825 	rb_medr.med_rec_rev = MED_REC_REV;
826 	rb_medr.med_rec_fl  = 0;
827 	rb_medr.med_rec_sn  = sp->setno;
828 	(void) strcpy(rb_medr.med_rec_snm, sp->setname);
829 	if (MD_MNSET_DESC(sd)) {
830 		/*
831 		 * In MN diskset, use a generic nodename, multiowner, in the
832 		 * mediator record which allows any node to access mediator
833 		 * information.  MN diskset reconfig cycle forces consistent
834 		 * view of set/node/drive/mediator information across all nodes
835 		 * in the MN diskset.  This allows the relaxation of
836 		 * node name checking in rpc.metamedd for MN disksets.
837 		 *
838 		 * In the traditional diskset, only a node that is in the
839 		 * mediator record's diskset nodelist can access mediator
840 		 * data.
841 		 */
842 		(void) strlcpy(rb_medr.med_rec_nodes[0], MED_MN_CALLER,
843 		    MD_MAX_NODENAME_PLUS_1);
844 	} else {
845 		for (i = 0; i < MD_MAXSIDES; i++)
846 			(void) strcpy(rb_medr.med_rec_nodes[i],
847 				sd->sd_nodes[i]);
848 	}
849 	rb_medr.med_rec_meds = sd->sd_med;	/* structure assignment */
850 	(void) memset(&rb_medr.med_rec_data, '\0', sizeof (med_data_t));
851 	rb_medr.med_rec_foff = 0;
852 	crcgen(&rb_medr, &rb_medr.med_rec_cks, sizeof (med_rec_t), NULL);
853 
854 	/* Delete the mediators requested from the set */
855 	for (i = 0; i < node_c; i++) {
856 		for (j = 0; j < max_meds; j++) {
857 			if (sd->sd_med.n_lst[j].a_cnt == 0)
858 				continue;
859 			if (strcmp(node_v[i],
860 			    sd->sd_med.n_lst[j].a_nm[0]) != 0)
861 				continue;
862 			SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_REMOVE,
863 			    SVM_TAG_MEDIATOR, sp->setno, j);
864 			(void) memset(&sd->sd_med.n_lst[j], '\0',
865 			    sizeof (md_h_t));
866 			sd->sd_med.n_cnt--;
867 			break;
868 		}
869 	}
870 
871 	medr = rb_medr;				/* structure assignment */
872 	medr.med_rec_meds = sd->sd_med;		/* structure assignment */
873 	crcgen(&medr, &medr.med_rec_cks, sizeof (med_rec_t), NULL);
874 
875 	/* END CHECK CODE */
876 
877 	/* Lock the set on current set members */
878 	if (MD_MNSET_DESC(sd)) {
879 		/* Make sure we are blocking all signals */
880 		if (procsigs(TRUE, &oldsigs, &xep) < 0)
881 			mdclrerror(&xep);
882 		/*
883 		 * Lock the set on current set members.
884 		 * lock_set and SUSPEND are used to protect against
885 		 * other metaset commands running on the other nodes.
886 		 */
887 		nd = sd->sd_nodelist;
888 		while (nd) {
889 			/* All nodes are guaranteed to be ALIVE */
890 			if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
891 				if (forceflg && strcmp(mynode(),
892 				    nd->nd_nodename) != 0) {
893 					mdclrerror(ep);
894 					nd = nd->nd_next;
895 					continue;
896 				}
897 				rval = -1;
898 				goto out;
899 			}
900 			lock_flag = 1;
901 			nd = nd->nd_next;
902 		}
903 		/*
904 		 * Lock out other meta* commands by suspending
905 		 * class 1 messages across the diskset.
906 		 */
907 		nd = sd->sd_nodelist;
908 		/* All nodes are guaranteed to be ALIVE */
909 		while (nd) {
910 			if (clnt_mdcommdctl(nd->nd_nodename,
911 			    COMMDCTL_SUSPEND, sp, MD_MSG_CLASS1,
912 			    MD_MSCF_NO_FLAGS, ep)) {
913 				rval = -1;
914 				goto out;
915 			}
916 			suspend1_flag = 1;
917 			nd = nd->nd_next;
918 		}
919 	} else  {
920 		md_rb_sig_handling_on();
921 		for (i = 0; i < MD_MAXSIDES; i++) {
922 			/* Skip empty slots */
923 			if (sd->sd_nodes[i][0] == '\0')
924 				continue;
925 
926 			if (clnt_lock_set(sd->sd_nodes[i], sp, ep)) {
927 				if (forceflg &&
928 				    strcmp(mynode(), sd->sd_nodes[i]) != 0) {
929 					mdclrerror(ep);
930 					continue;
931 				}
932 				rval = -1;
933 				goto out;
934 			}
935 			lock_flag = 1;
936 		}
937 	}
938 
939 	RB_TEST(1, "meta_set_deletemeds", ep)
940 
941 	RB_PREEMPT;
942 	rb_level = 1;	/* level 1 */
943 
944 	RB_TEST(2, "meta_set_deletemeds", ep)
945 
946 	/* Update the mediator information on all hosts in the set */
947 	if (MD_MNSET_DESC(sd)) {
948 		nd = sd->sd_nodelist;
949 		while (nd) {
950 			/* All nodes are guaranteed to be ALIVE */
951 			if (clnt_updmeds(nd->nd_nodename, sp, &sd->sd_med,
952 			    ep)) {
953 				if (forceflg && strcmp(mynode(),
954 				    nd->nd_nodename) != 0) {
955 					mdclrerror(ep);
956 					continue;
957 				}
958 				goto rollback;
959 			}
960 			nd = nd->nd_next;
961 		}
962 	} else  {
963 		for (i = 0; i < MD_MAXSIDES; i++) {
964 			/* Skip empty slots */
965 			if (sd->sd_nodes[i][0] == '\0')
966 				continue;
967 
968 			if (clnt_updmeds(sd->sd_nodes[i], sp, &sd->sd_med,
969 			    ep)) {
970 				if (forceflg && strcmp(mynode(),
971 				    sd->sd_nodes[i]) != 0) {
972 					mdclrerror(ep);
973 					continue;
974 				}
975 				goto rollback;
976 			}
977 		}
978 	}
979 
980 	RB_TEST(3, "meta_set_deletemeds", ep)
981 
982 	RB_PREEMPT;
983 	rb_level = 2;	/* level 2 */
984 
985 	RB_TEST(5, "meta_set_deletemeds", ep)
986 
987 	if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST),
988 	    ep)) == NULL) {
989 		if (! mdisok(ep))
990 			goto rollback;
991 	}
992 
993 	RB_TEST(5, "meta_set_deletemeds", ep)
994 
995 	RB_PREEMPT;
996 	rb_level = 3;	/* level 3 */
997 
998 	RB_TEST(6, "meta_set_deletemeds", ep)
999 
1000 	if (dd != NULL) {
1001 		/*
1002 		 * Set up the parameters to the call to update the
1003 		 * kernel mediator list
1004 		 */
1005 		(void) memset(&mp, '\0', sizeof (mddb_med_parm_t));
1006 		mp.med_setno = sp->setno;
1007 		if (meta_h2hi(&sd->sd_med, &mp.med, ep))
1008 			goto rollback;
1009 
1010 		/* Resolve the IP addresses for the host list */
1011 		if (meta_med_hnm2ip(&mp.med, ep))
1012 			goto rollback;
1013 
1014 		if (metaioctl(MD_MED_SET_LST, &mp, &mp.med_mde, NULL) != 0) {
1015 			(void) mdstealerror(ep, &mp.med_mde);
1016 			goto rollback;
1017 		}
1018 	}
1019 
1020 	RB_TEST(7, "meta_set_deletemeds", ep)
1021 
1022 	RB_PREEMPT;
1023 	rb_level = 4;	/* level 4 */
1024 
1025 	RB_TEST(8, "meta_set_deletemeds", ep)
1026 
1027 	/* Inform the mediator hosts of the new status */
1028 	for (i = 0; i < max_meds; i++) {
1029 		if (rb_t.n_lst[i].a_cnt == 0)
1030 			continue;
1031 
1032 		/*
1033 		 * medr contains the new mediator node list.
1034 		 * Send the new mediator information to the
1035 		 * new mediator node list.  If a node had this RPC
1036 		 * called, but its node is no longer in the new mediator
1037 		 * node list, rpc.metamedd will delete the mediator
1038 		 * record on that node.
1039 		 */
1040 		if (clnt_med_upd_rec(&rb_t.n_lst[i], sp, &medr, ep)) {
1041 			if ((forceflg && mdanyrpcerror(ep)) ||
1042 			    mdisrpcerror(ep, RPC_PROGNOTREGISTERED)) {
1043 				mdclrerror(ep);
1044 				continue;
1045 			}
1046 			goto rollback;
1047 		}
1048 	}
1049 
1050 out:
1051 	if (dd)
1052 		metafreedrivedesc(&dd);
1053 
1054 	if (suspend1_flag) {
1055 		/*
1056 		 * Unlock diskset by resuming messages across the diskset.
1057 		 * Just resume all classes so that resume is the same whether
1058 		 * just one class was locked or all classes were locked.
1059 		 */
1060 		nd = sd->sd_nodelist;
1061 		/* All nodes are guaranteed to be ALIVE */
1062 		while (nd) {
1063 			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
1064 			    sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) {
1065 				if (rval == 0)
1066 					(void) mdstealerror(ep, &xep);
1067 				rval = -1;
1068 				mde_perror(ep, dgettext(TEXT_DOMAIN,
1069 				    "Unable to resume rpc.mdcommd.\n"));
1070 			}
1071 			nd = nd->nd_next;
1072 		}
1073 		meta_ping_mnset(sp->setno);
1074 	}
1075 
1076 	cl_sk = cl_get_setkey(sp->setno, sp->setname);
1077 	if (lock_flag) {
1078 		if (MD_MNSET_DESC(sd)) {
1079 			nd = sd->sd_nodelist;
1080 			while (nd) {
1081 				/* All nodes are guaranteed to be ALIVE */
1082 				if (clnt_unlock_set(nd->nd_nodename,
1083 				    cl_sk, &xep)) {
1084 					if (forceflg &&
1085 					    strcmp(mynode(),
1086 					    nd->nd_nodename) != 0) {
1087 						mdclrerror(ep);
1088 						continue;
1089 					}
1090 					if (rval == 0)
1091 						(void) mdstealerror(ep, &xep);
1092 					rval = -1;
1093 				}
1094 				nd = nd->nd_next;
1095 			}
1096 		} else {
1097 			for (i = 0; i < MD_MAXSIDES; i++) {
1098 				/* Skip empty slots */
1099 				if (sd->sd_nodes[i][0] == '\0')
1100 					continue;
1101 
1102 				if (clnt_unlock_set(sd->sd_nodes[i],
1103 				    cl_sk, &xep)) {
1104 					if (forceflg &&
1105 					    strcmp(mynode(),
1106 					    sd->sd_nodes[i]) != 0) {
1107 						mdclrerror(ep);
1108 						continue;
1109 					}
1110 					if (rval == 0)
1111 						(void) mdstealerror(ep, &xep);
1112 					rval = -1;
1113 				}
1114 			}
1115 		}
1116 	}
1117 	cl_set_setkey(NULL);
1118 
1119 	if (MD_MNSET_DESC(sd)) {
1120 		/* release signals back to what they were on entry */
1121 		if (procsigs(FALSE, &oldsigs, &xep) < 0)
1122 			mdclrerror(&xep);
1123 	} else {
1124 		md_rb_sig_handling_off(md_got_sig(), md_which_sig());
1125 	}
1126 
1127 	return (rval);
1128 
1129 rollback:
1130 	/* all signals already blocked for MN diskset */
1131 	if (!(MD_MNSET_DESC(sd))) {
1132 		if (procsigs(TRUE, &oldsigs, &xep) < 0)
1133 			mdclrerror(&xep);
1134 	}
1135 
1136 	rval = -1;
1137 
1138 	(void) del_lst(&n_l);
1139 
1140 	/* level 4 */
1141 	if (rb_level > 4) {
1142 		for (i = 0; i < max_meds; i++) {
1143 			if (rb_t.n_lst[i].a_cnt == 0)
1144 				continue;
1145 
1146 			/*
1147 			 * rb_medr contains the rollback mediator node list.
1148 			 * Send the rollback mediator information to the
1149 			 * new mediator node list.  This will recreate the
1150 			 * mediator record on all nodes where the mediator
1151 			 * record had been removed.
1152 			 */
1153 			if (clnt_med_upd_rec(&rb_t.n_lst[i], sp, &rb_medr,
1154 			    &xep))
1155 				mdclrerror(&xep);
1156 		}
1157 	}
1158 
1159 	/* level 3 */
1160 	if (rb_level > 2 && dd != NULL) {
1161 		(void) memset(&mp, '\0', sizeof (mddb_med_parm_t));
1162 		mp.med_setno = sp->setno;
1163 		(void) meta_h2hi(&rb_t, &mp.med, &xep);
1164 		mdclrerror(&xep);
1165 		(void) meta_med_hnm2ip(&mp.med, &xep);
1166 		mdclrerror(&xep);
1167 		(void) metaioctl(MD_MED_SET_LST, &mp, &mp.med_mde, NULL);
1168 	}
1169 
1170 	/* level 2 */
1171 	if (rb_level > 1) {
1172 		metafreedrivedesc(&dd);
1173 	}
1174 
1175 	/* level 1 */
1176 	if (rb_level > 0) {
1177 		/* Push the rollback mediator list to all hosts in the set */
1178 		if (MD_MNSET_DESC(sd)) {
1179 			nd = sd->sd_nodelist;
1180 			while (nd) {
1181 				/* All nodes are guaranteed to be ALIVE */
1182 				if (clnt_updmeds(nd->nd_nodename, sp, &rb_t,
1183 				    &xep))
1184 					mdclrerror(&xep);
1185 				nd = nd->nd_next;
1186 			}
1187 		} else  {
1188 			for (i = 0; i < MD_MAXSIDES; i++) {
1189 				/* Skip empty slots */
1190 				if (sd->sd_nodes[i][0] == '\0')
1191 					continue;
1192 
1193 				if (clnt_updmeds(sd->sd_nodes[i], sp, &rb_t,
1194 				    &xep))
1195 					mdclrerror(&xep);
1196 			}
1197 		}
1198 	}
1199 
1200 	/* level 0 */
1201 	cl_sk = cl_get_setkey(sp->setno, sp->setname);
1202 	/* Unlock the set */
1203 	/* Don't test lock flag since guaranteed to be set if in rollback */
1204 	if (MD_MNSET_DESC(sd)) {
1205 		/*
1206 		 * Unlock diskset by resuming messages across the diskset.
1207 		 * Just resume all classes so that resume is the same whether
1208 		 * just one class was locked or all classes were locked.
1209 		 */
1210 		if (suspend1_flag) {
1211 			/* All nodes are guaranteed to be ALIVE */
1212 			nd = sd->sd_nodelist;
1213 			while (nd) {
1214 				if (clnt_mdcommdctl(nd->nd_nodename,
1215 				    COMMDCTL_RESUME, sp, MD_MSG_CLASS0,
1216 				    MD_MSCF_NO_FLAGS, &xep)) {
1217 				    mde_perror(&xep, dgettext(TEXT_DOMAIN,
1218 					"Unable to resume rpc.mdcommd.\n"));
1219 				    mdclrerror(&xep);
1220 				}
1221 				nd = nd->nd_next;
1222 			}
1223 			meta_ping_mnset(sp->setno);
1224 		}
1225 		nd = sd->sd_nodelist;
1226 		/* All nodes are guaranteed to be ALIVE */
1227 		while (nd) {
1228 			if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep))
1229 				mdclrerror(&xep);
1230 			nd = nd->nd_next;
1231 		}
1232 	} else  {
1233 		for (i = 0; i < MD_MAXSIDES; i++) {
1234 			/* Skip empty slots */
1235 			if (sd->sd_nodes[i][0] == '\0')
1236 				continue;
1237 
1238 			if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, &xep))
1239 				mdclrerror(&xep);
1240 		}
1241 	}
1242 	cl_set_setkey(NULL);
1243 
1244 	/* release signals back to what they were on entry */
1245 	if (procsigs(FALSE, &oldsigs, &xep) < 0)
1246 		mdclrerror(&xep);
1247 
1248 	if (!(MD_MNSET_DESC(sd))) {
1249 		md_rb_sig_handling_off(md_got_sig(), md_which_sig());
1250 	}
1251 
1252 	return (rval);
1253 }
1254