1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License, Version 1.0 only
6 * (the "License"). You may not use this file except in compliance
7 * with the License.
8 *
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or http://www.opensolaris.org/os/licensing.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
13 *
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
19 *
20 * CDDL HEADER END
21 */
22 /*
23 * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
25 */
26
27 #pragma ident "%Z%%M% %I% %E% SMI"
28
29 /*
30 * Metadevice diskset interfaces
31 */
32
33 #include "meta_set_prv.h"
34 #include <sys/lvm/md_crc.h>
35 #include <sys/lvm/mdmed.h>
36
37 #include <sys/sysevent/eventdefs.h>
38 #include <sys/sysevent/svm.h>
39
40 #define MALSIZ 32
41
42 static int
add_lst(char *** listp,char * item)43 add_lst(char ***listp, char *item)
44 {
45 int i, j;
46
47 if (*listp) {
48 for (i = 0; (*listp)[i]; i++)
49 /* void */;
50 } else {
51 *listp = (char **)Zalloc(MALSIZ * sizeof (char *));
52 i = 0;
53 }
54
55 (*listp)[i] = Strdup(item);
56
57 if ((++i % MALSIZ) == 0) {
58 *listp = (char **)Realloc((void *)*listp,
59 (i + MALSIZ) * sizeof (char *));
60 for (j = i; j < (i + MALSIZ); j++)
61 (*listp)[j] = (char *)NULL;
62 }
63 return (i);
64 }
65
/*
 * Release a NULL-terminated string vector: every string up to the
 * first NULL slot is freed, then the vector itself, and *listp is
 * reset to NULL.
 *
 * Returns 1 if a list was freed, 0 if *listp was already NULL.
 */
static int
del_lst(char ***listp)
{
	char	**vec = *listp;
	char	**slot;

	/* Nothing to release */
	if (vec == NULL)
		return (0);

	for (slot = vec; *slot != NULL; slot++)
		free(*slot);

	free(vec);
	*listp = NULL;

	return (1);
}
80
81
82 static int
validate_med_nodes(mdsetname_t * sp,md_h_arr_t * mhp,md_error_t * ep)83 validate_med_nodes(
84 mdsetname_t *sp,
85 md_h_arr_t *mhp,
86 md_error_t *ep
87 )
88 {
89 char *hostname;
90 char *nodename;
91 char *nm;
92 char *cp;
93 int i, j;
94
95
96 for (i = 0; i < MED_MAX_HOSTS; i++) {
97 if (mhp->n_lst[i].a_cnt == 0)
98 continue;
99
100 for (j = 0; j < mhp->n_lst[i].a_cnt; j++) {
101 nm = mhp->n_lst[i].a_nm[j];
102
103 for (cp = nm; *cp; cp++)
104 if (!isprint(*cp) ||
105 strchr(INVALID_IN_NAMES, *cp) != NULL)
106 return (mddserror(ep,
107 MDE_DS_INVALIDMEDNAME,
108 sp->setno, nm, NULL, sp->setname));
109
110 if (clnt_med_hostname(nm, &hostname, ep))
111 return (-1);
112
113 if (j == 0) {
114 if (strcmp(nm, hostname) != 0) {
115 Free(hostname);
116 return (mddserror(ep,
117 MDE_DS_NOTNODENAME, sp->setno, nm,
118 NULL, sp->setname));
119 }
120 nodename = nm;
121 } else {
122 if (strcmp(nodename, hostname) != 0) {
123 Free(hostname);
124 return (mddserror(ep,
125 MDE_DS_ALIASNOMATCH, sp->setno, nm,
126 nodename, sp->setname));
127 }
128 }
129 Free(hostname);
130 }
131 }
132 return (0);
133 }
134
135 /*
136 * Exported Entry Points
137 */
138
139 int
meta_set_addmeds(mdsetname_t * sp,int node_c,char ** node_v,md_error_t * ep)140 meta_set_addmeds(
141 mdsetname_t *sp,
142 int node_c,
143 char **node_v,
144 md_error_t *ep
145 )
146 {
147 md_set_desc *sd = NULL;
148 md_drive_desc *dd = NULL;
149 mddb_med_parm_t mp;
150 mddb_med_upd_parm_t mup;
151 md_h_arr_t t;
152 md_h_arr_t rb_t;
153 med_rec_t medr;
154 med_rec_t rb_medr;
155 char *cp;
156 char **n_l = NULL;
157 int n_c = 0;
158 int i, j;
159 sigset_t oldsigs;
160 md_setkey_t *cl_sk;
161 int rb_level = 0;
162 md_error_t xep = mdnullerror;
163 int rval = 0;
164 int max_meds;
165 md_mnnode_desc *nd;
166 int suspend1_flag = 0;
167 int lock_flag = 0;
168
169 /* Initialize */
170 (void) memset(&t, '\0', sizeof (t));
171 t.n_cnt = node_c;
172 mdclrerror(ep);
173
174 if ((sd = metaget_setdesc(sp, ep)) == NULL)
175 return (-1);
176
177 /* Make sure we own the set */
178 if (meta_check_ownership(sp, ep) != 0)
179 return (-1);
180
181 if ((max_meds = get_max_meds(ep)) == 0)
182 return (-1);
183
184 /*
185 * The mediator information (which is part of the set record) is
186 * stored in the local mddbs of each node in the diskset.
187 * Each node's rpc.metad daemon reads in the set
188 * records from that node's local mddb and caches them
189 * internally. Any process needing diskset information contacts its
190 * local rpc.metad to get this information. Since each node in the
191 * diskset is independently reading the set information from its local
192 * mddb, the set records in the local mddbs must stay
193 * in-sync, so that all nodes have a consistent view of the diskset.
194 *
195 * For a multinode diskset, explicitly verify that all nodes in the
196 * diskset are ALIVE (i.e. are in the API membership list). Otherwise,
197 * fail this operation since all nodes must be ALIVE in order to add
198 * the mediator information to the set record in their local mddb.
199 * If a panic of this node leaves the local mddbs set records
200 * out-of-sync, the reconfig cycle will fix the local mddbs and
201 * force them back into synchronization.
202 */
203 if (MD_MNSET_DESC(sd)) {
204 nd = sd->sd_nodelist;
205 while (nd) {
206 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
207 (void) mddserror(ep, MDE_DS_NOTINMEMBERLIST,
208 sp->setno,
209 nd->nd_nodename, NULL, sp->setname);
210 return (-1);
211 }
212 nd = nd->nd_next;
213 }
214 }
215
216 /* Parse the command line into a the md_h_arr_t structure */
217 for (i = 0; i < t.n_cnt; i++) {
218 cp = strtok(node_v[i], ",");
219 j = 0;
220 while (cp) {
221 if (strlen(cp) > (size_t)MD_MAX_NODENAME)
222 return (mddserror(ep, MDE_DS_NODENAMETOOLONG,
223 sp->setno, cp, NULL, sp->setname));
224 if (j >= MAX_HOST_ADDRS)
225 return (mddserror(ep, MDE_DS_TOOMANYALIAS,
226 sp->setno, cp, NULL, sp->setname));
227
228 (void) strcpy(t.n_lst[i].a_nm[j], cp);
229
230 j++;
231
232 cp = strtok(NULL, ",");
233 }
234 t.n_lst[i].a_cnt = j;
235 }
236
237 /* Make a list of nodes to check */
238 for (i = 0; i < t.n_cnt; i++)
239 for (j = 0; j < t.n_lst[i].a_cnt; j++)
240 n_c = add_lst(&n_l, t.n_lst[i].a_nm[j]);
241
242 /* Make sure that there are no redundant nodes */
243 rval = nodesuniq(sp, n_c, n_l, ep);
244
245 (void) del_lst(&n_l);
246
247 if (rval != 0)
248 return (rval);
249
250 /*
251 * Lock the set on current set members.
252 * Set locking done much earlier for MN diskset than for traditional
253 * diskset since lock_set and SUSPEND are used to protect against
254 * other metaset commands running on the other nodes.
255 */
256 if (MD_MNSET_DESC(sd)) {
257 /* Make sure we are blocking all signals */
258 if (procsigs(TRUE, &oldsigs, &xep) < 0)
259 mdclrerror(&xep);
260 nd = sd->sd_nodelist;
261 /* All nodes are guaranteed to be ALIVE */
262 while (nd) {
263 if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
264 rval = -1;
265 goto out;
266 }
267 lock_flag = 1;
268 nd = nd->nd_next;
269 }
270 /*
271 * Lock out other meta* commands by suspending
272 * class 1 messages across the diskset.
273 */
274 nd = sd->sd_nodelist;
275 /* All nodes are guaranteed to be ALIVE */
276 while (nd) {
277 if (clnt_mdcommdctl(nd->nd_nodename,
278 COMMDCTL_SUSPEND, sp, MD_MSG_CLASS1,
279 MD_MSCF_NO_FLAGS, ep)) {
280 rval = -1;
281 goto out;
282 }
283 suspend1_flag = 1;
284 nd = nd->nd_next;
285 }
286 }
287
288 if (validate_med_nodes(sp, &t, ep)) {
289 rval = -1;
290 goto out;
291 }
292
293 /* Check existing mediators against new, if any */
294 if (sd->sd_med.n_cnt > 0) {
295 for (i = 0; i < max_meds; i++)
296 if (sd->sd_med.n_lst[i].a_cnt > 0)
297 n_c = add_lst(&n_l,
298 sd->sd_med.n_lst[i].a_nm[0]);
299
300 for (i = 0; i < t.n_cnt; i++) {
301 if (strinlst(t.n_lst[i].a_nm[0], n_c, n_l)) {
302 (void) del_lst(&n_l);
303 (void) mddserror(ep, MDE_DS_ISMED, sp->setno,
304 t.n_lst[i].a_nm[0], NULL,
305 sp->setname);
306 rval = -1;
307 goto out;
308 }
309 }
310 (void) del_lst(&n_l);
311 }
312
313 if ((t.n_cnt + sd->sd_med.n_cnt) > max_meds) {
314 (void) mderror(ep, MDE_TOOMANYMED, NULL);
315 rval = -1;
316 goto out;
317 }
318
319 /* Copy the current mediator list for rollback */
320 rb_t = sd->sd_med; /* structure assignment */
321
322 /* Setup the mediator record roll-back structure */
323 (void) memset(&rb_medr, '\0', sizeof (med_rec_t));
324 rb_medr.med_rec_mag = MED_REC_MAGIC;
325 rb_medr.med_rec_rev = MED_REC_REV;
326 rb_medr.med_rec_fl = 0;
327 rb_medr.med_rec_sn = sp->setno;
328 (void) strcpy(rb_medr.med_rec_snm, sp->setname);
329 if (MD_MNSET_DESC(sd)) {
330 /*
331 * For a MN diskset the mediator is not given a list of
332 * hosts in the set. Instead a generic name (multiowner) is
333 * given to the mediator which will allow any node to access
334 * the mediator data as long as it provides the correct
335 * setname and set number. In a MN diskset, the mediator
336 * data is only used when a first node joins the diskset
337 * and becomes the master of the MN diskset.
338 *
339 * The traditional diskset code keeps the host list in
340 * the mediator record up to date with respect to the host
341 * list in the traditional diskset. This keeps an unauthorized
342 * node in the traditional diskset from accessing the data
343 * in the mediator record and being able to 'take' the
344 * diskset.
345 *
346 * This additional check is needed in the traditional diskset
347 * since a panic during the metaset command can leave
348 * the diskset with some nodes thinking that an
349 * action has occurred and other nodes thinking the opposite.
350 * A node may have really been removed from a diskset, but
351 * that node doesn't realize this so this node must be
352 * blocked from using the mediator data when attempting
353 * to 'take' the diskset.
354 * (Traditional diskset code has each node's rpc.metad
355 * cleaning up from an inconsistent state without any
356 * knowledge from the other nodes in the diskset).
357 *
358 * In the MN diskset, the reconfig steps force a consistent
359 * state across all nodes in the diskset, so no node
360 * needs to be blocked from accessing the mediator data.
361 * This allow the MN diskset to use a common 'nodename'
362 * in the mediator record. This allows the mediator
363 * daemon to remain unchanged even though a large number of
364 * nodes are supported by the MN diskset.
365 */
366 (void) strlcpy(rb_medr.med_rec_nodes[0], MED_MN_CALLER,
367 MD_MAX_NODENAME_PLUS_1);
368 } else {
369 for (i = 0; i < MD_MAXSIDES; i++)
370 (void) strcpy(rb_medr.med_rec_nodes[i],
371 sd->sd_nodes[i]);
372 }
373 rb_medr.med_rec_meds = sd->sd_med; /* structure assigment */
374 (void) memset(&rb_medr.med_rec_data, '\0', sizeof (med_data_t));
375 rb_medr.med_rec_foff = 0;
376 crcgen(&rb_medr, &rb_medr.med_rec_cks, sizeof (med_rec_t), NULL);
377
378 /* Merge new mediators into the set record */
379 for (i = 0; i < t.n_cnt; i++) {
380 for (j = 0; j < max_meds; j++) {
381 if (sd->sd_med.n_lst[j].a_cnt > 0)
382 continue;
383 sd->sd_med.n_lst[j] = t.n_lst[i];
384 SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_ADD, SVM_TAG_MEDIATOR,
385 sp->setno, j);
386 sd->sd_med.n_cnt++;
387 break;
388 }
389 }
390
391 /*
392 * Setup the kernel mediator list, which also validates that the
393 * hosts have valid IP addresses
394 */
395 (void) memset(&mp, '\0', sizeof (mddb_med_parm_t));
396 mp.med_setno = sp->setno;
397
398 /* Copy the hostnames */
399 if (meta_h2hi(&sd->sd_med, &mp.med, ep)) {
400 rval = -1;
401 goto out;
402 }
403
404 /* Resolve the IP addresses for the host list */
405 if (meta_med_hnm2ip(&mp.med, ep)) {
406 rval = -1;
407 goto out;
408 }
409
410 /* Bring the mediator record up to date with the set record */
411 medr = rb_medr; /* structure assignment */
412 medr.med_rec_meds = sd->sd_med; /* structure assigment */
413 crcgen(&medr, &medr.med_rec_cks, sizeof (med_rec_t), NULL);
414
415 /* END CHECK CODE */
416
417 /* Lock the set on current set members */
418 if (!(MD_MNSET_DESC(sd))) {
419 /* all signals already blocked for MN disket */
420 md_rb_sig_handling_on();
421 for (i = 0; i < MD_MAXSIDES; i++) {
422 /* Skip empty slots */
423 if (sd->sd_nodes[i][0] == '\0')
424 continue;
425
426 if (clnt_lock_set(sd->sd_nodes[i], sp, ep)) {
427 rval = -1;
428 goto out;
429 }
430 lock_flag = 1;
431 }
432 }
433
434 RB_TEST(1, "meta_set_addmeds", ep)
435
436 RB_PREEMPT;
437 rb_level = 1; /* level 1 */
438
439 RB_TEST(2, "meta_set_addmeds", ep)
440
441 /*
442 * Add the new mediator information to all hosts in the set.
443 * For MN diskset, each node sends mediator list to its kernel.
444 */
445 if (MD_MNSET_DESC(sd)) {
446 nd = sd->sd_nodelist;
447 while (nd) {
448 /* All nodes are guaranteed to be ALIVE */
449 if (clnt_updmeds(nd->nd_nodename, sp, &sd->sd_med, ep))
450 goto rollback;
451 nd = nd->nd_next;
452 }
453 } else {
454 for (i = 0; i < MD_MAXSIDES; i++) {
455 /* Skip empty slots */
456 if (sd->sd_nodes[i][0] == '\0')
457 continue;
458
459 if (clnt_updmeds(sd->sd_nodes[i], sp, &sd->sd_med, ep))
460 goto rollback;
461 }
462 }
463
464 RB_TEST(3, "meta_set_addmeds", ep)
465
466 RB_PREEMPT;
467 rb_level = 2; /* level 2 */
468
469 RB_TEST(4, "meta_set_addmeds", ep)
470
471 if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST),
472 ep)) == NULL) {
473 if (! mdisok(ep))
474 goto rollback;
475 }
476
477 RB_TEST(5, "meta_set_addmeds", ep)
478
479 RB_PREEMPT;
480 rb_level = 3; /* level 3 */
481
482 RB_TEST(6, "meta_set_addmeds", ep)
483
484 /* Inform the mediator hosts of the new information */
485 for (i = 0; i < max_meds; i++) {
486 if (sd->sd_med.n_lst[i].a_cnt == 0)
487 continue;
488
489 /* medr contains new mediator node list */
490 if (clnt_med_upd_rec(&sd->sd_med.n_lst[i], sp, &medr, ep))
491 goto rollback;
492 }
493
494 RB_TEST(7, "meta_set_addmeds", ep)
495
496 RB_PREEMPT;
497 rb_level = 4; /* level 4 */
498
499 RB_TEST(8, "meta_set_addmeds", ep)
500
501 /* In MN diskset, mediator list updated in clnt_updmeds call */
502 if (dd != NULL) {
503 if (!(MD_MNSET_DESC(sd))) {
504 if (metaioctl(MD_MED_SET_LST, &mp, &mp.med_mde,
505 NULL) != 0) {
506 (void) mdstealerror(ep, &mp.med_mde);
507 goto rollback;
508 }
509 }
510
511 /*
512 * If only 50% mddbs available, mediator will be
513 * golden by this ioctl on a traditional diskset.
514 *
515 * On a MN disket, this only happens if the mediator
516 * add operation is executed on the master node.
517 * If a slave node is adding the mediator, the mediator
518 * won't be marked golden until the next mddb change.
519 */
520 (void) memset(&mup, '\0', sizeof (mddb_med_upd_parm_t));
521 mup.med_setno = sp->setno;
522 if (metaioctl(MD_MED_UPD_MED, &mup, &mup.med_mde, NULL) != 0)
523 mdclrerror(&mup.med_mde);
524 }
525
526 out:
527 if (suspend1_flag) {
528 /*
529 * Unlock diskset by resuming messages across the diskset.
530 * Just resume all classes so that resume is the same whether
531 * just one class was locked or all classes were locked.
532 */
533 nd = sd->sd_nodelist;
534 /* All nodes are guaranteed to be ALIVE */
535 while (nd) {
536 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
537 sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) {
538 if (rval == 0)
539 (void) mdstealerror(ep, &xep);
540 rval = -1;
541 mde_perror(ep, dgettext(TEXT_DOMAIN,
542 "Unable to resume rpc.mdcommd.\n"));
543 }
544 nd = nd->nd_next;
545 }
546 meta_ping_mnset(sp->setno);
547 }
548 if (lock_flag) {
549 cl_sk = cl_get_setkey(sp->setno, sp->setname);
550 if (MD_MNSET_DESC(sd)) {
551 nd = sd->sd_nodelist;
552 while (nd) {
553 /* All nodes are guaranteed to be ALIVE */
554 if (clnt_unlock_set(nd->nd_nodename,
555 cl_sk, &xep)) {
556 if (rval == 0)
557 (void) mdstealerror(ep, &xep);
558 rval = -1;
559 }
560 nd = nd->nd_next;
561 }
562 } else {
563 for (i = 0; i < MD_MAXSIDES; i++) {
564 /* Skip empty slots */
565 if (sd->sd_nodes[i][0] == '\0')
566 continue;
567
568 if (clnt_unlock_set(sd->sd_nodes[i],
569 cl_sk, &xep)) {
570 if (rval == 0)
571 (void) mdstealerror(ep, &xep);
572 rval = -1;
573 }
574 }
575 }
576 cl_set_setkey(NULL);
577 }
578
579 metafreedrivedesc(&dd);
580
581 if (MD_MNSET_DESC(sd)) {
582 /* release signals back to what they were on entry */
583 if (procsigs(FALSE, &oldsigs, &xep) < 0)
584 mdclrerror(&xep);
585 } else {
586 md_rb_sig_handling_off(md_got_sig(), md_which_sig());
587 }
588
589 return (rval);
590
591 rollback:
592 /* all signals already blocked for MN disket */
593 if (!(MD_MNSET_DESC(sd))) {
594 if (procsigs(TRUE, &oldsigs, &xep) < 0)
595 mdclrerror(&xep);
596 }
597
598 rval = -1;
599
600 /*
601 * level 4
602 * In MN diskset, mediator list updated in clnt_updmeds call
603 */
604 if (rb_level > 3 && (dd != NULL) && (!(MD_MNSET_DESC(sd)))) {
605 (void) memset(&mp, '\0', sizeof (mddb_med_parm_t));
606 mp.med_setno = sp->setno;
607 (void) meta_h2hi(&rb_t, &mp.med, &xep);
608 mdclrerror(&xep);
609 (void) meta_med_hnm2ip(&mp.med, &xep);
610 mdclrerror(&xep);
611 (void) metaioctl(MD_MED_SET_LST, &mp, &mp.med_mde, NULL);
612 }
613
614 /* level 3 */
615 if (rb_level > 2) {
616 for (i = 0; i < max_meds; i++) {
617 if (sd->sd_med.n_lst[i].a_cnt == 0)
618 continue;
619
620 /*
621 * rb_medr contains the rollback mediator node list.
622 * Send the rollback mediator information to the
623 * new mediator node list. If a node had this RPC
624 * called, but its node is not in the mediator node
625 * list, rpc.metamedd will delete the mediator
626 * record on that node.
627 */
628 if (clnt_med_upd_rec(&sd->sd_med.n_lst[i], sp,
629 &rb_medr, &xep))
630 mdclrerror(&xep);
631 }
632 }
633
634 /* level 2 */
635 if (rb_level > 1) {
636 metafreedrivedesc(&dd);
637 }
638
639 /* level 1 */
640 if (rb_level > 0) {
641 /* Delete mediator information from all hosts in the set */
642 if (MD_MNSET_DESC(sd)) {
643 nd = sd->sd_nodelist;
644 while (nd) {
645 /* All nodes are guaranteed to be ALIVE */
646 if (clnt_updmeds(nd->nd_nodename, sp, &rb_t,
647 &xep))
648 mdclrerror(&xep);
649 nd = nd->nd_next;
650 }
651 } else {
652 for (i = 0; i < MD_MAXSIDES; i++) {
653 /* Skip empty slots */
654 if (sd->sd_nodes[i][0] == '\0')
655 continue;
656
657 if (clnt_updmeds(sd->sd_nodes[i], sp, &rb_t,
658 &xep))
659 mdclrerror(&xep);
660 }
661 }
662 }
663
664 /* level 0 */
665 if (suspend1_flag) {
666 /*
667 * Unlock diskset by resuming messages across the diskset.
668 * Just resume all classes so that resume is the same whether
669 * just one class was locked or all classes were locked.
670 */
671 nd = sd->sd_nodelist;
672 /* All nodes are guaranteed to be ALIVE */
673 while (nd) {
674 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
675 sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) {
676 mdclrerror(&xep);
677 mde_perror(ep, dgettext(TEXT_DOMAIN,
678 "Unable to resume rpc.mdcommd.\n"));
679 }
680 nd = nd->nd_next;
681 }
682 meta_ping_mnset(sp->setno);
683 }
684 if (lock_flag) {
685 cl_sk = cl_get_setkey(sp->setno, sp->setname);
686 if (MD_MNSET_DESC(sd)) {
687 nd = sd->sd_nodelist;
688 while (nd) {
689 /* All nodes are guaranteed to be ALIVE */
690 if (clnt_unlock_set(nd->nd_nodename,
691 cl_sk, &xep)) {
692 mdclrerror(&xep);
693 }
694 nd = nd->nd_next;
695 }
696 } else {
697 for (i = 0; i < MD_MAXSIDES; i++) {
698 /* Skip empty slots */
699 if (sd->sd_nodes[i][0] == '\0')
700 continue;
701
702 if (clnt_unlock_set(sd->sd_nodes[i],
703 cl_sk, &xep)) {
704 mdclrerror(&xep);
705 }
706 }
707 }
708 cl_set_setkey(NULL);
709 }
710
711 /* release signals back to what they were on entry */
712 if (procsigs(FALSE, &oldsigs, &xep) < 0)
713 mdclrerror(&xep);
714
715 if (!(MD_MNSET_DESC(sd))) {
716 md_rb_sig_handling_off(md_got_sig(), md_which_sig());
717 }
718
719 return (rval);
720 }
721
722 int
meta_set_deletemeds(mdsetname_t * sp,int node_c,char ** node_v,int forceflg,md_error_t * ep)723 meta_set_deletemeds(
724 mdsetname_t *sp,
725 int node_c,
726 char **node_v,
727 int forceflg,
728 md_error_t *ep
729 )
730 {
731 md_set_desc *sd = NULL;
732 md_drive_desc *dd = NULL;
733 mddb_med_parm_t mp;
734 md_h_arr_t rb_t;
735 med_rec_t medr;
736 med_rec_t rb_medr;
737 int i, j;
738 char **n_l = NULL;
739 int n_c = 0;
740 sigset_t oldsigs;
741 md_setkey_t *cl_sk;
742 int rb_level = 0;
743 md_error_t xep = mdnullerror;
744 int rval = 0;
745 int max_meds;
746 md_mnnode_desc *nd;
747 int suspend1_flag = 0;
748 int lock_flag = 0;
749
750 mdclrerror(ep);
751
752 if ((sd = metaget_setdesc(sp, ep)) == NULL)
753 return (-1);
754
755 /* Make sure we own the set */
756 if (meta_check_ownership(sp, ep) != 0)
757 return (-1);
758
759 for (i = 0; i < node_c; i++)
760 if (strchr(node_v[i], ',') != NULL)
761 return (mderror(ep, MDE_ONLYNODENAME, node_v[i]));
762
763 if (nodesuniq(sp, node_c, node_v, ep))
764 return (-1);
765
766 if ((max_meds = get_max_meds(ep)) == 0)
767 return (-1);
768
769 /*
770 * The mediator information (which is part of the set record) is
771 * stored in the local mddbs of each node in the diskset.
772 * Each node's rpc.metad daemon reads in the set
773 * records from that node's local mddb and caches them
774 * internally. Any process needing diskset information contacts its
775 * local rpc.metad to get this information. Since each node in the
776 * diskset is independently reading the set information from its local
777 * mddb, the set records in the local mddbs must stay
778 * in-sync, so that all nodes have a consistent view of the diskset.
779 *
780 * For a multinode diskset, explicitly verify that all nodes in the
781 * diskset are ALIVE (i.e. are in the API membership list). Otherwise,
782 * fail this operation since all nodes must be ALIVE in order to delete
783 * the mediator information from the set record in their local mddb.
784 * If a panic of this node leaves the local mddbs set records
785 * out-of-sync, the reconfig cycle will fix the local mddbs and
786 * force them back into synchronization.
787 */
788 if (MD_MNSET_DESC(sd)) {
789 nd = sd->sd_nodelist;
790 while (nd) {
791 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
792 (void) mddserror(ep, MDE_DS_NOTINMEMBERLIST,
793 sp->setno,
794 nd->nd_nodename, NULL, sp->setname);
795 return (-1);
796 }
797 nd = nd->nd_next;
798 }
799 }
800
801 if (sd->sd_med.n_cnt == 0)
802 return (mderror(ep, MDE_NOMED, NULL));
803
804 /* Make a list of nodes to check */
805 for (i = 0; i < max_meds; i++)
806 if (sd->sd_med.n_lst[i].a_cnt > 0)
807 n_c = add_lst(&n_l, sd->sd_med.n_lst[i].a_nm[0]);
808
809 for (i = 0; i < node_c; i++) {
810 if (! strinlst(node_v[i], n_c, n_l)) {
811 (void) del_lst(&n_l);
812 return (mddserror(ep, MDE_DS_ISNOTMED, sp->setno,
813 node_v[i], NULL, sp->setname));
814 }
815 }
816
817 (void) del_lst(&n_l);
818
819 /* Save a copy of the current mediator information */
820 rb_t = sd->sd_med; /* structure assignment */
821
822 /* Setup the mediator record for rollback */
823 (void) memset(&rb_medr, '\0', sizeof (med_rec_t));
824 rb_medr.med_rec_mag = MED_REC_MAGIC;
825 rb_medr.med_rec_rev = MED_REC_REV;
826 rb_medr.med_rec_fl = 0;
827 rb_medr.med_rec_sn = sp->setno;
828 (void) strcpy(rb_medr.med_rec_snm, sp->setname);
829 if (MD_MNSET_DESC(sd)) {
830 /*
831 * In MN diskset, use a generic nodename, multiowner, in the
832 * mediator record which allows any node to access mediator
833 * information. MN diskset reconfig cycle forces consistent
834 * view of set/node/drive/mediator information across all nodes
835 * in the MN diskset. This allows the relaxation of
836 * node name checking in rpc.metamedd for MN disksets.
837 *
838 * In the traditional diskset, only a node that is in the
839 * mediator record's diskset nodelist can access mediator
840 * data.
841 */
842 (void) strlcpy(rb_medr.med_rec_nodes[0], MED_MN_CALLER,
843 MD_MAX_NODENAME_PLUS_1);
844 } else {
845 for (i = 0; i < MD_MAXSIDES; i++)
846 (void) strcpy(rb_medr.med_rec_nodes[i],
847 sd->sd_nodes[i]);
848 }
849 rb_medr.med_rec_meds = sd->sd_med; /* structure assignment */
850 (void) memset(&rb_medr.med_rec_data, '\0', sizeof (med_data_t));
851 rb_medr.med_rec_foff = 0;
852 crcgen(&rb_medr, &rb_medr.med_rec_cks, sizeof (med_rec_t), NULL);
853
854 /* Delete the mediators requested from the set */
855 for (i = 0; i < node_c; i++) {
856 for (j = 0; j < max_meds; j++) {
857 if (sd->sd_med.n_lst[j].a_cnt == 0)
858 continue;
859 if (strcmp(node_v[i],
860 sd->sd_med.n_lst[j].a_nm[0]) != 0)
861 continue;
862 SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_REMOVE,
863 SVM_TAG_MEDIATOR, sp->setno, j);
864 (void) memset(&sd->sd_med.n_lst[j], '\0',
865 sizeof (md_h_t));
866 sd->sd_med.n_cnt--;
867 break;
868 }
869 }
870
871 medr = rb_medr; /* structure assignment */
872 medr.med_rec_meds = sd->sd_med; /* structure assignment */
873 crcgen(&medr, &medr.med_rec_cks, sizeof (med_rec_t), NULL);
874
875 /* END CHECK CODE */
876
877 /* Lock the set on current set members */
878 if (MD_MNSET_DESC(sd)) {
879 /* Make sure we are blocking all signals */
880 if (procsigs(TRUE, &oldsigs, &xep) < 0)
881 mdclrerror(&xep);
882 /*
883 * Lock the set on current set members.
884 * lock_set and SUSPEND are used to protect against
885 * other metaset commands running on the other nodes.
886 */
887 nd = sd->sd_nodelist;
888 while (nd) {
889 /* All nodes are guaranteed to be ALIVE */
890 if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
891 if (forceflg && strcmp(mynode(),
892 nd->nd_nodename) != 0) {
893 mdclrerror(ep);
894 nd = nd->nd_next;
895 continue;
896 }
897 rval = -1;
898 goto out;
899 }
900 lock_flag = 1;
901 nd = nd->nd_next;
902 }
903 /*
904 * Lock out other meta* commands by suspending
905 * class 1 messages across the diskset.
906 */
907 nd = sd->sd_nodelist;
908 /* All nodes are guaranteed to be ALIVE */
909 while (nd) {
910 if (clnt_mdcommdctl(nd->nd_nodename,
911 COMMDCTL_SUSPEND, sp, MD_MSG_CLASS1,
912 MD_MSCF_NO_FLAGS, ep)) {
913 rval = -1;
914 goto out;
915 }
916 suspend1_flag = 1;
917 nd = nd->nd_next;
918 }
919 } else {
920 md_rb_sig_handling_on();
921 for (i = 0; i < MD_MAXSIDES; i++) {
922 /* Skip empty slots */
923 if (sd->sd_nodes[i][0] == '\0')
924 continue;
925
926 if (clnt_lock_set(sd->sd_nodes[i], sp, ep)) {
927 if (forceflg &&
928 strcmp(mynode(), sd->sd_nodes[i]) != 0) {
929 mdclrerror(ep);
930 continue;
931 }
932 rval = -1;
933 goto out;
934 }
935 lock_flag = 1;
936 }
937 }
938
939 RB_TEST(1, "meta_set_deletemeds", ep)
940
941 RB_PREEMPT;
942 rb_level = 1; /* level 1 */
943
944 RB_TEST(2, "meta_set_deletemeds", ep)
945
946 /* Update the mediator information on all hosts in the set */
947 if (MD_MNSET_DESC(sd)) {
948 nd = sd->sd_nodelist;
949 while (nd) {
950 /* All nodes are guaranteed to be ALIVE */
951 if (clnt_updmeds(nd->nd_nodename, sp, &sd->sd_med,
952 ep)) {
953 if (forceflg && strcmp(mynode(),
954 nd->nd_nodename) != 0) {
955 mdclrerror(ep);
956 continue;
957 }
958 goto rollback;
959 }
960 nd = nd->nd_next;
961 }
962 } else {
963 for (i = 0; i < MD_MAXSIDES; i++) {
964 /* Skip empty slots */
965 if (sd->sd_nodes[i][0] == '\0')
966 continue;
967
968 if (clnt_updmeds(sd->sd_nodes[i], sp, &sd->sd_med,
969 ep)) {
970 if (forceflg && strcmp(mynode(),
971 sd->sd_nodes[i]) != 0) {
972 mdclrerror(ep);
973 continue;
974 }
975 goto rollback;
976 }
977 }
978 }
979
980 RB_TEST(3, "meta_set_deletemeds", ep)
981
982 RB_PREEMPT;
983 rb_level = 2; /* level 2 */
984
985 RB_TEST(5, "meta_set_deletemeds", ep)
986
987 if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST),
988 ep)) == NULL) {
989 if (! mdisok(ep))
990 goto rollback;
991 }
992
993 RB_TEST(5, "meta_set_deletemeds", ep)
994
995 RB_PREEMPT;
996 rb_level = 3; /* level 3 */
997
998 RB_TEST(6, "meta_set_deletemeds", ep)
999
1000 if (dd != NULL) {
1001 /*
1002 * Set up the parameters to the call to update the
1003 * kernel mediator list
1004 */
1005 (void) memset(&mp, '\0', sizeof (mddb_med_parm_t));
1006 mp.med_setno = sp->setno;
1007 if (meta_h2hi(&sd->sd_med, &mp.med, ep))
1008 goto rollback;
1009
1010 /* Resolve the IP addresses for the host list */
1011 if (meta_med_hnm2ip(&mp.med, ep))
1012 goto rollback;
1013
1014 if (metaioctl(MD_MED_SET_LST, &mp, &mp.med_mde, NULL) != 0) {
1015 (void) mdstealerror(ep, &mp.med_mde);
1016 goto rollback;
1017 }
1018 }
1019
1020 RB_TEST(7, "meta_set_deletemeds", ep)
1021
1022 RB_PREEMPT;
1023 rb_level = 4; /* level 4 */
1024
1025 RB_TEST(8, "meta_set_deletemeds", ep)
1026
1027 /* Inform the mediator hosts of the new status */
1028 for (i = 0; i < max_meds; i++) {
1029 if (rb_t.n_lst[i].a_cnt == 0)
1030 continue;
1031
1032 /*
1033 * medr contains the new mediator node list.
1034 * Send the new mediator information to the
1035 * new mediator node list. If a node had this RPC
1036 * called, but its node is no longer in the new mediator
1037 * node list, rpc.metamedd will delete the mediator
1038 * record on that node.
1039 */
1040 if (clnt_med_upd_rec(&rb_t.n_lst[i], sp, &medr, ep)) {
1041 if ((forceflg && mdanyrpcerror(ep)) ||
1042 mdisrpcerror(ep, RPC_PROGNOTREGISTERED)) {
1043 mdclrerror(ep);
1044 continue;
1045 }
1046 goto rollback;
1047 }
1048 }
1049
1050 out:
1051 if (dd)
1052 metafreedrivedesc(&dd);
1053
1054 if (suspend1_flag) {
1055 /*
1056 * Unlock diskset by resuming messages across the diskset.
1057 * Just resume all classes so that resume is the same whether
1058 * just one class was locked or all classes were locked.
1059 */
1060 nd = sd->sd_nodelist;
1061 /* All nodes are guaranteed to be ALIVE */
1062 while (nd) {
1063 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
1064 sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) {
1065 if (rval == 0)
1066 (void) mdstealerror(ep, &xep);
1067 rval = -1;
1068 mde_perror(ep, dgettext(TEXT_DOMAIN,
1069 "Unable to resume rpc.mdcommd.\n"));
1070 }
1071 nd = nd->nd_next;
1072 }
1073 meta_ping_mnset(sp->setno);
1074 }
1075
1076 cl_sk = cl_get_setkey(sp->setno, sp->setname);
1077 if (lock_flag) {
1078 if (MD_MNSET_DESC(sd)) {
1079 nd = sd->sd_nodelist;
1080 while (nd) {
1081 /* All nodes are guaranteed to be ALIVE */
1082 if (clnt_unlock_set(nd->nd_nodename,
1083 cl_sk, &xep)) {
1084 if (forceflg &&
1085 strcmp(mynode(),
1086 nd->nd_nodename) != 0) {
1087 mdclrerror(ep);
1088 continue;
1089 }
1090 if (rval == 0)
1091 (void) mdstealerror(ep, &xep);
1092 rval = -1;
1093 }
1094 nd = nd->nd_next;
1095 }
1096 } else {
1097 for (i = 0; i < MD_MAXSIDES; i++) {
1098 /* Skip empty slots */
1099 if (sd->sd_nodes[i][0] == '\0')
1100 continue;
1101
1102 if (clnt_unlock_set(sd->sd_nodes[i],
1103 cl_sk, &xep)) {
1104 if (forceflg &&
1105 strcmp(mynode(),
1106 sd->sd_nodes[i]) != 0) {
1107 mdclrerror(ep);
1108 continue;
1109 }
1110 if (rval == 0)
1111 (void) mdstealerror(ep, &xep);
1112 rval = -1;
1113 }
1114 }
1115 }
1116 }
1117 cl_set_setkey(NULL);
1118
1119 if (MD_MNSET_DESC(sd)) {
1120 /* release signals back to what they were on entry */
1121 if (procsigs(FALSE, &oldsigs, &xep) < 0)
1122 mdclrerror(&xep);
1123 } else {
1124 md_rb_sig_handling_off(md_got_sig(), md_which_sig());
1125 }
1126
1127 return (rval);
1128
1129 rollback:
1130 /* all signals already blocked for MN disket */
1131 if (!(MD_MNSET_DESC(sd))) {
1132 if (procsigs(TRUE, &oldsigs, &xep) < 0)
1133 mdclrerror(&xep);
1134 }
1135
1136 rval = -1;
1137
1138 (void) del_lst(&n_l);
1139
1140 /* level 4 */
1141 if (rb_level > 4) {
1142 for (i = 0; i < max_meds; i++) {
1143 if (rb_t.n_lst[i].a_cnt == 0)
1144 continue;
1145
1146 /*
1147 * rb_medr contains the rollback mediator node list.
1148 * Send the rollback mediator information to the
1149 * new mediator node list. This will recreate the
1150 * mediator record on all nodes where the mediator
1151 * record had been removed.
1152 */
1153 if (clnt_med_upd_rec(&rb_t.n_lst[i], sp, &rb_medr,
1154 &xep))
1155 mdclrerror(&xep);
1156 }
1157 }
1158
1159 /* level 3 */
1160 if (rb_level > 2 && dd != NULL) {
1161 (void) memset(&mp, '\0', sizeof (mddb_med_parm_t));
1162 mp.med_setno = sp->setno;
1163 (void) meta_h2hi(&rb_t, &mp.med, &xep);
1164 mdclrerror(&xep);
1165 (void) meta_med_hnm2ip(&mp.med, &xep);
1166 mdclrerror(&xep);
1167 (void) metaioctl(MD_MED_SET_LST, &mp, &mp.med_mde, NULL);
1168 }
1169
1170 /* level 2 */
1171 if (rb_level > 1) {
1172 metafreedrivedesc(&dd);
1173 }
1174
1175 /* level 1 */
1176 if (rb_level > 0) {
1177 /* Delete mediator information from all hosts in the set */
1178 if (MD_MNSET_DESC(sd)) {
1179 nd = sd->sd_nodelist;
1180 while (nd) {
1181 /* All nodes are guaranteed to be ALIVE */
1182 if (clnt_updmeds(nd->nd_nodename, sp, &rb_t,
1183 &xep))
1184 mdclrerror(&xep);
1185 nd = nd->nd_next;
1186 }
1187 } else {
1188 for (i = 0; i < MD_MAXSIDES; i++) {
1189 /* Skip empty slots */
1190 if (sd->sd_nodes[i][0] == '\0')
1191 continue;
1192
1193 if (clnt_updmeds(sd->sd_nodes[i], sp, &rb_t,
1194 &xep))
1195 mdclrerror(&xep);
1196 }
1197 }
1198 }
1199
1200 /* level 0 */
1201 cl_sk = cl_get_setkey(sp->setno, sp->setname);
1202 /* Unlock the set */
1203 /* Don't test lock flag since guaranteed to be set if in rollback */
1204 if (MD_MNSET_DESC(sd)) {
1205 /*
1206 * Unlock diskset by resuming messages across the diskset.
1207 * Just resume all classes so that resume is the same whether
1208 * just one class was locked or all classes were locked.
1209 */
1210 if (suspend1_flag) {
1211 /* All nodes are guaranteed to be ALIVE */
1212 nd = sd->sd_nodelist;
1213 while (nd) {
1214 if (clnt_mdcommdctl(nd->nd_nodename,
1215 COMMDCTL_RESUME, sp, MD_MSG_CLASS0,
1216 MD_MSCF_NO_FLAGS, &xep)) {
1217 mde_perror(&xep, dgettext(TEXT_DOMAIN,
1218 "Unable to resume rpc.mdcommd.\n"));
1219 mdclrerror(&xep);
1220 }
1221 nd = nd->nd_next;
1222 }
1223 meta_ping_mnset(sp->setno);
1224 }
1225 nd = sd->sd_nodelist;
1226 /* All nodes are guaranteed to be ALIVE */
1227 while (nd) {
1228 if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep))
1229 mdclrerror(&xep);
1230 nd = nd->nd_next;
1231 }
1232 } else {
1233 for (i = 0; i < MD_MAXSIDES; i++) {
1234 /* Skip empty slots */
1235 if (sd->sd_nodes[i][0] == '\0')
1236 continue;
1237
1238 if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, &xep))
1239 mdclrerror(&xep);
1240 }
1241 }
1242 cl_set_setkey(NULL);
1243
1244 /* release signals back to what they were on entry */
1245 if (procsigs(FALSE, &oldsigs, &xep) < 0)
1246 mdclrerror(&xep);
1247
1248 if (!(MD_MNSET_DESC(sd))) {
1249 md_rb_sig_handling_off(md_got_sig(), md_which_sig());
1250 }
1251
1252 return (rval);
1253 }
1254