1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
25 */
26
27 #include <stdlib.h>
28 #include <unistd.h>
29 #include <wait.h>
30 #include <sys/time.h>
31 #include <syslog.h>
32
33 #include <meta.h>
34 #include <sys/lvm/mdio.h>
35 #include <sys/lvm/md_mddb.h>
36 #include <sys/lvm/md_mirror.h>
37
38 #define MAX_N_ARGS 64
39 #define MAX_ARG_LEN 1024
40 #define MAX_SLEEPS 99
41 #define SLEEP_MOD 5
42
43 /* we reserve 1024 bytes for stdout and the same for stderr */
44 #define MAX_OUT 1024
45 #define MAX_ERR 1024
46 #define JUNK 128 /* used to flush stdout and stderr */
47
48
49 /*ARGSUSED*/
50 void
mdmn_do_cmd(md_mn_msg_t * msg,uint_t flags,md_mn_result_t * resp)51 mdmn_do_cmd(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
52 {
53
54 /*
55 * We are given one string containing all the arguments
56 * For execvp() we have to regenerate the arguments again
57 */
58 int arg; /* argument that is currently been built */
59 int index; /* runs through arg above */
60 int i; /* helper for for loop */
61 char *argv[MAX_N_ARGS]; /* argument array for execvp */
62 char *cp; /* runs through the given command line string */
63 char *command = NULL; /* the command we call locally */
64 int pout[2]; /* pipe for stdout */
65 int perr[2]; /* pipe for stderr */
66 pid_t pid; /* process id */
67
68 cp = msg->msg_event_data;
69 arg = 0;
70 index = 0;
71
72 /* init the args array alloc the first one and null out the rest */
73 argv[0] = Malloc(MAX_ARG_LEN);
74 for (i = 1; i < MAX_N_ARGS; i++) {
75 argv[i] = NULL;
76 }
77
78 resp->mmr_comm_state = MDMNE_ACK; /* Ok state */;
79
80 while (*cp != '\0') {
81 if (arg == MAX_N_ARGS) {
82 (void) fprintf(stderr, dgettext(TEXT_DOMAIN,
83 "PANIC: too many arguments specified\n"));
84 resp->mmr_comm_state = MDMNE_HANDLER_FAILED;
85 goto out;
86 }
87 if (index == MAX_ARG_LEN) {
88 (void) fprintf(stderr, dgettext(TEXT_DOMAIN,
89 "PANIC: argument too long\n"));
90 resp->mmr_comm_state = MDMNE_HANDLER_FAILED;
91 goto out;
92 }
93
94 if ((*cp != ' ') && (*cp != '\t')) {
95 /*
96 * No space or tab: copy char into current
97 * argv and advance both pointers
98 */
99
100 argv[arg][index] = *cp;
101 cp++; /* next char in command line */
102 index++; /* next char in argument */
103 } else {
104 /*
105 * space or tab: terminate current argv,
106 * advance arg, reset pointer into arg,
107 * advance pointer in command line
108 */
109 argv[arg][index] = '\0';
110 arg++; /* next argument */
111 argv[arg] = Malloc(MAX_ARG_LEN);
112 cp++; /* next char in command line */
113 index = 0; /* starts at char 0 */
114 }
115 }
116 /* terminate the last real argument */
117 argv[arg][index] = '\0';
118 /* the last argument is an NULL pointer */
119 argv[++arg] = NULL;
120 if (pipe(pout) < 0) {
121 (void) fprintf(stderr, dgettext(TEXT_DOMAIN,
122 "PANIC: pipe failed\n"));
123 resp->mmr_comm_state = MDMNE_HANDLER_FAILED;
124 goto out;
125 }
126 if (pipe(perr) < 0) {
127 (void) fprintf(stderr, dgettext(TEXT_DOMAIN,
128 "PANIC: pipe failed\n"));
129 (void) close(pout[0]);
130 (void) close(pout[1]);
131 resp->mmr_comm_state = MDMNE_HANDLER_FAILED;
132 goto out;
133 }
134 command = Strdup(argv[0]);
135 (void) strcat(argv[0], ".rpc_call");
136 pid = fork1();
137 if (pid == (pid_t)-1) {
138 (void) fprintf(stderr, dgettext(TEXT_DOMAIN,
139 "PANIC: fork failed\n"));
140 resp->mmr_comm_state = MDMNE_HANDLER_FAILED;
141 (void) close(pout[0]);
142 (void) close(pout[1]);
143 (void) close(perr[0]);
144 (void) close(perr[1]);
145 goto out;
146 } else if (pid == (pid_t)0) {
147 /* child */
148 (void) close(0);
149 /* close the reading channels of pout and perr */
150 (void) close(pout[0]);
151 (void) close(perr[0]);
152 /* redirect stdout */
153 if (dup2(pout[1], 1) < 0) {
154 (void) fprintf(stderr, dgettext(TEXT_DOMAIN,
155 "PANIC: dup2 failed\n"));
156 resp->mmr_comm_state = MDMNE_HANDLER_FAILED;
157 return;
158 }
159
160 /* redirect stderr */
161 if (dup2(perr[1], 2) < 0) {
162 (void) fprintf(stderr, dgettext(TEXT_DOMAIN,
163 "PANIC: dup2 failed\n"));
164 resp->mmr_comm_state = MDMNE_HANDLER_FAILED;
165 return;
166 }
167
168 (void) execvp(command, (char *const *)argv);
169 perror("execvp");
170 _exit(1);
171 } else {
172 /* parent process */
173 int stat_loc;
174 char *out, *err; /* for stdout and stderr of child */
175 int i; /* index into the aboves */
176 char junk[JUNK];
177 int out_done = 0;
178 int err_done = 0;
179 int out_read = 0;
180 int err_read = 0;
181 int maxfd;
182 fd_set rset;
183
184
185 /* close the writing channels of pout and perr */
186 (void) close(pout[1]);
187 (void) close(perr[1]);
188 resp->mmr_out = Malloc(MAX_OUT);
189 resp->mmr_err = Malloc(MAX_ERR);
190 resp->mmr_out_size = MAX_OUT;
191 resp->mmr_err_size = MAX_ERR;
192 out = resp->mmr_out;
193 err = resp->mmr_err;
194 FD_ZERO(&rset);
195 while ((out_done == 0) || (err_done == 0)) {
196 FD_SET(pout[0], &rset);
197 FD_SET(perr[0], &rset);
198 maxfd = max(pout[0], perr[0]) + 1;
199 (void) select(maxfd, &rset, NULL, NULL, NULL);
200
201 /*
202 * Did the child produce some output to stdout?
203 * If so, read it until we either reach the end of the
204 * output or until we read MAX_OUT bytes.
205 * Whatever comes first.
206 * In case we already read MAX_OUT bytes we simply
207 * read away the output into a junk buffer.
208 * Just to make the child happy
209 */
210 if (FD_ISSET(pout[0], &rset)) {
211 if (MAX_OUT - out_read - 1 > 0) {
212 i = read(pout[0], out,
213 MAX_OUT - out_read);
214 out_read += i;
215 out += i;
216 } else {
217 /* buffer full, empty stdout */
218 i = read(pout[0], junk, JUNK);
219 }
220 if (i == 0) {
221 /* stdout is closed by child */
222 out_done++;
223 }
224 }
225 /* same comment as above | sed -e 's/stdout/stderr/' */
226 if (FD_ISSET(perr[0], &rset)) {
227 if (MAX_ERR - err_read - 1 > 0) {
228 i = read(perr[0], err,
229 MAX_ERR - err_read);
230 err_read += i;
231 err += i;
232 } else {
233 /* buffer full, empty stderr */
234 i = read(perr[0], junk, JUNK);
235 }
236 if (i == 0) {
237 /* stderr is closed by child */
238 err_done++;
239 }
240 }
241 }
242 resp->mmr_out[out_read] = '\0';
243 resp->mmr_err[err_read] = '\0';
244
245 while (waitpid(pid, &stat_loc, 0) < 0) {
246 if (errno != EINTR) {
247 resp->mmr_comm_state = MDMNE_HANDLER_FAILED;
248 break;
249 }
250 }
251 if (errno == 0)
252 resp->mmr_exitval = WEXITSTATUS(stat_loc);
253
254 (void) close(pout[0]);
255 (void) close(perr[0]);
256 }
257 out:
258 for (i = 0; i < MAX_N_ARGS; i++) {
259 if (argv[i] != NULL) {
260 free(argv[i]);
261 }
262 }
263 if (command != NULL) {
264 Free(command);
265 }
266 }
267
268 /*
269 * This is for checking if a metadevice is opened, and for
270 * locking in case it is not and for
271 * unlocking a locked device
272 */
273 /*ARGSUSED*/
274 void
mdmn_do_clu(md_mn_msg_t * msg,uint_t flags,md_mn_result_t * resp)275 mdmn_do_clu(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
276 {
277 if (msg->msg_type == MD_MN_MSG_CLU_CHECK) {
278 md_isopen_t *d;
279 int ret;
280
281 resp->mmr_comm_state = MDMNE_ACK; /* Ok state */;
282 resp->mmr_out_size = 0;
283 resp->mmr_err_size = 0;
284 resp->mmr_out = NULL;
285 resp->mmr_err = NULL;
286 d = (md_isopen_t *)(void *)msg->msg_event_data;
287 ret = metaioctl(MD_IOCISOPEN, d, &(d->mde), NULL);
288 /*
289 * In case the ioctl succeeded, return the open state of
290 * the metadevice. Otherwise we return the error the ioctl
291 * produced. As this is not zero, no attempt is made to
292 * remove/rename the metadevice later
293 */
294
295 if (ret == 0) {
296 resp->mmr_exitval = d->isopen;
297 } else {
298 /*
299 * When doing a metaclear, one node after the other
300 * does the two steps:
301 * - check on all nodes if this md is opened.
302 * - remove the md locally.
303 * When the 2nd node asks all nodes if the md is
304 * open it starts with the first node.
305 * As this already removed the md, the check
306 * returns MDE_UNIT_NOT_SETUP.
307 * In order to not keep the 2nd node from proceeding,
308 * we map this to an Ok.
309 */
310 if (mdismderror(&(d->mde), MDE_UNIT_NOT_SETUP)) {
311 mdclrerror(&(d->mde));
312 ret = 0;
313 }
314
315 resp->mmr_exitval = ret;
316 }
317 }
318 }
319
320 /* handler for MD_MN_MSG_REQUIRE_OWNER */
321 /*ARGSUSED*/
322 void
mdmn_do_req_owner(md_mn_msg_t * msg,uint_t flags,md_mn_result_t * resp)323 mdmn_do_req_owner(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
324 {
325 md_set_mmown_params_t setown;
326 md_mn_req_owner_t *d;
327 int ret, n = 0;
328
329 resp->mmr_out_size = 0;
330 resp->mmr_err_size = 0;
331 resp->mmr_out = NULL;
332 resp->mmr_err = NULL;
333 resp->mmr_comm_state = MDMNE_ACK;
334 d = (md_mn_req_owner_t *)(void *)msg->msg_event_data;
335
336 (void) memset(&setown, 0, sizeof (setown));
337 MD_SETDRIVERNAME(&setown, MD_MIRROR, MD_MIN2SET(d->mnum))
338 setown.d.mnum = d->mnum;
339 setown.d.owner = d->owner;
340
341 /* Retry ownership change if we get EAGAIN returned */
342 while ((ret = metaioctl(MD_MN_SET_MM_OWNER, &setown, &setown.mde, NULL))
343 != 0) {
344 md_sys_error_t *ip =
345 &setown.mde.info.md_error_info_t_u.sys_error;
346 if (ip->errnum != EAGAIN) {
347 break;
348 }
349 if (n++ >= 10) {
350 break;
351 }
352 (void) sleep(1);
353 }
354
355 resp->mmr_exitval = ret;
356 }
357
358 /*
359 * handler for MD_MN_MSG_CHOOSE_OWNER
360 * This is called when a mirror resync has no owner. The master node generates
361 * this message which is not broadcast to the other nodes. The message is
362 * required as the kernel does not have access to the nodelist for the set.
363 */
364 /*ARGSUSED*/
365 void
mdmn_do_choose_owner(md_mn_msg_t * msg,uint_t flags,md_mn_result_t * resp)366 mdmn_do_choose_owner(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
367 {
368 md_mn_msg_chowner_t chownermsg;
369 md_mn_msg_chooseid_t *d;
370 int ret = 0;
371 int nodecnt;
372 int nodeno;
373 uint_t nodeid;
374 uint_t myflags;
375 set_t setno;
376 mdsetname_t *sp;
377 md_set_desc *sd;
378 md_mnnode_desc *nd;
379 md_error_t mde = mdnullerror;
380 md_mn_result_t *resp1 = NULL;
381
382 resp->mmr_out_size = 0;
383 resp->mmr_err_size = 0;
384 resp->mmr_out = NULL;
385 resp->mmr_err = NULL;
386 resp->mmr_comm_state = MDMNE_ACK;
387 d = (md_mn_msg_chooseid_t *)(void *)msg->msg_event_data;
388
389 /*
390 * The node to be chosen will be the resync count for the set
391 * modulo the number of live nodes in the set
392 */
393 setno = MD_MIN2SET(d->msg_chooseid_mnum);
394 if ((sp = metasetnosetname(setno, &mde)) == NULL) {
395 syslog(LOG_ERR, dgettext(TEXT_DOMAIN,
396 "MD_MN_MSG_CHOOSE_OWNER: Invalid setno %d\n"), setno);
397 resp->mmr_exitval = 1;
398 return;
399 }
400 if ((sd = metaget_setdesc(sp, &mde)) == NULL) {
401 syslog(LOG_ERR, dgettext(TEXT_DOMAIN,
402 "MD_MN_MSG_CHOOSE_OWNER: Invalid set pointer\n"));
403 resp->mmr_exitval = 1;
404 return;
405 }
406
407 /* Count the number of live nodes */
408 nodecnt = 0;
409 nd = sd->sd_nodelist;
410 while (nd) {
411 if (nd->nd_flags & MD_MN_NODE_ALIVE)
412 nodecnt++;
413 nd = nd->nd_next;
414 }
415 nodeno = (d->msg_chooseid_rcnt%nodecnt);
416
417 /*
418 * If we've been called with msg_chooseid_set_node set TRUE then we
419 * are simply re-setting the owner id to ensure consistency across
420 * the cluster.
421 * If the flag is reset (B_FALSE) we are requesting a new owner to be
422 * determined.
423 */
424 if (d->msg_chooseid_set_node) {
425 nodeid = d->msg_chooseid_rcnt;
426 } else {
427 /* scan the nodelist looking for the required node */
428 nodecnt = 0;
429 nd = sd->sd_nodelist;
430 while (nd) {
431 if (nd->nd_flags & MD_MN_NODE_ALIVE) {
432 if (nodecnt == nodeno)
433 break;
434 nodecnt++;
435 }
436 nd = nd->nd_next;
437 }
438 nodeid = nd->nd_nodeid;
439 }
440
441 /* Send message to all nodes to make ownership change */
442 chownermsg.msg_chowner_mnum = d->msg_chooseid_mnum;
443 chownermsg.msg_chowner_nodeid = nodeid;
444 myflags = MD_MSGF_NO_LOG;
445
446 /* inherit some flags from the parent message */
447 myflags |= msg->msg_flags & MD_MSGF_INHERIT_BITS;
448
449 ret = mdmn_send_message(MD_MIN2SET(d->msg_chooseid_mnum),
450 MD_MN_MSG_CHANGE_OWNER, myflags, 0, (char *)&chownermsg,
451 sizeof (chownermsg), &resp1, &mde);
452 if (resp1 != NULL)
453 free_result(resp1);
454 resp->mmr_exitval = ret;
455 }
456
457 /*
458 * Handler for MD_MN_MSG_CHANGE_OWNER
459 * This is called when we are perfoming a resync and wish to change from
460 * no mirror owner to an owner chosen by the master.
461 * This mesage is only relevant for the new owner, the message will be
462 * ignored by all other nodes
463 */
464 /*ARGSUSED*/
465 void
mdmn_do_change_owner(md_mn_msg_t * msg,uint_t flags,md_mn_result_t * resp)466 mdmn_do_change_owner(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
467 {
468 md_set_mmown_params_t setown;
469 md_mn_msg_chowner_t *d;
470 int ret = 0;
471 set_t setno;
472 mdsetname_t *sp;
473 md_set_desc *sd;
474 md_error_t mde = mdnullerror;
475
476 resp->mmr_out_size = 0;
477 resp->mmr_err_size = 0;
478 resp->mmr_out = NULL;
479 resp->mmr_err = NULL;
480 resp->mmr_comm_state = MDMNE_ACK;
481 d = (md_mn_msg_chowner_t *)(void *)msg->msg_event_data;
482
483 setno = MD_MIN2SET(d->msg_chowner_mnum);
484 if ((sp = metasetnosetname(setno, &mde)) == NULL) {
485 syslog(LOG_ERR, dgettext(TEXT_DOMAIN,
486 "MD_MN_MSG_CHANGE_OWNER: Invalid setno %d\n"), setno);
487 resp->mmr_exitval = 1;
488 return;
489 }
490 if ((sd = metaget_setdesc(sp, &mde)) == NULL) {
491 syslog(LOG_ERR, dgettext(TEXT_DOMAIN,
492 "MD_MN_MSG_CHANGE_OWNER: Invalid set pointer\n"));
493 resp->mmr_exitval = 1;
494 return;
495 }
496
497 if (d->msg_chowner_nodeid == sd->sd_mn_mynode->nd_nodeid) {
498 /*
499 * If we are the chosen owner, issue ioctl to make the
500 * ownership change
501 */
502 (void) memset(&setown, 0, sizeof (md_set_mmown_params_t));
503 setown.d.mnum = d->msg_chowner_mnum;
504 setown.d.owner = d->msg_chowner_nodeid;
505 setown.d.flags = MD_MN_MM_SPAWN_THREAD;
506 MD_SETDRIVERNAME(&setown, MD_MIRROR,
507 MD_MIN2SET(d->msg_chowner_mnum));
508
509 /*
510 * Single shot at changing the the owner, if it fails EAGAIN,
511 * another node must have become the owner while we are in the
512 * process of making this choice.
513 */
514
515 ret = metaioctl(MD_MN_SET_MM_OWNER, &setown,
516 &(setown.mde), NULL);
517 if (ret == EAGAIN)
518 ret = 0;
519 }
520 resp->mmr_exitval = ret;
521 }
522
523 /* handler for MD_MN_MSG_SUSPEND_WRITES */
524 /*ARGSUSED*/
525 void
mdmn_do_susp_write(md_mn_msg_t * msg,uint_t flags,md_mn_result_t * resp)526 mdmn_do_susp_write(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
527 {
528 /* Suspend writes to a region of a mirror */
529 md_suspend_wr_params_t suspwr_ioc;
530 md_mn_msg_suspwr_t *d;
531 int ret;
532
533 resp->mmr_out_size = 0;
534 resp->mmr_err_size = 0;
535 resp->mmr_out = NULL;
536 resp->mmr_err = NULL;
537 resp->mmr_comm_state = MDMNE_ACK;
538 d = (md_mn_msg_suspwr_t *)(void *)msg->msg_event_data;
539
540 (void) memset(&suspwr_ioc, 0, sizeof (md_suspend_wr_params_t));
541 MD_SETDRIVERNAME(&suspwr_ioc, MD_MIRROR,
542 MD_MIN2SET(d->msg_suspwr_mnum));
543 suspwr_ioc.mnum = d->msg_suspwr_mnum;
544 ret = metaioctl(MD_MN_SUSPEND_WRITES, &suspwr_ioc,
545 &(suspwr_ioc.mde), NULL);
546 resp->mmr_exitval = ret;
547 }
548
549 /*
550 * handler for MD_MN_MSG_STATE_UPDATE_RESWR
551 * This functions update a submirror component state and then resumes writes
552 * to the mirror
553 */
554 /*ARGSUSED*/
555 void
mdmn_do_state_upd_reswr(md_mn_msg_t * msg,uint_t flags,md_mn_result_t * resp)556 mdmn_do_state_upd_reswr(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
557 {
558 /* Update the state of the component of a mirror */
559 md_set_state_params_t setstate_ioc;
560 md_mn_msg_stch_t *d;
561 int ret;
562
563 resp->mmr_out_size = 0;
564 resp->mmr_err_size = 0;
565 resp->mmr_out = NULL;
566 resp->mmr_err = NULL;
567 resp->mmr_comm_state = MDMNE_ACK;
568 d = (md_mn_msg_stch_t *)(void *)msg->msg_event_data;
569
570 (void) memset(&setstate_ioc, 0, sizeof (md_set_state_params_t));
571 MD_SETDRIVERNAME(&setstate_ioc, MD_MIRROR,
572 MD_MIN2SET(d->msg_stch_mnum));
573 setstate_ioc.mnum = d->msg_stch_mnum;
574 setstate_ioc.sm = d->msg_stch_sm;
575 setstate_ioc.comp = d->msg_stch_comp;
576 setstate_ioc.state = d->msg_stch_new_state;
577 setstate_ioc.hs_id = d->msg_stch_hs_id;
578 ret = metaioctl(MD_MN_SET_STATE, &setstate_ioc,
579 &(setstate_ioc.mde), NULL);
580 resp->mmr_exitval = ret;
581 }
582
583 /*
584 * submessage generator for MD_MN_MSG_STATE_UPDATE and MD_MN_MSG_STATE_UPDATE2
585 * This generates 2 messages, the first is SUSPEND_WRITES and
586 * depending on the type of the original message the second one is
587 * either STATE_UPDATE_RESWR or STATE_UPDATE_RESWR2 which actually does
588 * the same, but runs on a higher class.
589 */
590 int
mdmn_smgen_state_upd(md_mn_msg_t * msg,md_mn_msg_t * msglist[])591 mdmn_smgen_state_upd(md_mn_msg_t *msg, md_mn_msg_t *msglist[])
592 {
593 md_mn_msg_t *nmsg;
594 md_mn_msg_stch_t *d;
595 md_mn_msg_stch_t *stch_data;
596 md_mn_msg_suspwr_t *suspwr_data;
597
598 d = (md_mn_msg_stch_t *)(void *)msg->msg_event_data;
599
600 nmsg = Zalloc(sizeof (md_mn_msg_t));
601 MSGID_COPY(&(msg->msg_msgid), &(nmsg->msg_msgid));
602
603 nmsg->msg_flags = MD_MSGF_NO_LOG; /* Don't log submessages */
604 nmsg->msg_setno = msg->msg_setno;
605 nmsg->msg_type = MD_MN_MSG_SUSPEND_WRITES;
606 nmsg->msg_event_size = sizeof (md_mn_msg_suspwr_t);
607 nmsg->msg_event_data = Zalloc(sizeof (md_mn_msg_suspwr_t));
608 suspwr_data = (md_mn_msg_suspwr_t *)(void *)nmsg->msg_event_data;
609 suspwr_data->msg_suspwr_mnum = d->msg_stch_mnum;
610 msglist[0] = nmsg;
611
612 nmsg = Zalloc(sizeof (md_mn_msg_t));
613 MSGID_COPY(&(msg->msg_msgid), &(nmsg->msg_msgid));
614
615 nmsg->msg_flags = MD_MSGF_NO_LOG; /* Don't log submessages */
616 nmsg->msg_setno = msg->msg_setno;
617 if (msg->msg_type == MD_MN_MSG_STATE_UPDATE2) {
618 nmsg->msg_type = MD_MN_MSG_STATE_UPDATE_RESWR2;
619 } else {
620 nmsg->msg_type = MD_MN_MSG_STATE_UPDATE_RESWR;
621 }
622 nmsg->msg_event_size = sizeof (md_mn_msg_stch_t);
623 nmsg->msg_event_data = Zalloc(sizeof (md_mn_msg_stch_t));
624 stch_data = (md_mn_msg_stch_t *)(void *)nmsg->msg_event_data;
625 stch_data->msg_stch_mnum = d->msg_stch_mnum;
626 stch_data->msg_stch_sm = d->msg_stch_sm;
627 stch_data->msg_stch_comp = d->msg_stch_comp;
628 stch_data->msg_stch_new_state = d->msg_stch_new_state;
629 stch_data->msg_stch_hs_id = d->msg_stch_hs_id;
630 msglist[1] = nmsg;
631 return (2); /* Return the number of submessages generated */
632 }
633
634 /*
635 * handler for MD_MN_MSG_ALLOCATE_HOTSPARE and MD_MN_MSG_ALLOCATE_HOTSPARE2
636 * This sends a message to all nodes requesting them to allocate a hotspare
637 * for the specified component. The component is specified by the mnum of
638 * the mirror, the submirror index and the component index.
639 */
640 /*ARGSUSED*/
641 void
mdmn_do_allocate_hotspare(md_mn_msg_t * msg,uint_t flags,md_mn_result_t * resp)642 mdmn_do_allocate_hotspare(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
643 {
644 /* Allocate a hotspare for a mirror component */
645 md_alloc_hotsp_params_t allochsp_ioc;
646 md_mn_msg_allochsp_t *d;
647 int ret;
648
649 resp->mmr_out_size = 0;
650 resp->mmr_err_size = 0;
651 resp->mmr_out = NULL;
652 resp->mmr_err = NULL;
653 resp->mmr_comm_state = MDMNE_ACK;
654 d = (md_mn_msg_allochsp_t *)((void *)(msg->msg_event_data));
655
656 (void) memset(&allochsp_ioc, 0,
657 sizeof (md_alloc_hotsp_params_t));
658 MD_SETDRIVERNAME(&allochsp_ioc, MD_MIRROR,
659 MD_MIN2SET(d->msg_allochsp_mnum));
660 allochsp_ioc.mnum = d->msg_allochsp_mnum;
661 allochsp_ioc.sm = d->msg_allochsp_sm;
662 allochsp_ioc.comp = d->msg_allochsp_comp;
663 allochsp_ioc.hs_id = d->msg_allochsp_hs_id;
664 ret = metaioctl(MD_MN_ALLOCATE_HOTSPARE, &allochsp_ioc,
665 &(allochsp_ioc.mde), NULL);
666 resp->mmr_exitval = ret;
667 }
668
669 /*
670 * handler for MD_MN_MSG_RESYNC_STARTING,MD_MN_MSG_RESYNC_FIRST,
671 * MD_MN_MSG_RESYNC_NEXT, MD_MN_MSG_RESYNC_FINISH, MD_MN_MSG_RESYNC_PHASE_DONE
672 */
673 /*ARGSUSED*/
674 void
mdmn_do_resync(md_mn_msg_t * msg,uint_t flags,md_mn_result_t * resp)675 mdmn_do_resync(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
676 {
677 md_mn_msg_resync_t *d;
678 md_mn_rs_params_t respar;
679 mddb_setflags_config_t sf;
680 md_error_t ep = mdnullerror;
681 mdsetname_t *sp;
682 int ret;
683 int smi;
684 int start_flag = 1;
685 int sleep_count = 0;
686 unsigned int sleep_time = 2;
687
688 resp->mmr_out_size = 0;
689 resp->mmr_err_size = 0;
690 resp->mmr_out = NULL;
691 resp->mmr_err = NULL;
692 resp->mmr_comm_state = MDMNE_ACK;
693 d = (md_mn_msg_resync_t *)((void *)(msg->msg_event_data));
694
695 (void) memset(&respar, 0, sizeof (respar));
696 MD_SETDRIVERNAME(&respar, MD_MIRROR,
697 MD_MIN2SET(d->msg_resync_mnum))
698 respar.msg_type = (int)msg->msg_type;
699 respar.mnum = d->msg_resync_mnum;
700 respar.rs_type = d->msg_resync_type;
701 respar.rs_start = d->msg_resync_start;
702 respar.rs_size = d->msg_resync_rsize;
703 respar.rs_done = d->msg_resync_done;
704 respar.rs_2_do = d->msg_resync_2_do;
705 respar.rs_originator = d->msg_originator;
706 respar.rs_flags = d->msg_resync_flags;
707
708 for (smi = 0; smi < NMIRROR; smi++) {
709 respar.rs_sm_state[smi] = d->msg_sm_state[smi];
710 respar.rs_sm_flags[smi] = d->msg_sm_flags[smi];
711 }
712
713 /*
714 * Prior to running the resync thread first check that the start_step
715 * flag (MD_SET_MN_START_RC) added by metaclust's MC_START step has been
716 * removed from the set record flags. Ordinarily, this would be removed
717 * at MC_STEP4 in metaclust - need to ensure this has happened on all
718 * nodes.
719 */
720 (void) memset(&sf, 0, sizeof (sf));
721 sf.sf_setno = MD_MIN2SET(d->msg_resync_mnum);
722 sf.sf_flags = MDDB_NM_GET;
723 /* Use magic to help protect ioctl against attack. */
724 sf.sf_magic = MDDB_SETFLAGS_MAGIC;
725 if ((sp = metasetnosetname(sf.sf_setno, &ep)) == NULL) {
726 syslog(LOG_ERR, dgettext(TEXT_DOMAIN,
727 "MDMN_DO_RESYNC: Invalid setno = %d\n"),
728 sf.sf_setno);
729 (void) mdstealerror(&(resp->mmr_ep), &ep);
730 resp->mmr_exitval = -1;
731 return;
732 }
733
734 /* start_flag always true initially */
735 while (start_flag) {
736 if (metaioctl(MD_MN_GET_SETFLAGS, &sf, &sf.sf_mde, NULL) != 0) {
737 syslog(LOG_ERR, dgettext(TEXT_DOMAIN,
738 "MDMN_DO_RESYNC: Could not get start_step "
739 "flag for set %s - returning\n"),
740 sp->setname);
741 (void) mdstealerror(&(resp->mmr_ep), &sf.sf_mde);
742 resp->mmr_exitval = -1;
743 return;
744 }
745
746 /* metaioctl returns successfully - is start flag cleared? */
747 if (sf.sf_setflags & MD_SET_MN_START_RC) {
748 start_flag = 1;
749 (void) sleep(sleep_time);
750 sleep_count++;
751 if ((sleep_count == 1) ||
752 (sleep_count % SLEEP_MOD) == 0) {
753 syslog(LOG_ERR, dgettext(TEXT_DOMAIN,
754 "MDMN_DO_RESYNC: Waiting for start_step "
755 "flag for set %s to be cleared\n"),
756 sp->setname);
757 }
758 if (sleep_count == MAX_SLEEPS) {
759 syslog(LOG_ERR, dgettext(TEXT_DOMAIN,
760 "MDMN_DO_RESYNC: Could not clear "
761 "start_step flag for set %s "
762 "- returning\n"), sp->setname);
763 resp->mmr_exitval = -1;
764 return;
765 }
766 } else {
767 start_flag = 0;
768 }
769 }
770
771 ret = metaioctl(MD_MN_RESYNC, &respar, &respar.mde, NULL);
772 if (ret) {
773 (void) mdstealerror(&(resp->mmr_ep), &respar.mde);
774 }
775 resp->mmr_exitval = ret;
776 }
777
778 /*
779 * handler for MD_MN_MSG_SETSYNC
780 */
781 /*ARGSUSED*/
782 void
mdmn_do_setsync(md_mn_msg_t * msg,uint_t flags,md_mn_result_t * resp)783 mdmn_do_setsync(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
784 {
785 md_mn_msg_setsync_t *d;
786 md_resync_ioctl_t ri;
787 int ret;
788
789 resp->mmr_out_size = 0;
790 resp->mmr_err_size = 0;
791 resp->mmr_out = NULL;
792 resp->mmr_err = NULL;
793 resp->mmr_comm_state = MDMNE_ACK;
794 d = (md_mn_msg_setsync_t *)((void *)(msg->msg_event_data));
795
796 (void) memset(&ri, 0, sizeof (ri));
797 MD_SETDRIVERNAME(&ri, MD_MIRROR, MD_MIN2SET(d->setsync_mnum))
798 ri.ri_mnum = d->setsync_mnum;
799 ri.ri_copysize = d->setsync_copysize;
800 ri.ri_flags = d->setsync_flags;
801
802 ret = metaioctl(MD_MN_SETSYNC, &ri, &ri.mde, NULL);
803
804 resp->mmr_exitval = ret;
805 }
806
807 /*
808 * handler for MD_MN_MSG_SET_CAP. As this handler can deal with both mirrors
809 * and soft partitions, the driver name that is required for the ioctl call
810 * is included in the message.
811 */
812 /*ARGSUSED*/
813 void
mdmn_do_set_cap(md_mn_msg_t * msg,uint_t flags,md_mn_result_t * resp)814 mdmn_do_set_cap(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
815 {
816 md_mn_msg_setcap_t *d;
817 md_mn_setcap_params_t setcap_ioc;
818 minor_t mnum;
819 int ret;
820
821 resp->mmr_out_size = 0;
822 resp->mmr_err_size = 0;
823 resp->mmr_out = NULL;
824 resp->mmr_err = NULL;
825 resp->mmr_comm_state = MDMNE_ACK;
826 d = (md_mn_msg_setcap_t *)((void *)(msg->msg_event_data));
827 mnum = d->msg_setcap_mnum;
828
829 (void) memset(&setcap_ioc, 0, sizeof (setcap_ioc));
830
831 MD_SETDRIVERNAME(&setcap_ioc, d->msg_setcap_driver, MD_MIN2SET(mnum));
832 setcap_ioc.mnum = mnum;
833 setcap_ioc.sc_set = d->msg_setcap_set;
834
835 ret = metaioctl(MD_MN_SET_CAP, &setcap_ioc, &setcap_ioc.mde, NULL);
836
837 resp->mmr_exitval = ret;
838 }
839
840 /*
841 * Dummy handler for various CLASS0 messages like
842 * MD_MN_MSG_VERBOSITY / MD_MN_MSG_RESUME / MD_MN_MSG_SUSPEND ...
843 */
844 /*ARGSUSED*/
845 void
mdmn_do_dummy(md_mn_msg_t * msg,uint_t flags,md_mn_result_t * resp)846 mdmn_do_dummy(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
847 {
848 resp->mmr_out_size = 0;
849 resp->mmr_err_size = 0;
850 resp->mmr_out = NULL;
851 resp->mmr_err = NULL;
852 resp->mmr_exitval = 0;
853 resp->mmr_comm_state = MDMNE_ACK;
854 }
855
856 /*
857 * Overall description of mdcommd support that keeps all nodes in-sync
858 * with the ondisk diskset mddbs.
859 *
860 * All configuration changes to the mddb - addition/deletion of metadevices
861 * or replicas must use a CLASS1 message to block out these changes.
862 * Changes to the state of existing replicas do not need to block CLASS1
863 * since there is no conflict when just updating the state of a replica.
864 *
865 * Error encountered when master writes to mddbs:
866 * As the master updates parts of the mddbs, flags are updated describing
867 * what has been written. When all locks are dropped (either in
868 * mddb_setexit or mdioctl), a PARSE message will be generated to all
869 * nodes with an index list of known good mddbs and the parse flags.
870 * The master node ignores the parse message since it sent it.
871 * The slave nodes re-read in the changed part of the mddb using the list
872 * of known good replicas that was passed.
873 * PARSE message does not block CLASS1.
874 * The PARSE message must be the highest class message. Since this
875 * message could be sent on any ioctl, this PARSE message class must
876 * be higher than any other class message that could issue an ioctl.
877 *
878 * Master Slave1 Slave2
879 * Handles_error
880 * PARSE PARSE PARSE
881 *
882 *
883 * Add/Delete mddbs can occur from the following commands:
884 * metadb -s set_name -a/-d
885 * metaset -s set_name -a/-d disk
886 * metaset -s set_name -b
887 *
888 * The metadb/metaset command is run on the node executing the command
889 * and sends an ATTACH/DETACH message to the master node blocking CLASS1
890 * messages on all nodes until this message is finished. The master
891 * node generates 3 submessages of BLOCK, SM_ATTACH/SM_DETACH, UNBLOCK.
892 * The BLOCK message is only run on the master node and will BLOCK
893 * the PARSE messages from being sent to the nodes.
894 * The SM_ATTACH/SM_DETACH message is run on all nodes and actually adds or
895 * removes the replica(s) from the given disk slice.
896 * The UNBLOCK message is only run on the master node and allows the
897 * sending of PARSE messages.
898 *
899 * Master Slave1 Slave2
900 * Add mddb cmd
901 * ATTACH msg to master
902 * BLOCK
903 * ATTACH ATTACH ATTACH
904 * UNBLOCK
905 * PARSE PARSE PARSE
906 * ATTACH msg finished
907 *
908 * Add/Delete host side information from the following commands:
909 * metaset -s set_name -a/-d -h
910 *
911 * The metaset command is run on the node executing the command and
912 * sends a DB_NEWSIDE/DB_DELSIDE message and a MD_NEWSIDE/MD_DELSIDE
913 * message whenever a host is added to or deleted from the diskset.
914 *
915 * The side information contains the major name and minor number
916 * associated with a disk slice from a certain node's perspective
917 * in a (failed) effort to support clustered systems that don't have the
918 * same device name for a physical device. (The original designers of
919 * SVM eventually took the shortcut of assuming that all device names
920 * are the same on all systems, but left the side information in the
921 * mddb and namespace.) The side information is used for disk slices
922 * that contain mddbs and/or are components for metadevices.
923 *
924 * The DB_NEWSIDE/DELSIDE command adds or deletes the side information
925 * for each mddb for the host being added or deleted.
926 * The MD_ADDSIDE/MD_DELSIDE command adds or deletes the side information
927 * for all disk slice components that are in the namespace records for
928 * the host being added or deleted.
929 *
930 * The DB_NEWSIDE/DB_DELSIDE message does not change any mddb records
931 * and only needs to be executed on the master node since the slave
932 * nodes will be brought up to date by the PARSE message that is
933 * generated as a result of a change to the mddb.
934 * The MD_ADDSIDE/MD_DELSIDE message does modify the records in the mddb
935 * and needs to be run on all nodes. The message must block class1
936 * messages so that record changing commands don't interfere.
937 *
938 * Master Slave1 Slave2
939 * Add host
940 * DB_NEWSIDE msg to master
941 * DB_NEWSIDE
942 * PARSE PARSE PARSE
943 * DB_NEWSIDE msg finished
944 * MD_NEWSIDE msg to master
945 * MD_NEWSIDE MD_NEWSIDE MD_NEWSIDE
946 * MD_NEWSIDE msg finished
947 *
948 *
949 * Optimized resync record failure:
950 * When any node sees a failure to write an optimized resync record
951 * that node notifies the master node of the replica that failed.
952 * The master node handles the error and updates the rest of the
953 * nodes using a PARSE message. The PARSE message also calls
954 * fixoptrecord on each slave node causing each node to fix up
955 * the optimized resync records that are owned by that node (the mirror
956 * owner code also sets the optimized resync record owner). The master
957 * node will fix up all optimized resync records that have no owner or
958 * are owned by the master node.
959 *
960 * Master Slave1 Slave2
961 * Optimized Record Failure
962 * OPTRECERR msg to master
963 * Master handles opt rec failure
964 * PARSE PARSE PARSE
965 * OPTRECERR msg finished
966 * Slave rewrites optimized record
967 *
968 */
969
970 /*
971 * Handler for MD_MN_MSG_MDDB_PARSE which send parse messages to the
972 * slave nodes in order to keep the incore view of the mddbs the
973 * same on all nodes.
974 *
975 * Since master node generated the mddb parse message, do nothing
976 * if this is the master node.
977 *
978 * If this is a slave node, send the parse message down to the kernel
979 * where this node will re-read in parts of the mddbs.
980 *
981 */
982 void
mdmn_do_mddb_parse(md_mn_msg_t * msg,uint_t flags,md_mn_result_t * resp)983 mdmn_do_mddb_parse(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
984 {
985 md_mn_msg_mddb_parse_t *d;
986 mddb_parse_parm_t mpp;
987 int ret = 0;
988 int i;
989
990 resp->mmr_out_size = 0;
991 resp->mmr_err_size = 0;
992 resp->mmr_out = NULL;
993 resp->mmr_err = NULL;
994 resp->mmr_comm_state = MDMNE_ACK;
995 d = (md_mn_msg_mddb_parse_t *)((void *)(msg->msg_event_data));
996
997 if (flags & MD_MSGF_ON_MASTER)
998 return;
999
1000 (void) memset(&mpp, 0, sizeof (mpp));
1001 mpp.c_setno = msg->msg_setno;
1002 mpp.c_parse_flags = d->msg_parse_flags;
1003 for (i = 0; i < MDDB_NLB; i++) {
1004 mpp.c_lb_flags[i] = d->msg_lb_flags[i];
1005 }
1006 ret = metaioctl(MD_MN_MDDB_PARSE, &mpp, &mpp.c_mde, NULL);
1007 if (ret)
1008 (void) mdstealerror(&(resp->mmr_ep), &mpp.c_mde);
1009
1010 resp->mmr_exitval = ret;
1011 }
1012
1013 /*
1014 * Handler for MD_MN_MSG_MDDB_BLOCK which blocks the generation
1015 * of parse messages from this node.
1016 *
1017 * This is needed when attaching/detaching mddbs on the master and the
1018 * slave node is unable to handle a parse message until the slave node
1019 * has done the attach/detach of the mddbs. So, master node will block
1020 * the parse messages, execute the attach/detach on all nodes and
1021 * then unblock the parse messages which causes the parse message to
1022 * be sent to all nodes.
1023 */
1024 /*ARGSUSED*/
1025 void
mdmn_do_mddb_block(md_mn_msg_t * msg,uint_t flags,md_mn_result_t * resp)1026 mdmn_do_mddb_block(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
1027 {
1028 md_mn_msg_mddb_block_t *d;
1029 mddb_block_parm_t mbp;
1030 int ret;
1031
1032 resp->mmr_out_size = 0;
1033 resp->mmr_err_size = 0;
1034 resp->mmr_out = NULL;
1035 resp->mmr_err = NULL;
1036 resp->mmr_comm_state = MDMNE_ACK;
1037 d = (md_mn_msg_mddb_block_t *)((void *)(msg->msg_event_data));
1038
1039 (void) memset(&mbp, 0, sizeof (mbp));
1040 mbp.c_setno = msg->msg_setno;
1041 mbp.c_blk_flags = d->msg_block_flags;
1042 ret = metaioctl(MD_MN_MDDB_BLOCK, &mbp, &mbp.c_mde, NULL);
1043 if (ret)
1044 (void) mdstealerror(&(resp->mmr_ep), &mbp.c_mde);
1045
1046 resp->mmr_exitval = ret;
1047 }
1048
1049 /*
1050 * Submessage generator for MD_MN_MSG_META_DB_ATTACH which generates
1051 * a BLOCK message on the master node only, a MD_MN_MSG_SM_MDDB_ATTACH
1052 * message on all nodes and then an UNBLOCK message on the master only.
1053 */
1054 int
mdmn_smgen_mddb_attach(md_mn_msg_t * msg,md_mn_msg_t * msglist[])1055 mdmn_smgen_mddb_attach(md_mn_msg_t *msg, md_mn_msg_t *msglist[])
1056 {
1057 md_mn_msg_t *nmsg;
1058 md_mn_msg_meta_db_attach_t *d;
1059 md_mn_msg_meta_db_attach_t *attach_d;
1060 md_mn_msg_mddb_block_t *block_d;
1061
1062 d = (md_mn_msg_meta_db_attach_t *)(void *)msg->msg_event_data;
1063
1064 nmsg = Zalloc(sizeof (md_mn_msg_t));
1065 MSGID_COPY(&(msg->msg_msgid), &(nmsg->msg_msgid));
1066
1067 nmsg->msg_flags = (MD_MSGF_NO_LOG | MD_MSGF_NO_BCAST);
1068 nmsg->msg_setno = msg->msg_setno;
1069 nmsg->msg_type = MD_MN_MSG_MDDB_BLOCK;
1070 nmsg->msg_event_size = sizeof (md_mn_msg_mddb_block_t);
1071 nmsg->msg_event_data = Zalloc(sizeof (md_mn_msg_mddb_block_t));
1072 block_d = (md_mn_msg_mddb_block_t *)(void *)nmsg->msg_event_data;
1073 block_d->msg_block_flags = MDDB_BLOCK_PARSE;
1074 msglist[0] = nmsg;
1075
1076 nmsg = Zalloc(sizeof (md_mn_msg_t));
1077 MSGID_COPY(&(msg->msg_msgid), &(nmsg->msg_msgid));
1078
1079 /* Don't log submessages and panic on inconsistent results */
1080 nmsg->msg_flags = MD_MSGF_NO_LOG |
1081 MD_MSGF_PANIC_WHEN_INCONSISTENT;
1082 nmsg->msg_setno = msg->msg_setno;
1083 nmsg->msg_type = MD_MN_MSG_SM_MDDB_ATTACH;
1084 nmsg->msg_event_size = sizeof (md_mn_msg_meta_db_attach_t);
1085 nmsg->msg_event_data = Zalloc(sizeof (md_mn_msg_meta_db_attach_t));
1086 attach_d = (md_mn_msg_meta_db_attach_t *)
1087 (void *)nmsg->msg_event_data;
1088 attach_d->msg_l_dev = d->msg_l_dev;
1089 attach_d->msg_cnt = d->msg_cnt;
1090 attach_d->msg_dbsize = d->msg_dbsize;
1091 (void) strncpy(attach_d->msg_dname, d->msg_dname, 16);
1092 attach_d->msg_splitname = d->msg_splitname;
1093 attach_d->msg_options = d->msg_options;
1094 msglist[1] = nmsg;
1095
1096 nmsg = Zalloc(sizeof (md_mn_msg_t));
1097 MSGID_COPY(&(msg->msg_msgid), &(nmsg->msg_msgid));
1098
1099 nmsg->msg_flags = (MD_MSGF_NO_LOG | MD_MSGF_NO_BCAST);
1100 nmsg->msg_setno = msg->msg_setno;
1101 nmsg->msg_type = MD_MN_MSG_MDDB_BLOCK;
1102 nmsg->msg_event_size = sizeof (md_mn_msg_mddb_block_t);
1103 nmsg->msg_event_data = Zalloc(sizeof (md_mn_msg_mddb_block_t));
1104 block_d = (md_mn_msg_mddb_block_t *)(void *)nmsg->msg_event_data;
1105 block_d->msg_block_flags = MDDB_UNBLOCK_PARSE;
1106 msglist[2] = nmsg;
1107
1108 return (3); /* Return the number of submessages generated */
1109 }
1110
1111 /*
1112 * Submessage generator for MD_MN_MSG_META_DB_DETACH which generates
1113 * a BLOCK message on the master node only, a MD_MN_MSG_SM_MDDB_DETACH
1114 * message on all nodes and then an UNBLOCK message on the master only.
1115 */
1116 int
mdmn_smgen_mddb_detach(md_mn_msg_t * msg,md_mn_msg_t * msglist[])1117 mdmn_smgen_mddb_detach(md_mn_msg_t *msg, md_mn_msg_t *msglist[])
1118 {
1119 md_mn_msg_t *nmsg;
1120 md_mn_msg_meta_db_detach_t *d;
1121 md_mn_msg_meta_db_detach_t *detach_d;
1122 md_mn_msg_mddb_block_t *block_d;
1123
1124 d = (md_mn_msg_meta_db_detach_t *)(void *)msg->msg_event_data;
1125
1126 nmsg = Zalloc(sizeof (md_mn_msg_t));
1127 MSGID_COPY(&(msg->msg_msgid), &(nmsg->msg_msgid));
1128
1129 nmsg->msg_flags = (MD_MSGF_NO_LOG | MD_MSGF_NO_BCAST);
1130 nmsg->msg_setno = msg->msg_setno;
1131 nmsg->msg_type = MD_MN_MSG_MDDB_BLOCK;
1132 nmsg->msg_event_size = sizeof (md_mn_msg_mddb_block_t);
1133 nmsg->msg_event_data = Zalloc(sizeof (md_mn_msg_mddb_block_t));
1134 block_d = (md_mn_msg_mddb_block_t *)(void *)nmsg->msg_event_data;
1135 block_d->msg_block_flags = MDDB_BLOCK_PARSE;
1136 msglist[0] = nmsg;
1137
1138 nmsg = Zalloc(sizeof (md_mn_msg_t));
1139 MSGID_COPY(&(msg->msg_msgid), &(nmsg->msg_msgid));
1140
1141 /* Don't log submessages and panic on inconsistent results */
1142 nmsg->msg_flags = MD_MSGF_NO_LOG |
1143 MD_MSGF_PANIC_WHEN_INCONSISTENT;
1144 nmsg->msg_setno = msg->msg_setno;
1145 nmsg->msg_type = MD_MN_MSG_SM_MDDB_DETACH;
1146 nmsg->msg_event_size = sizeof (md_mn_msg_meta_db_detach_t);
1147 nmsg->msg_event_data = Zalloc(sizeof (md_mn_msg_meta_db_detach_t));
1148 detach_d = (md_mn_msg_meta_db_detach_t *)
1149 (void *)nmsg->msg_event_data;
1150 detach_d->msg_splitname = d->msg_splitname;
1151 msglist[1] = nmsg;
1152
1153 nmsg = Zalloc(sizeof (md_mn_msg_t));
1154 MSGID_COPY(&(msg->msg_msgid), &(nmsg->msg_msgid));
1155
1156 nmsg->msg_flags = (MD_MSGF_NO_LOG | MD_MSGF_NO_BCAST);
1157 nmsg->msg_setno = msg->msg_setno;
1158 nmsg->msg_type = MD_MN_MSG_MDDB_BLOCK;
1159 nmsg->msg_event_size = sizeof (md_mn_msg_mddb_block_t);
1160 nmsg->msg_event_data = Zalloc(sizeof (md_mn_msg_mddb_block_t));
1161 block_d = (md_mn_msg_mddb_block_t *)(void *)nmsg->msg_event_data;
1162 block_d->msg_block_flags = MDDB_UNBLOCK_PARSE;
1163 msglist[2] = nmsg;
1164
1165 return (3); /* Return the number of submessages generated */
1166 }
1167
1168 /*
1169 * Handler for MD_MN_MSG_SM_MDDB_ATTACH which is used to attach mddbs.
1170 *
1171 * Used when running:
1172 * metadb -s set_name -a
1173 * metaset -s set_name -a/-d disk
1174 * metaset -s set_name -b
1175 */
1176 /*ARGSUSED*/
1177 void
mdmn_do_sm_mddb_attach(md_mn_msg_t * msg,uint_t flags,md_mn_result_t * resp)1178 mdmn_do_sm_mddb_attach(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
1179 {
1180 md_mn_msg_meta_db_attach_t *d;
1181 struct mddb_config c;
1182 int i;
1183 int ret = 0;
1184 md_error_t ep = mdnullerror;
1185 char *name, *add_name;
1186 mdname_t *np;
1187 mdsetname_t *sp;
1188
1189 resp->mmr_out_size = 0;
1190 resp->mmr_err_size = 0;
1191 resp->mmr_out = NULL;
1192 resp->mmr_err = NULL;
1193 resp->mmr_comm_state = MDMNE_ACK;
1194 d = (md_mn_msg_meta_db_attach_t *)((void *)(msg->msg_event_data));
1195
1196 (void) memset(&c, 0, sizeof (c));
1197 c.c_setno = msg->msg_setno;
1198 c.c_locator.l_dev = meta_cmpldev(d->msg_l_dev);
1199 (void) strncpy(c.c_locator.l_driver, d->msg_dname,
1200 sizeof (c.c_locator.l_driver));
1201 c.c_devname = d->msg_splitname;
1202 c.c_locator.l_mnum = meta_getminor(d->msg_l_dev);
1203 c.c_multi_node = 1;
1204 if ((sp = metasetnosetname(c.c_setno, &ep)) == NULL) {
1205 (void) mdstealerror(&(resp->mmr_ep), &ep);
1206 resp->mmr_exitval = -1;
1207 return;
1208 }
1209 (void) strcpy(c.c_setname, sp->setname);
1210 c.c_sideno = getmyside(sp, &ep);
1211 if (c.c_sideno == MD_SIDEWILD) {
1212 (void) mdstealerror(&(resp->mmr_ep), &ep);
1213 resp->mmr_exitval = -1;
1214 return;
1215 }
1216
1217 name = splicename(&d->msg_splitname);
1218 np = metaname(&sp, name, LOGICAL_DEVICE, &ep);
1219 Free(name);
1220 if (np == NULL) {
1221 (void) mdstealerror(&(resp->mmr_ep), &ep);
1222 resp->mmr_exitval = -1;
1223 return;
1224 }
1225 /*
1226 * All nodes in MN diskset must do meta_check_replica
1227 * since this causes the shared namespace to be
1228 * populated by the md driver names while checking
1229 * to see if this device is already in use as a
1230 * metadevice.
1231 */
1232 if (meta_check_replica(sp, np, d->msg_options, 0,
1233 (d->msg_cnt * d->msg_dbsize), &ep)) {
1234 (void) mdstealerror(&(resp->mmr_ep), &ep);
1235 resp->mmr_exitval = -1;
1236 return;
1237 }
1238
1239 for (i = 0; i < d->msg_cnt; i++) {
1240 c.c_locator.l_blkno = i * d->msg_dbsize + 16;
1241 if (setup_med_cfg(sp, &c,
1242 (d->msg_options & MDCHK_SET_FORCE), &ep)) {
1243 ret = -1;
1244 (void) mdstealerror(&(resp->mmr_ep), &ep);
1245 break;
1246 }
1247 ret = metaioctl(MD_DB_NEWDEV, &c, &c.c_mde, NULL);
1248 /* If newdev was successful, continue with attach */
1249 if (ret == 0) {
1250 if (meta_db_addsidenms(sp, np, c.c_locator.l_blkno,
1251 DB_ADDSIDENMS_NO_BCAST, &ep)) {
1252 ret = -1;
1253 (void) mdstealerror(&(resp->mmr_ep), &ep);
1254 break;
1255 }
1256 } else {
1257 (void) mdstealerror(&(resp->mmr_ep), &c.c_mde);
1258 break;
1259 }
1260 }
1261 add_name = splicename(&d->msg_splitname);
1262 if ((np = metaname(&sp, add_name, LOGICAL_DEVICE, &ep)) != NULL) {
1263 meta_invalidate_name(np);
1264 } else {
1265 ret = -1;
1266 (void) mdstealerror(&(resp->mmr_ep), &ep);
1267 }
1268 Free(add_name);
1269
1270 resp->mmr_exitval = ret;
1271 }
1272
1273 /*
1274 * Handler for MD_MN_MSG_SM_MDDB_DETACH which is used to detach mddbs.
1275 *
1276 * Used when running:
1277 * metadb -s set_name -d
1278 * metaset -s set_name -a/-d disk
1279 * metaset -s set_name -b
1280 */
1281 /*ARGSUSED*/
1282 void
mdmn_do_sm_mddb_detach(md_mn_msg_t * msg,uint_t flags,md_mn_result_t * resp)1283 mdmn_do_sm_mddb_detach(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
1284 {
1285 md_mn_msg_meta_db_detach_t *d;
1286 struct mddb_config c;
1287 int i;
1288 int ret = 0;
1289 md_error_t ep = mdnullerror;
1290 char *name, *del_name;
1291 mdname_t *np;
1292 mdsetname_t *sp;
1293
1294 resp->mmr_out_size = 0;
1295 resp->mmr_err_size = 0;
1296 resp->mmr_out = NULL;
1297 resp->mmr_err = NULL;
1298 resp->mmr_comm_state = MDMNE_ACK;
1299 d = (md_mn_msg_meta_db_detach_t *)((void *)(msg->msg_event_data));
1300
1301 if ((sp = metasetnosetname(msg->msg_setno, &ep)) == NULL) {
1302 (void) mdstealerror(&(resp->mmr_ep), &ep);
1303 resp->mmr_exitval = -1;
1304 return;
1305 }
1306
1307 (void) memset(&c, 0, sizeof (c));
1308 c.c_setno = msg->msg_setno;
1309 if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) {
1310 resp->mmr_exitval = -1;
1311 (void) mdstealerror(&(resp->mmr_ep), &c.c_mde);
1312 return;
1313 }
1314 i = 0;
1315 del_name = splicename(&d->msg_splitname);
1316 while (i < c.c_dbcnt) {
1317 c.c_id = i;
1318 if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) {
1319 ret = -1;
1320 (void) mdstealerror(&(resp->mmr_ep), &c.c_mde);
1321 break;
1322 }
1323 name = splicename(&c.c_devname);
1324 if (strcmp(name, del_name) != 0) {
1325 Free(name);
1326 i++;
1327 continue;
1328 }
1329 Free(name);
1330 /* Found a match - delete mddb */
1331 if (metaioctl(MD_DB_DELDEV, &c, &c.c_mde, NULL) != 0) {
1332 ret = -1;
1333 (void) mdstealerror(&(resp->mmr_ep), &c.c_mde);
1334 break;
1335 }
1336 /* Not incrementing "i" intentionally (dbcnt is changed) */
1337 }
1338 if ((np = metaname(&sp, del_name, LOGICAL_DEVICE, &ep)) != NULL) {
1339 meta_invalidate_name(np);
1340 } else {
1341 ret = -1;
1342 (void) mdstealerror(&(resp->mmr_ep), &ep);
1343 }
1344 Free(del_name);
1345
1346 resp->mmr_exitval = ret;
1347 }
1348
1349 /*
1350 * Handler for MD_MN_MSG_META_DB_NEWSIDE which is used to update the
1351 * side information for each diskset mddb when a new host has been
1352 * added to the diskset. The side information is the /dev/dsk/ctds name
1353 * that the new node would use to access each mddb.
1354 *
1355 * Since this routine makes no changes to the records in the diskset mddb,
1356 * this routine only needs to be run on the master node. The master node's
1357 * kernel code will detect that portions of the mddb have changed and
1358 * will send a parse message to all nodes to re-parse parts of the mddb.
1359 *
1360 * Used when running:
1361 * metaset -s set_name -a -h new_hostname
1362 */
1363 /*ARGSUSED*/
1364 void
mdmn_do_meta_db_newside(md_mn_msg_t * msg,uint_t flags,md_mn_result_t * resp)1365 mdmn_do_meta_db_newside(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
1366 {
1367 md_mn_msg_meta_db_newside_t *d;
1368 struct mddb_config c;
1369 int ret = 0;
1370 mdsetname_t *sp;
1371 md_error_t ep = mdnullerror;
1372
1373 resp->mmr_out_size = 0;
1374 resp->mmr_err_size = 0;
1375 resp->mmr_out = NULL;
1376 resp->mmr_err = NULL;
1377 resp->mmr_comm_state = MDMNE_ACK;
1378 d = (md_mn_msg_meta_db_newside_t *)((void *)(msg->msg_event_data));
1379
1380 (void) memset(&c, 0, sizeof (c));
1381 c.c_setno = msg->msg_setno;
1382 c.c_locator.l_dev = meta_cmpldev(d->msg_l_dev);
1383 c.c_locator.l_blkno = d->msg_blkno;
1384 (void) strncpy(c.c_locator.l_driver, d->msg_dname,
1385 sizeof (c.c_locator.l_driver));
1386 c.c_devname = d->msg_splitname;
1387 c.c_locator.l_mnum = d->msg_mnum;
1388 c.c_multi_node = 1;
1389 if ((sp = metasetnosetname(c.c_setno, &ep)) == NULL) {
1390 (void) mdstealerror(&(resp->mmr_ep), &ep);
1391 resp->mmr_exitval = -1;
1392 return;
1393 }
1394 (void) strcpy(c.c_setname, sp->setname);
1395 c.c_sideno = d->msg_sideno;
1396
1397 if ((ret = metaioctl(MD_DB_NEWSIDE, &c, &c.c_mde, NULL)) != 0) {
1398 (void) mdstealerror(&(resp->mmr_ep), &c.c_mde);
1399 }
1400 resp->mmr_exitval = ret;
1401 }
1402
1403 /*
1404 * Handler for MD_MN_MSG_META_DB_DELSIDE which is used to remove the
1405 * side information for each diskset mddb when a host has been
1406 * deleted from the diskset. The side information is the /dev/dsk/ctds name
1407 * that the node would use to access each mddb.
1408 *
1409 * Since this routine makes no changes to the records in the diskset mddb,
1410 * this routine only needs to be run on the master node. The master node's
1411 * kernel code will detect that portions of the mddb have changed and
1412 * will send a parse message to all nodes to re-parse parts of the mddb.
1413 *
1414 * Used when running:
1415 * metaset -s set_name -d -h hostname
1416 */
1417 /*ARGSUSED*/
1418 void
mdmn_do_meta_db_delside(md_mn_msg_t * msg,uint_t flags,md_mn_result_t * resp)1419 mdmn_do_meta_db_delside(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
1420 {
1421 md_mn_msg_meta_db_delside_t *d;
1422 mddb_config_t c;
1423 int ret = 0;
1424 mdsetname_t *sp;
1425 md_error_t ep = mdnullerror;
1426
1427 resp->mmr_out_size = 0;
1428 resp->mmr_err_size = 0;
1429 resp->mmr_out = NULL;
1430 resp->mmr_err = NULL;
1431 resp->mmr_comm_state = MDMNE_ACK;
1432 d = (md_mn_msg_meta_db_delside_t *)((void *)(msg->msg_event_data));
1433
1434 (void) memset(&c, 0, sizeof (c));
1435 c.c_setno = msg->msg_setno;
1436 c.c_locator.l_dev = meta_cmpldev(d->msg_l_dev);
1437 c.c_locator.l_blkno = d->msg_blkno;
1438 c.c_multi_node = 1;
1439 if ((sp = metasetnosetname(c.c_setno, &ep)) == NULL) {
1440 (void) mdstealerror(&(resp->mmr_ep), &ep);
1441 resp->mmr_exitval = -1;
1442 return;
1443 }
1444 (void) strcpy(c.c_setname, sp->setname);
1445 c.c_sideno = d->msg_sideno;
1446
1447 if ((ret = metaioctl(MD_DB_DELSIDE, &c, &c.c_mde, NULL)) != 0) {
1448 (void) mdstealerror(&(resp->mmr_ep), &c.c_mde);
1449 }
1450 resp->mmr_exitval = ret;
1451 }
1452
1453 /*
1454 * Handler for MD_MN_MSG_META_MD_ADDSIDE which is used to add the
1455 * side information for each diskset metadevice component (if that
1456 * component is a disk) when a host has been added to the diskset.
1457 * The side information is the /dev/dsk/ctds name that the node would
1458 * use to access the metadevice component.
1459 *
1460 * This routine makes changes to the mddb records and must be run
1461 * on all nodes.
1462 *
1463 * Used when running:
1464 * metaset -s set_name -a -h new_hostname
1465 */
1466 /*ARGSUSED*/
1467 void
mdmn_do_meta_md_addside(md_mn_msg_t * msg,uint_t flags,md_mn_result_t * resp)1468 mdmn_do_meta_md_addside(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
1469 {
1470 md_mn_msg_meta_md_addside_t *d;
1471 mdnm_params_t nm;
1472 mdsetname_t *sp;
1473 char *cname, *dname;
1474 minor_t mnum;
1475 int done, i;
1476 md_error_t ep = mdnullerror;
1477
1478 resp->mmr_out_size = 0;
1479 resp->mmr_err_size = 0;
1480 resp->mmr_out = NULL;
1481 resp->mmr_err = NULL;
1482 resp->mmr_comm_state = MDMNE_ACK;
1483 d = (md_mn_msg_meta_md_addside_t *)((void *)(msg->msg_event_data));
1484
1485 (void) memset(&nm, 0, sizeof (nm));
1486 if ((sp = metasetnosetname(msg->msg_setno, &ep)) == NULL) {
1487 (void) mdstealerror(&(resp->mmr_ep), &ep);
1488 resp->mmr_exitval = -1;
1489 return;
1490 }
1491 /* While loop continues until IOCNXTKEY_NM gives nm.key of KEYWILD */
1492 /*CONSTCOND*/
1493 while (1) {
1494 char *drvnm = NULL;
1495
1496 nm.mde = mdnullerror;
1497 nm.setno = msg->msg_setno;
1498 nm.side = d->msg_otherside;
1499 if (metaioctl(MD_IOCNXTKEY_NM, &nm, &nm.mde, NULL) != 0) {
1500 (void) mdstealerror(&(resp->mmr_ep), &nm.mde);
1501 resp->mmr_exitval = -1;
1502 return;
1503 }
1504
1505 /* Normal exit path is to eventually get a KEYWILD */
1506 if (nm.key == MD_KEYWILD) {
1507 resp->mmr_exitval = 0;
1508 return;
1509 }
1510
1511 /*
1512 * Okay we have a valid key
1513 * Let's see if it is hsp or not
1514 */
1515 nm.devname = (uintptr_t)meta_getnmentbykey(msg->msg_setno,
1516 d->msg_otherside, nm.key, &drvnm, NULL, NULL, &ep);
1517 if (nm.devname == NULL || drvnm == NULL) {
1518 if (nm.devname)
1519 Free((void *)(uintptr_t)nm.devname);
1520 if (drvnm)
1521 Free((void *)(uintptr_t)drvnm);
1522 (void) mdstealerror(&(resp->mmr_ep), &ep);
1523 resp->mmr_exitval = -1;
1524 return;
1525 }
1526
1527 /*
1528 * If it is hsp add here
1529 */
1530 if (strcmp(drvnm, MD_HOTSPARES) == 0) {
1531 if (add_name(sp, d->msg_sideno, nm.key, MD_HOTSPARES,
1532 minor(NODEV), (char *)(uintptr_t)nm.devname,
1533 NULL, NULL, &ep) == -1) {
1534 Free((void *)(uintptr_t)nm.devname);
1535 Free((void *)(uintptr_t)drvnm);
1536 (void) mdstealerror(&(resp->mmr_ep), &ep);
1537 resp->mmr_exitval = -1;
1538 return;
1539 } else {
1540 Free((void *)(uintptr_t)nm.devname);
1541 Free((void *)(uintptr_t)drvnm);
1542 continue;
1543 }
1544 }
1545
1546 nm.side = d->msg_sideno;
1547 if ((done = meta_getside_devinfo(sp,
1548 (char *)(uintptr_t)nm.devname,
1549 d->msg_sideno, &cname, &dname, &mnum, &ep)) == -1) {
1550 (void) mdstealerror(&(resp->mmr_ep), &ep);
1551 Free((void *)(uintptr_t)nm.devname);
1552 resp->mmr_exitval = -1;
1553 return;
1554 }
1555
1556 Free((void *)(uintptr_t)nm.devname);
1557 Free((void *)(uintptr_t)drvnm);
1558
1559 if (done != 1) {
1560 Free(cname);
1561 Free(dname);
1562 resp->mmr_exitval = -1;
1563 return;
1564 }
1565
1566 /*
1567 * The device reference count can be greater than 1 if
1568 * more than one softpart is configured on top of the
1569 * same device. If this is the case then we want to
1570 * increment the count to sync up with the other sides.
1571 */
1572 for (i = 0; i < nm.ref_count; i++) {
1573 if (add_name(sp, d->msg_sideno, nm.key, dname, mnum,
1574 cname, NULL, NULL, &ep) == -1) {
1575 (void) mdstealerror(&(resp->mmr_ep), &ep);
1576 Free(cname);
1577 Free(dname);
1578 resp->mmr_exitval = -1;
1579 return;
1580 }
1581 }
1582 Free(cname);
1583 Free(dname);
1584 }
1585
1586 /*NOTREACHED*/
1587 }
1588 /*
1589 * Handler for MD_MN_MSG_META_MD_DELSIDE which is used to delete the
1590 * side information for each diskset metadevice component (if that
1591 * component is a disk) when a host has been removed from the diskset.
1592 * The side information is the /dev/dsk/ctds name that the node would
1593 * use to access the metadevice component.
1594 *
1595 * This routine makes changes to the mddb records and must be run
1596 * on all nodes.
1597 *
1598 * Used when running:
1599 * metaset -s set_name -d -h hostname
1600 */
1601 /*ARGSUSED*/
1602 void
mdmn_do_meta_md_delside(md_mn_msg_t * msg,uint_t flags,md_mn_result_t * resp)1603 mdmn_do_meta_md_delside(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
1604 {
1605 md_mn_msg_meta_md_delside_t *d;
1606 mdnm_params_t nm;
1607 mdsetname_t *sp;
1608 md_error_t ep = mdnullerror;
1609 int i;
1610
1611 resp->mmr_out_size = 0;
1612 resp->mmr_err_size = 0;
1613 resp->mmr_out = NULL;
1614 resp->mmr_err = NULL;
1615 resp->mmr_comm_state = MDMNE_ACK;
1616 d = (md_mn_msg_meta_md_delside_t *)((void *)(msg->msg_event_data));
1617
1618 if ((sp = metasetnosetname(msg->msg_setno, &ep)) == NULL) {
1619 (void) mdstealerror(&(resp->mmr_ep), &ep);
1620 resp->mmr_exitval = -1;
1621 return;
1622 }
1623
1624 (void) memset(&nm, 0, sizeof (nm));
1625 nm.key = MD_KEYWILD;
1626 /*CONSTCOND*/
1627 while (1) {
1628 nm.mde = mdnullerror;
1629 nm.setno = msg->msg_setno;
1630 nm.side = MD_SIDEWILD;
1631 if (metaioctl(MD_IOCNXTKEY_NM, &nm, &nm.mde, NULL) != 0) {
1632 (void) mdstealerror(&(resp->mmr_ep), &nm.mde);
1633 resp->mmr_exitval = -1;
1634 return;
1635 }
1636
1637 /* Normal exit path is to eventually get a KEYWILD */
1638 if (nm.key == MD_KEYWILD) {
1639 resp->mmr_exitval = 0;
1640 return;
1641 }
1642
1643 /*
1644 * The device reference count can be greater than 1 if
1645 * more than one softpart is configured on top of the
1646 * same device. If this is the case then we want to
1647 * decrement the count to zero so the entry can be
1648 * actually removed.
1649 */
1650 for (i = 0; i < nm.ref_count; i++) {
1651 if (del_name(sp, d->msg_sideno, nm.key, &ep) == -1) {
1652 (void) mdstealerror(&(resp->mmr_ep), &ep);
1653 resp->mmr_exitval = -1;
1654 return;
1655 }
1656 }
1657 }
1658
1659 /*NOTREACHED*/
1660 }
1661
1662 /*
1663 * Handler for MD_MN_MSG_MDDB_OPTRECERR which is used to notify
1664 * the master node that a node has seen an error when attempting to
1665 * write to the optimized resync records that reside on 2 of the diskset
1666 * mddbs. Master node will mark the failed replica in error and this
1667 * will send a parse message to all nodes to re-read parts of the mddb
1668 * and to fix their optimized resync records based on this information.
1669 */
1670 /*ARGSUSED*/
1671 void
mdmn_do_mddb_optrecerr(md_mn_msg_t * msg,uint_t flags,md_mn_result_t * resp)1672 mdmn_do_mddb_optrecerr(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
1673 {
1674 md_mn_msg_mddb_optrecerr_t *d;
1675 mddb_optrec_parm_t mop;
1676 int ret;
1677 int i;
1678
1679 resp->mmr_out_size = 0;
1680 resp->mmr_err_size = 0;
1681 resp->mmr_out = NULL;
1682 resp->mmr_err = NULL;
1683 resp->mmr_comm_state = MDMNE_ACK;
1684 d = (md_mn_msg_mddb_optrecerr_t *)((void *)(msg->msg_event_data));
1685
1686 (void) memset(&mop, 0, sizeof (mop));
1687 mop.c_setno = msg->msg_setno;
1688 for (i = 0; i < 2; i++) {
1689 mop.c_recerr[i] = d->msg_recerr[i];
1690 }
1691 ret = metaioctl(MD_MN_MDDB_OPTRECFIX, &mop, &mop.c_mde, NULL);
1692 if (ret)
1693 (void) mdstealerror(&(resp->mmr_ep), &mop.c_mde);
1694
1695 resp->mmr_exitval = ret;
1696 }
1697
1698 int
mdmn_smgen_test6(md_mn_msg_t * msg,md_mn_msg_t ** msglist)1699 mdmn_smgen_test6(md_mn_msg_t *msg, md_mn_msg_t **msglist)
1700 {
1701 md_mn_msg_t *nmsg;
1702
1703 nmsg = Zalloc(sizeof (md_mn_msg_t));
1704 MSGID_COPY(&(msg->msg_msgid), &(nmsg->msg_msgid));
1705
1706 nmsg->msg_flags = MD_MSGF_NO_LOG; /* Don't log submessages */
1707 nmsg->msg_setno = msg->msg_setno;
1708 nmsg->msg_type = MD_MN_MSG_TEST2;
1709 nmsg->msg_event_size = sizeof ("test2");
1710 nmsg->msg_event_data = Strdup("test2");
1711 msglist[0] = nmsg;
1712
1713 nmsg = Zalloc(sizeof (md_mn_msg_t));
1714 MSGID_COPY(&(msg->msg_msgid), &(nmsg->msg_msgid));
1715
1716 nmsg->msg_flags = MD_MSGF_NO_LOG; /* Don't log submessages */
1717 nmsg->msg_setno = msg->msg_setno;
1718 nmsg->msg_type = MD_MN_MSG_TEST2;
1719 nmsg->msg_event_size = sizeof ("test2");
1720 nmsg->msg_event_data = Strdup("test2");
1721 msglist[1] = nmsg;
1722
1723 nmsg = Zalloc(sizeof (md_mn_msg_t));
1724 MSGID_COPY(&(msg->msg_msgid), &(nmsg->msg_msgid));
1725
1726 nmsg->msg_flags = MD_MSGF_NO_LOG; /* Don't log submessages */
1727 nmsg->msg_setno = msg->msg_setno;
1728 nmsg->msg_type = MD_MN_MSG_TEST3;
1729 nmsg->msg_event_size = sizeof ("test3");
1730 nmsg->msg_event_data = Strdup("test3");
1731 msglist[2] = nmsg;
1732
1733 nmsg = Zalloc(sizeof (md_mn_msg_t));
1734 MSGID_COPY(&(msg->msg_msgid), &(nmsg->msg_msgid));
1735
1736 nmsg->msg_flags = MD_MSGF_NO_LOG; /* Don't log submessages */
1737 nmsg->msg_setno = msg->msg_setno;
1738 nmsg->msg_type = MD_MN_MSG_TEST4;
1739 nmsg->msg_event_size = sizeof ("test4");
1740 nmsg->msg_event_data = Strdup("test4");
1741 msglist[3] = nmsg;
1742
1743 return (4); /* Return the number of submessages generated */
1744 }
1745
1746 /*
1747 * This is to send an MD_IOCSET ioctl to all nodes to create a soft
1748 * partition.
1749 */
1750 /*ARGSUSED*/
1751 void
mdmn_do_iocset(md_mn_msg_t * msg,uint_t flags,md_mn_result_t * resp)1752 mdmn_do_iocset(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
1753 {
1754 md_mn_msg_iocset_t *d;
1755 int ret;
1756 set_t setno;
1757 mdsetname_t *sp;
1758 mdname_t *np;
1759 md_error_t mde = mdnullerror;
1760
1761 resp->mmr_comm_state = MDMNE_ACK; /* Ok state */;
1762 resp->mmr_out_size = 0;
1763 resp->mmr_err_size = 0;
1764 resp->mmr_out = NULL;
1765 resp->mmr_err = NULL;
1766 d = (md_mn_msg_iocset_t *)(void *)msg->msg_event_data;
1767
1768 setno = MD_MIN2SET(d->iocset_params.mnum);
1769 if ((sp = metasetnosetname(setno, &mde)) == NULL) {
1770 syslog(LOG_ERR, dgettext(TEXT_DOMAIN,
1771 "MD_MN_MSG_IOCSET: Invalid setno %d\n"), setno);
1772 resp->mmr_exitval = 1;
1773 return;
1774 }
1775
1776 /*
1777 * Device should be in the namespace already
1778 */
1779 if ((np = metamnumname(&sp, d->iocset_params.mnum, 1, &mde)) == NULL) {
1780 syslog(LOG_ERR, dgettext(TEXT_DOMAIN,
1781 "MD_MN_MSG_IOCSET: Invalid mnum %d\n"),
1782 d->iocset_params.mnum);
1783 resp->mmr_exitval = 1;
1784 return;
1785 }
1786
1787 /*
1788 * Create unit structure
1789 */
1790 d->iocset_params.mdp = (uintptr_t)&d->unit; /* set pointer to unit */
1791 ret = metaioctl(MD_IOCSET, &(d->iocset_params), &mde, np->cname);
1792 resp->mmr_exitval = ret;
1793 }
1794
1795 /*
1796 * This is to update the status of a softpart
1797 */
1798 /*ARGSUSED*/
1799 void
mdmn_do_sp_setstat(md_mn_msg_t * msg,uint_t flags,md_mn_result_t * resp)1800 mdmn_do_sp_setstat(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
1801 {
1802 md_mn_msg_sp_setstat_t *d;
1803 int ret;
1804 set_t setno;
1805 mdsetname_t *sp;
1806 minor_t mnum;
1807 md_error_t mde = mdnullerror;
1808
1809 resp->mmr_comm_state = MDMNE_ACK; /* Ok state */;
1810 resp->mmr_out_size = 0;
1811 resp->mmr_err_size = 0;
1812 resp->mmr_out = NULL;
1813 resp->mmr_err = NULL;
1814 d = (md_mn_msg_sp_setstat_t *)(void *)msg->msg_event_data;
1815
1816 mnum = d->sp_setstat_mnum;
1817 setno = MD_MIN2SET(mnum);
1818 if ((sp = metasetnosetname(setno, &mde)) == NULL) {
1819 syslog(LOG_ERR, dgettext(TEXT_DOMAIN,
1820 "MD_MN_MSG_IOCSET: Invalid setno %d\n"), setno);
1821 resp->mmr_exitval = 1;
1822 return;
1823 }
1824
1825 ret = meta_sp_setstatus(sp, &mnum, 1, d->sp_setstat_status, &mde);
1826 resp->mmr_exitval = ret;
1827 }
1828
1829 /*
1830 * This is to add a key to the namespace
1831 */
1832 /*ARGSUSED*/
1833 void
mdmn_do_addkeyname(md_mn_msg_t * msg,uint_t flags,md_mn_result_t * resp)1834 mdmn_do_addkeyname(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
1835 {
1836 md_mn_msg_addkeyname_t *d;
1837 int ret;
1838 set_t setno;
1839 mdsetname_t *sp;
1840 md_error_t mde = mdnullerror;
1841 mdname_t *compnp;
1842
1843 resp->mmr_comm_state = MDMNE_ACK; /* Ok state */;
1844 resp->mmr_out_size = 0;
1845 resp->mmr_err_size = 0;
1846 resp->mmr_out = NULL;
1847 resp->mmr_err = NULL;
1848 d = (md_mn_msg_addkeyname_t *)(void *)msg->msg_event_data;
1849
1850 setno = d->addkeyname_setno;
1851 if ((sp = metasetnosetname(setno, &mde)) == NULL) {
1852 syslog(LOG_ERR, dgettext(TEXT_DOMAIN,
1853 "MD_MN_ADDKEYNAME: Invalid setno %d\n"), setno);
1854 resp->mmr_exitval = -1;
1855 return;
1856 }
1857
1858 compnp = metaname(&sp, d->addkeyname_name, UNKNOWN, &mde);
1859 if (compnp != NULL) {
1860 ret = add_key_name(sp, compnp, NULL, &mde);
1861 if (ret < 0)
1862 resp->mmr_exitval = -1;
1863 else
1864 resp->mmr_exitval = compnp->key;
1865 } else {
1866 resp->mmr_exitval = -1;
1867 }
1868 }
1869
1870 /*
1871 * This is to delete a key from the namespace
1872 */
1873 /*ARGSUSED*/
1874 void
mdmn_do_delkeyname(md_mn_msg_t * msg,uint_t flags,md_mn_result_t * resp)1875 mdmn_do_delkeyname(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
1876 {
1877 md_mn_msg_delkeyname_t *d;
1878 int ret;
1879 set_t setno;
1880 mdsetname_t *sp;
1881 md_error_t mde = mdnullerror;
1882 mdname_t *compnp;
1883
1884 resp->mmr_comm_state = MDMNE_ACK; /* Ok state */;
1885 resp->mmr_out_size = 0;
1886 resp->mmr_err_size = 0;
1887 resp->mmr_out = NULL;
1888 resp->mmr_err = NULL;
1889 d = (md_mn_msg_delkeyname_t *)(void *)msg->msg_event_data;
1890
1891 setno = d->delkeyname_setno;
1892 if ((sp = metasetnosetname(setno, &mde)) == NULL) {
1893 syslog(LOG_ERR, dgettext(TEXT_DOMAIN,
1894 "MD_MN_DELKEYNAME: Invalid setno %d\n"), setno);
1895 resp->mmr_exitval = -1;
1896 return;
1897 }
1898
1899 compnp = metadevname(&sp, d->delkeyname_dev, &mde);
1900 if (compnp != NULL) {
1901 /*
1902 * Reset the key value for the name. This is required because
1903 * any previous call of del_key_name for the same component
1904 * will have resulted in the key value being reset to MD_KEYBAD
1905 * even though there may still be references to this component.
1906 */
1907 compnp->key = d->delkeyname_key;
1908 ret = del_key_name(sp, compnp, &mde);
1909 resp->mmr_exitval = ret;
1910 } else {
1911 resp->mmr_exitval = -1;
1912 }
1913 }
1914
1915 /*
1916 * This is to get the value of tstate from the master node. We use this
1917 * to get the ABR state of a metadevice from the master.
1918 */
1919 /*ARGSUSED*/
1920 void
mdmn_do_get_tstate(md_mn_msg_t * msg,uint_t flags,md_mn_result_t * resp)1921 mdmn_do_get_tstate(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
1922 {
1923 md_mn_msg_gettstate_t *d;
1924 int ret;
1925 uint_t tstate;
1926 md_error_t mde = mdnullerror;
1927
1928 resp->mmr_comm_state = MDMNE_ACK; /* Ok state */;
1929 resp->mmr_out_size = 0;
1930 resp->mmr_err_size = 0;
1931 resp->mmr_out = NULL;
1932 resp->mmr_err = NULL;
1933 d = (md_mn_msg_gettstate_t *)(void *)msg->msg_event_data;
1934
1935 ret = meta_get_tstate(d->gettstate_dev, &tstate, &mde);
1936 if (ret != 0) {
1937 syslog(LOG_ERR, dgettext(TEXT_DOMAIN,
1938 "MD_MN_GET_TSTATE: Invalid dev %llx\n"), d->gettstate_dev);
1939 tstate = 0;
1940 }
1941 resp->mmr_exitval = tstate;
1942 }
1943
1944 /*
1945 * This is to get the mirror ABR state and the state of its submirrors from
1946 * the master node. We need this to ensure consistent output from metastat
1947 * when a new node joins the cluster during a resync. Without this the
1948 * submirror status will be incorrect until the whole resync is complete which
1949 * may take days for very large metadevices.
1950 */
1951 /*ARGSUSED*/
1952 void
mdmn_do_get_mirstate(md_mn_msg_t * msg,uint_t flags,md_mn_result_t * resp)1953 mdmn_do_get_mirstate(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
1954 {
1955 md_mn_msg_mir_state_t *d;
1956 md_mn_msg_mir_state_res_t *res; /* Results */
1957 set_t setno;
1958 mdsetname_t *sp; /* Set name */
1959 mdname_t *mirnp; /* Mirror name */
1960 md_error_t mde = mdnullerror;
1961 mm_unit_t *mm; /* Mirror */
1962 int smi;
1963 uint_t tstate;
1964
1965 resp->mmr_comm_state = MDMNE_ACK;
1966 resp->mmr_out_size = sizeof (md_mn_msg_mir_state_res_t);
1967 resp->mmr_err_size = 0;
1968 resp->mmr_out = Malloc(resp->mmr_out_size);
1969 resp->mmr_err = NULL;
1970 d = (md_mn_msg_mir_state_t *)(void *)msg->msg_event_data;
1971 res = (md_mn_msg_mir_state_res_t *)(void *)resp->mmr_out;
1972
1973 /* Validate set information from minor number */
1974 setno = MD_MIN2SET(d->mir_state_mnum);
1975 sp = metasetnosetname(setno, &mde);
1976 if (sp == NULL) {
1977 syslog(LOG_ERR, dgettext(TEXT_DOMAIN,
1978 "MD_MN_GET_MIRROR_STATE: Invalid set %d\n"), setno);
1979 resp->mmr_exitval = 1; /* Failure */
1980 Free(resp->mmr_out);
1981 resp->mmr_out_size = 0;
1982 return;
1983 }
1984
1985 /* Construct mirror name from minor number */
1986 mirnp = metamnumname(&sp, d->mir_state_mnum, 0, &mde);
1987 if (mirnp == NULL) {
1988 syslog(LOG_ERR, dgettext(TEXT_DOMAIN,
1989 "MD_MN_GET_MIRROR_STATE: Invalid minor %lx\n"),
1990 d->mir_state_mnum);
1991 resp->mmr_exitval = 2; /* Failure */
1992 Free(resp->mmr_out);
1993 resp->mmr_out_size = 0;
1994 return;
1995 }
1996
1997 /* Get common mirror structure */
1998 mm = (mm_unit_t *)meta_get_mdunit(sp, mirnp, &mde);
1999 if (mm == NULL) {
2000 syslog(LOG_ERR, dgettext(TEXT_DOMAIN,
2001 "MD_MN_GET_MIRROR_STATE: Invalid mirror minor %x\n"),
2002 d->mir_state_mnum);
2003 resp->mmr_exitval = 3; /* Failure */
2004 Free(resp->mmr_out);
2005 resp->mmr_out_size = 0;
2006 return;
2007 }
2008
2009 if (meta_get_tstate(d->mir_state_mnum, &tstate, &mde) != 0) {
2010 syslog(LOG_ERR, dgettext(TEXT_DOMAIN,
2011 "MD_MN_GET_MIRROR_STATE: Invalid minor %lx\n"),
2012 d->mir_state_mnum);
2013 resp->mmr_exitval = 4; /* Failure */
2014 Free(resp->mmr_out);
2015 resp->mmr_out_size = 0;
2016 return;
2017 }
2018 /*
2019 * Fill in the sm_state/sm_flags value in the results structure which
2020 * gets passed back to the message originator
2021 */
2022 resp->mmr_exitval = 0;
2023 for (smi = 0; (smi < NMIRROR); smi++) {
2024 mm_submirror_t *mmsp = &mm->un_sm[smi];
2025 res->sm_state[smi] = mmsp->sm_state;
2026 res->sm_flags[smi] = mmsp->sm_flags;
2027 }
2028 /* Returm value of tstate for mirror */
2029 res->mir_tstate = tstate;
2030 }
2031
2032 /*
2033 * This is to issue an ioctl to call poke_hotspares
2034 */
2035 /*ARGSUSED*/
2036 void
mdmn_do_poke_hotspares(md_mn_msg_t * msg,uint_t flags,md_mn_result_t * resp)2037 mdmn_do_poke_hotspares(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
2038 {
2039
2040 md_mn_poke_hotspares_t pokehsp;
2041 md_mn_msg_pokehsp_t *d;
2042
2043 resp->mmr_out_size = 0;
2044 resp->mmr_err_size = 0;
2045 resp->mmr_out = NULL;
2046 resp->mmr_err = NULL;
2047 resp->mmr_comm_state = MDMNE_ACK;
2048 d = (md_mn_msg_pokehsp_t *)(void *)msg->msg_event_data;
2049
2050 (void) memset(&pokehsp, 0, sizeof (pokehsp));
2051 MD_SETDRIVERNAME(&pokehsp, MD_MIRROR, d->pokehsp_setno);
2052
2053 resp->mmr_exitval = metaioctl(MD_MN_POKE_HOTSPARES, &pokehsp,
2054 &pokehsp.mde, NULL);
2055 }
2056
2057 /*
2058 * Called to create a softpart during a metarecover operation
2059 */
2060 /*ARGSUSED*/
2061 void
mdmn_do_addmdname(md_mn_msg_t * msg,uint_t flags,md_mn_result_t * resp)2062 mdmn_do_addmdname(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
2063 {
2064 md_mn_msg_addmdname_t *d;
2065 md_error_t mde = mdnullerror;
2066 mdsetname_t *sp;
2067 int init = 0;
2068 mdkey_t key;
2069 minor_t mnum;
2070
2071 resp->mmr_comm_state = MDMNE_ACK; /* Ok state */;
2072 resp->mmr_out_size = 0;
2073 resp->mmr_err_size = 0;
2074 resp->mmr_out = NULL;
2075 resp->mmr_err = NULL;
2076 d = (md_mn_msg_addmdname_t *)(void *)msg->msg_event_data;
2077
2078 if ((sp = metasetnosetname(d->addmdname_setno, &mde)) == NULL) {
2079 syslog(LOG_ERR, dgettext(TEXT_DOMAIN,
2080 "MD_MN_MSG_ADDMDNAME: Invalid setno %d\n"),
2081 d->addmdname_setno);
2082 resp->mmr_exitval = 1;
2083 return;
2084 }
2085
2086 /*
2087 * If device node does not exist then init it
2088 */
2089 if (!is_existing_meta_hsp(sp, d->addmdname_name)) {
2090 if ((key = meta_init_make_device(&sp, d->addmdname_name,
2091 &mde)) <= 0) {
2092 syslog(LOG_ERR, dgettext(TEXT_DOMAIN,
2093 "MD_MN_MSG_ADDMDNAME: Invalid name %s\n"),
2094 d->addmdname_name);
2095 resp->mmr_exitval = 1;
2096 return;
2097 }
2098
2099 init = 1;
2100 }
2101
2102 /*
2103 * We should have it
2104 */
2105 if (metaname(&sp, d->addmdname_name, META_DEVICE, &mde) == NULL) {
2106
2107 if (init) {
2108 if (meta_getnmentbykey(sp->setno, MD_SIDEWILD,
2109 key, NULL, &mnum, NULL, &mde) != NULL) {
2110 (void) metaioctl(
2111 MD_IOCREM_DEV, &mnum, &mde, NULL);
2112 }
2113 (void) del_self_name(sp, key, &mde);
2114 }
2115
2116 resp->mmr_exitval = 1;
2117 return;
2118 }
2119
2120 resp->mmr_exitval = 0;
2121 }
2122
2123 /*
2124 * This is used to issue a MD_MN_RR_DIRTY ioctl to the mirror.
2125 */
2126 /*ARGSUSED*/
2127 void
mdmn_do_mark_dirty(md_mn_msg_t * msg,uint_t flags,md_mn_result_t * resp)2128 mdmn_do_mark_dirty(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
2129 {
2130 md_mn_msg_rr_dirty_t *d;
2131 md_mn_rr_dirty_params_t rp;
2132 int ret;
2133
2134 resp->mmr_out_size = 0;
2135 resp->mmr_err_size = 0;
2136 resp->mmr_out = NULL;
2137 resp->mmr_err = NULL;
2138 resp->mmr_comm_state = MDMNE_ACK;
2139 d = (md_mn_msg_rr_dirty_t *)((void *)(msg->msg_event_data));
2140
2141 (void) memset(&rp, 0, sizeof (rp));
2142 MD_SETDRIVERNAME(&rp, MD_MIRROR, MD_MIN2SET(d->rr_mnum))
2143 rp.rr_mnum = d->rr_mnum;
2144 rp.rr_nodeid = d->rr_nodeid;
2145 rp.rr_start = (ushort_t)((d->rr_range >> 16) & 0xffff);
2146 rp.rr_end = (ushort_t)(d->rr_range & 0xffff);
2147
2148 ret = metaioctl(MD_MN_RR_DIRTY, &rp, &rp.mde, NULL);
2149
2150 resp->mmr_exitval = ret;
2151 }
2152
2153 /*
2154 * This is used to issue a MD_MN_RR_CLEAN ioctl to the mirror.
2155 */
2156 /*ARGSUSED*/
2157 void
mdmn_do_mark_clean(md_mn_msg_t * msg,uint_t flags,md_mn_result_t * resp)2158 mdmn_do_mark_clean(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
2159 {
2160 md_mn_msg_rr_clean_t *d;
2161 md_mn_rr_clean_params_t *rcp;
2162 int ret;
2163
2164 resp->mmr_out_size = 0;
2165 resp->mmr_err_size = 0;
2166 resp->mmr_out = NULL;
2167 resp->mmr_err = NULL;
2168 resp->mmr_comm_state = MDMNE_ACK;
2169 d = (md_mn_msg_rr_clean_t *)((void *)(msg->msg_event_data));
2170
2171 rcp = Zalloc(sizeof (struct md_mn_rr_clean_params) +
2172 MDMN_MSG_RR_CLEAN_DATA_BYTES(d));
2173 MD_SETDRIVERNAME(rcp, MD_MIRROR, MD_MIN2SET(d->rr_mnum))
2174 rcp->rr_mnum = d->rr_mnum;
2175 rcp->rr_nodeid = d->rr_nodeid;
2176 rcp->rr_start_size = d->rr_start_size;
2177 (void) memcpy(MDMN_RR_CLEAN_PARAMS_DATA(rcp), MDMN_MSG_RR_CLEAN_DATA(d),
2178 MDMN_MSG_RR_CLEAN_DATA_BYTES(d));
2179
2180 ret = metaioctl(MD_MN_RR_CLEAN, rcp, &rcp->mde, NULL);
2181
2182 Free(rcp);
2183
2184 resp->mmr_exitval = ret;
2185 }
2186