1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 */
25
26 #pragma ident "%Z%%M% %I% %E% SMI"
27
28 /*
29 * mirror operations
30 */
31
32 #include <meta.h>
33 #include <sys/lvm/md_mirror.h>
34 #include <thread.h>
35
36 extern int md_in_daemon;
37 extern md_mn_client_list_t *mdmn_clients;
38
39 /*
40 * chain of mirrors
41 */
typedef struct mm_unit_list {
	struct mm_unit_list	*next;	/* next in chain */
	mdname_t	*namep;		/* mirror name */
	mm_pass_num_t	pass;		/* pass number (clamped to MD_PASS_MAX by setup_units) */
	uint_t		done;		/* resync done (or no longer tracked) */
} mm_unit_list_t;
48
49 /*
50 * resync mirror
51 * meta_lock for this set should be held on entry.
52 */
53 int
meta_mirror_resync(mdsetname_t * sp,mdname_t * mirnp,daddr_t size,md_error_t * ep,md_resync_cmd_t cmd)54 meta_mirror_resync(
55 mdsetname_t *sp,
56 mdname_t *mirnp,
57 daddr_t size,
58 md_error_t *ep,
59 md_resync_cmd_t cmd /* Start/Block/Unblock/Kill */
60 )
61 {
62 char *miscname;
63 md_resync_ioctl_t ri;
64
65 /* should have a set */
66 assert(sp != NULL);
67 assert(sp->setno == MD_MIN2SET(meta_getminor(mirnp->dev)));
68
69 /* make sure we have a mirror */
70 if ((miscname = metagetmiscname(mirnp, ep)) == NULL)
71 return (-1);
72 if (strcmp(miscname, MD_MIRROR) != 0) {
73 return (mdmderror(ep, MDE_NOT_MM, meta_getminor(mirnp->dev),
74 mirnp->cname));
75 }
76
77 /* start resync */
78 (void) memset(&ri, 0, sizeof (ri));
79 MD_SETDRIVERNAME(&ri, MD_MIRROR, sp->setno);
80 ri.ri_mnum = meta_getminor(mirnp->dev);
81 ri.ri_copysize = size;
82 switch (cmd) {
83 case MD_RESYNC_FORCE_MNSTART:
84 ri.ri_flags |= MD_RI_RESYNC_FORCE_MNSTART;
85 break;
86 case MD_RESYNC_START:
87 ri.ri_flags = 0;
88 break;
89 case MD_RESYNC_BLOCK:
90 ri.ri_flags = MD_RI_BLOCK;
91 break;
92 case MD_RESYNC_UNBLOCK:
93 ri.ri_flags = MD_RI_UNBLOCK;
94 break;
95 case MD_RESYNC_KILL:
96 ri.ri_flags = MD_RI_KILL;
97 break;
98 case MD_RESYNC_KILL_NO_WAIT:
99 ri.ri_flags = MD_RI_KILL | MD_RI_NO_WAIT;
100 break;
101 default:
102 /* TODO: Add new error MDE_BAD_RESYNC_FLAGS */
103 return (mderror(ep, MDE_BAD_RESYNC_OPT, mirnp->cname));
104 }
105
106 if (metaioctl(MD_IOCSETSYNC, &ri, &ri.mde, mirnp->cname) != 0)
107 return (mdstealerror(ep, &ri.mde));
108
109 /* return success */
110 return (0);
111 }
112
113 /*
114 * free units
115 */
116 static void
free_units(mm_unit_list_t * mirrors[MD_PASS_MAX+1])117 free_units(
118 mm_unit_list_t *mirrors[MD_PASS_MAX + 1]
119 )
120 {
121 uint_t i;
122
123 for (i = 0; (i < (MD_PASS_MAX + 1)); ++i) {
124 mm_unit_list_t *p, *n;
125
126 for (p = mirrors[i], n = NULL; (p != NULL); p = n) {
127 n = p->next;
128 Free(p);
129 }
130 mirrors[i] = NULL;
131 }
132 }
133
134 /*
135 * setup_units: build lists of units for each pass
136 */
137 static int
setup_units(mdsetname_t * sp,mm_unit_list_t * mirrors[MD_PASS_MAX+1],md_error_t * ep)138 setup_units(
139 mdsetname_t *sp,
140 mm_unit_list_t *mirrors[MD_PASS_MAX + 1],
141 md_error_t *ep
142 )
143 {
144 mdnamelist_t *mirrornlp = NULL;
145 mdnamelist_t *p;
146 int rval = 0;
147
148 /* should have a set */
149 assert(sp != NULL);
150
151 /* for each mirror */
152 if (meta_get_mirror_names(sp, &mirrornlp, 0, ep) < 0)
153 return (-1);
154 for (p = mirrornlp; (p != NULL); p = p->next) {
155 md_mirror_t *mirrorp;
156 mm_unit_list_t *lp;
157
158 /* get unit structure */
159 if ((mirrorp = meta_get_mirror(sp, p->namep, ep)) == NULL) {
160 rval = -1; /* record, but ignore errors */
161 continue;
162 }
163
164 /* save info */
165 lp = Zalloc(sizeof (*lp));
166 lp->namep = p->namep;
167 lp->pass = mirrorp->pass_num;
168 if ((lp->pass < 0) || (lp->pass > MD_PASS_MAX))
169 lp->pass = MD_PASS_MAX;
170
171 /* put on list */
172 lp->next = mirrors[lp->pass];
173 mirrors[lp->pass] = lp;
174 }
175
176 /* cleanup, return error */
177 metafreenamelist(mirrornlp);
178 return (rval);
179 }
180
181 /*
182 * resync all mirrors (in background)
183 */
int
meta_mirror_resync_all(
	mdsetname_t	*sp,	/* set whose mirrors are resynced */
	daddr_t		size,	/* resync copy size, passed to MD_IOCSETSYNC */
	md_error_t	*ep	/* error return structure */
)
{
	mm_unit_list_t	*mirrors[MD_PASS_MAX + 1];
	mm_pass_num_t	pass, max_pass;
	int		rval = 0, fval;

	/* should have a set */
	assert(sp != NULL);

	/* get mirrors, bucketed by pass number */
	(void) memset(mirrors, 0, sizeof (mirrors));
	if (setup_units(sp, mirrors, ep) != 0)
		return (-1);

	/* fork a process so the resyncs run in the background */
	if ((fval = md_daemonize(sp, ep)) != 0) {
		/*
		 * md_daemonize will fork off a process.  This is the
		 * parent or an error.
		 */
		if (fval > 0) {
			/* parent: child carries on with the resyncs */
			free_units(mirrors);
			return (0);
		}
		/* fork failed: continue in the foreground */
		mdclrerror(ep);
	}
	/*
	 * Closing stdin/out/err here.
	 * In case this was called thru rsh, the calling process on the other
	 * side will know, it doesn't have to wait until all the resyncs have
	 * finished.
	 * Also initialise the rpc client pool so that this process will use
	 * a unique pool of clients. If we don't do this, all of the forked
	 * clients will end up using the same pool of clients which can result
	 * in hung clients.
	 */
	if (meta_is_mn_set(sp, ep)) {
		(void) close(0);
		(void) close(1);
		(void) close(2);
		mdmn_clients = NULL;
	}
	/* at this point we are either the child (0) or the fork failed (-1) */
	assert((fval == 0) || (fval == -1));

	/*
	 * Determine which pass level is the highest that contains mirrors to
	 * resync. We only need to wait for completion of earlier levels below
	 * this high watermark. If all mirrors are at the same pass level
	 * there is no requirement to wait for completion.
	 */

	max_pass = 1;
	for (pass = MD_PASS_MAX; pass > 1; --pass) {
		if (mirrors[pass] != NULL) {
			max_pass = pass;
			break;
		}
	}

	/*
	 * max_pass now contains the highest pass-level with resyncable mirrors
	 */

	/* do passes */
	for (pass = 1; (pass <= MD_PASS_MAX); ++pass) {
		int		dispatched = 0;
		unsigned	howlong = 1;
		mm_unit_list_t	*lp;

		/* skip empty passes */
		if (mirrors[pass] == NULL)
			continue;

		/* dispatch all resyncs in pass */
		for (lp = mirrors[pass]; (lp != NULL); lp = lp->next) {
			if (meta_is_mn_set(sp, ep)) {
				/* multi-node: ask the master to start it */
				if (meta_mn_send_setsync(sp, lp->namep,
				    size, ep) != 0) {
					rval = -1;	/* record, continue */
					lp->done = 1;
				} else {
					++dispatched;
				}
			} else {
				/* single-node: start it directly */
				if (meta_mirror_resync(sp, lp->namep, size, ep,
				    MD_RESYNC_START) != 0) {
					rval = -1;	/* record, continue */
					lp->done = 1;
				} else {
					++dispatched;
				}
			}
		}

		/*
		 * Wait for them to finish iff we are at a level lower than
		 * max_pass. This orders the resyncs into distinct levels.
		 * I.e. level 2 resyncs won't start until all level 1 ones
		 * have completed.
		 */
		if (pass == max_pass)
			continue;

		/* poll the dispatched resyncs with a growing backoff */
		howlong = 1;
		while (dispatched > 0) {

			/* wait a while */
			(void) sleep(howlong);

			/* see if any finished */
			for (lp = mirrors[pass]; lp != NULL; lp = lp->next) {
				md_resync_ioctl_t ri;

				if (lp->done)
					continue;

				(void) memset(&ri, '\0', sizeof (ri));
				ri.ri_mnum = meta_getminor(lp->namep->dev);
				MD_SETDRIVERNAME(&ri, MD_MIRROR, sp->setno);
				if (metaioctl(MD_IOCGETSYNC, &ri, &ri.mde,
				    lp->namep->cname) != 0) {
					/* query failed: stop tracking it */
					(void) mdstealerror(ep, &ri.mde);
					rval = -1;
					lp->done = 1;
					--dispatched;
				} else if (! (ri.ri_flags & MD_RI_INPROGRESS)) {
					/* resync completed */
					lp->done = 1;
					--dispatched;
				}
			}

			/* wait a little longer next time (capped at 10s) */
			if (howlong < 10)
				++howlong;
		}
	}

	/* cleanup, return success */
	free_units(mirrors);
	if (fval == 0)	/* we are the child process so exit */
		exit(0);
	return (rval);
}
332
333 /*
334 * meta_mn_mirror_resync_all:
335 * -------------------------
336 * Resync all mirrors associated with given set (arg). Called when master
337 * node is adding a node to a diskset. Only want to initiate the resync on
338 * the current node.
339 */
340 void *
meta_mn_mirror_resync_all(void * arg)341 meta_mn_mirror_resync_all(void *arg)
342 {
343 set_t setno = *((set_t *)arg);
344 mdsetname_t *sp;
345 mm_unit_list_t *mirrors[MD_PASS_MAX + 1];
346 mm_pass_num_t pass, max_pass;
347 md_error_t mde = mdnullerror;
348 int fval;
349
350
351 /* should have a set */
352 assert(setno != NULL);
353
354 if ((sp = metasetnosetname(setno, &mde)) == NULL) {
355 mde_perror(&mde, "");
356 return (NULL);
357 }
358
359 if (!(meta_is_mn_set(sp, &mde))) {
360 mde_perror(&mde, "");
361 return (NULL);
362 }
363
364 /* fork a process */
365 if ((fval = md_daemonize(sp, &mde)) != 0) {
366 /*
367 * md_daemonize will fork off a process. The is the
368 * parent or error.
369 */
370 if (fval > 0) {
371 return (NULL);
372 }
373 mde_perror(&mde, "");
374 return (NULL);
375 }
376 /*
377 * Child process should never return back to rpc.metad, but
378 * should exit.
379 * Flush all internally cached data inherited from parent process
380 * since cached data will be cleared when parent process RPC request
381 * has completed (which is possibly before this child process
382 * can complete).
383 * Child process can retrieve and cache its own copy of data from
384 * rpc.metad that won't be changed by the parent process.
385 *
386 * Reset md_in_daemon since this child will be a client of rpc.metad
387 * not part of the rpc.metad daemon itself.
388 * md_in_daemon is used by rpc.metad so that libmeta can tell if
389 * this thread is rpc.metad or any other thread. (If this thread
390 * was rpc.metad it could use some short circuit code to get data
391 * directly from rpc.metad instead of doing an RPC call to rpc.metad).
392 */
393 md_in_daemon = 0;
394 metaflushsetname(sp);
395 sr_cache_flush_setno(setno);
396 if ((sp = metasetnosetname(setno, &mde)) == NULL) {
397 mde_perror(&mde, "");
398 md_exit(sp, 1);
399 }
400
401 if (meta_lock(sp, TRUE, &mde) != 0) {
402 mde_perror(&mde, "");
403 md_exit(sp, 1);
404 }
405
406 /*
407 * Closing stdin/out/err here.
408 */
409 (void) close(0);
410 (void) close(1);
411 (void) close(2);
412 assert(fval == 0);
413
414 /* get mirrors */
415 (void) memset(mirrors, 0, sizeof (mirrors));
416 if (setup_units(sp, mirrors, &mde) != 0) {
417 (void) meta_unlock(sp, &mde);
418 md_exit(sp, 1);
419 }
420
421 /*
422 * Determine which pass level is the highest that contains mirrors to
423 * resync. We only need to wait for completion of earlier levels below
424 * this high watermark. If all mirrors are at the same pass level
425 * there is no requirement to wait for completion.
426 */
427 max_pass = 1;
428 for (pass = MD_PASS_MAX; pass > 1; --pass) {
429 if (mirrors[pass] != NULL) {
430 max_pass = pass;
431 break;
432 }
433 }
434
435 /*
436 * max_pass now contains the highest pass-level with resyncable mirrors
437 */
438 /* do passes */
439 for (pass = 1; (pass <= MD_PASS_MAX); ++pass) {
440 int dispatched = 0;
441 unsigned howlong = 1;
442 mm_unit_list_t *lp;
443
444 /* skip empty passes */
445 if (mirrors[pass] == NULL)
446 continue;
447
448 /* dispatch all resyncs in pass */
449 for (lp = mirrors[pass]; (lp != NULL); lp = lp->next) {
450 if (meta_mirror_resync(sp, lp->namep, 0, &mde,
451 MD_RESYNC_FORCE_MNSTART) != 0) {
452 mdclrerror(&mde);
453 lp->done = 1;
454 } else {
455 ++dispatched;
456 }
457 }
458
459 /*
460 * Wait for them to finish iff we are at a level lower than
461 * max_pass. This orders the resyncs into distinct levels.
462 * I.e. level 2 resyncs won't start until all level 1 ones
463 * have completed.
464 */
465 if (pass == max_pass)
466 continue;
467
468 howlong = 1;
469 while (dispatched > 0) {
470
471 /* wait a while */
472 (void) sleep(howlong);
473
474 /* see if any finished */
475 for (lp = mirrors[pass]; lp != NULL; lp = lp->next) {
476 md_resync_ioctl_t ri;
477
478 if (lp->done)
479 continue;
480
481 (void) memset(&ri, '\0', sizeof (ri));
482 ri.ri_mnum = meta_getminor(lp->namep->dev);
483 MD_SETDRIVERNAME(&ri, MD_MIRROR, sp->setno);
484 if (metaioctl(MD_IOCGETSYNC, &ri, &ri.mde,
485 lp->namep->cname) != 0) {
486 mdclrerror(&mde);
487 lp->done = 1;
488 --dispatched;
489 } else if (! (ri.ri_flags & MD_RI_INPROGRESS)) {
490 lp->done = 1;
491 --dispatched;
492 }
493 }
494
495 /* wait a little longer next time */
496 if (howlong < 10)
497 ++howlong;
498 }
499 }
500
501 /* cleanup, return success */
502 free_units(mirrors);
503 (void) meta_unlock(sp, &mde);
504 md_exit(sp, 0);
505 /*NOTREACHED*/
506 return (NULL);
507 }
508
509 /*
510 * meta_mirror_resync_process:
511 * --------------------------
512 * Modify any resync that is in progress on this node for the given set.
513 *
514 * Input Parameters:
515 * sp setname to scan for mirrors
516 * cmd action to take:
517 * MD_RESYNC_KILL - kill all resync threads
518 * MD_RESYNC_BLOCK - block all resync threads
519 * MD_RESYNC_UNBLOCK - resume all resync threads
520 * Output Parameters
521 * ep error return structure
522 *
523 * meta_lock for this set should be held on entry.
524 */
525 static void
meta_mirror_resync_process(mdsetname_t * sp,md_error_t * ep,md_resync_cmd_t cmd)526 meta_mirror_resync_process(mdsetname_t *sp, md_error_t *ep, md_resync_cmd_t cmd)
527 {
528 mm_unit_list_t *mirrors[MD_PASS_MAX + 1];
529 mm_pass_num_t pass;
530
531 /* Grab all the mirrors from the set (if any) */
532 (void) memset(mirrors, 0, sizeof (mirrors));
533 if (setup_units(sp, mirrors, ep) != 0)
534 return;
535
536 /* do passes */
537 for (pass = 1; (pass <= MD_PASS_MAX); ++pass) {
538 mm_unit_list_t *lp;
539
540 /* skip empty passes */
541 if (mirrors[pass] == NULL)
542 continue;
543
544 /* Process all resyncs in pass */
545 for (lp = mirrors[pass]; (lp != NULL); lp = lp->next) {
546 (void) meta_mirror_resync(sp, lp->namep, 0, ep,
547 cmd);
548 }
549 }
550
551 /* Clear up mirror units */
552 free_units(mirrors);
553 }
554
555 /*
556 * meta_mirror_resync_process_all:
557 * ------------------------------
558 * Issue the given resync command to all mirrors contained in all multi-node
559 * sets.
560 *
561 * Input Parameters:
562 * cmd - MD_RESYNC_KILL, MD_RESYNC_BLOCK, MD_RESYNC_UNBLOCK
563 */
564 static void
meta_mirror_resync_process_all(md_resync_cmd_t cmd)565 meta_mirror_resync_process_all(md_resync_cmd_t cmd)
566 {
567 set_t setno, max_sets;
568 md_error_t mde = mdnullerror;
569 mdsetname_t *this_sp;
570 md_set_desc *sd;
571
572 /*
573 * Traverse all sets looking for multi-node capable ones.
574 */
575 max_sets = get_max_sets(&mde);
576 for (setno = 1; setno < max_sets; setno++) {
577 mde = mdnullerror;
578 if (this_sp = metasetnosetname(setno, &mde)) {
579 if ((sd = metaget_setdesc(this_sp, &mde)) == NULL)
580 continue;
581 if (!MD_MNSET_DESC(sd))
582 continue;
583
584 if (meta_lock(this_sp, TRUE, &mde)) {
585 continue;
586 }
587 meta_mirror_resync_process(this_sp, &mde, cmd);
588 (void) meta_unlock(this_sp, &mde);
589 }
590 }
591 }
592
593 /*
594 * meta_mirror_resync_kill_all:
595 * ---------------------------
596 * Abort any resync that is in progress on this node. Scan all sets for all
597 * mirrors.
598 * Note: this routine is provided for future use. For example to kill all
599 * resyncs on a node this could be used as long as the
600 * mddoors / rpc.mdcommd tuple is running on all members of the cluster.
601 */
void
meta_mirror_resync_kill_all(void)
{
	/* issue a kill to all resync threads in every multi-node set */
	meta_mirror_resync_process_all(MD_RESYNC_KILL);
}
607
608 /*
609 * meta_mirror_resync_block_all:
610 * ----------------------------
611 * Block all resyncs that are in progress. This causes the resync state to
612 * freeze on this machine, and can be resumed by calling
613 * meta_mirror_resync_unblock_all.
614 */
void
meta_mirror_resync_block_all(void)
{
	/* block all resync threads in every multi-node set */
	meta_mirror_resync_process_all(MD_RESYNC_BLOCK);
}
620
621 /*
622 * meta_mirror_resync_unblock_all:
623 * ------------------------------
624 * Unblock all previously blocked resync threads on this node.
625 */
void
meta_mirror_resync_unblock_all(void)
{
	/* resume all blocked resync threads in every multi-node set */
	meta_mirror_resync_process_all(MD_RESYNC_UNBLOCK);
}
631
632 /*
633 * meta_mirror_resync_unblock:
634 * --------------------------
635 * Unblock any previously blocked resync threads for the given set.
636 * meta_lock for this set should be held on entry.
637 */
void
meta_mirror_resync_unblock(mdsetname_t *sp)
{
	md_error_t	mde = mdnullerror;

	/* unblock resyncs in this set only; errors are discarded in mde */
	meta_mirror_resync_process(sp, &mde, MD_RESYNC_UNBLOCK);
}
645
646 /*
647 * meta_mirror_resync_kill:
648 * -----------------------
649 * Kill any resync threads running on mirrors in the given set.
650 * Called when releasing a set (meta_set_prv.c`halt_set)
651 */
void
meta_mirror_resync_kill(mdsetname_t *sp)
{
	md_error_t	mde = mdnullerror;

	/* kill resync threads in this set only; errors are discarded in mde */
	meta_mirror_resync_process(sp, &mde, MD_RESYNC_KILL);
}
659