xref: /titanic_41/usr/src/lib/lvm/libmeta/common/meta_mirror_resync.c (revision 856669dc1a0b09671e48a0220e5ec3b865c22c78)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 /*
29  * mirror operations
30  */
31 
32 #include <meta.h>
33 #include <sys/lvm/md_mirror.h>
34 #include <thread.h>
35 
36 extern	int	md_in_daemon;
37 extern md_mn_client_list_t *mdmn_clients;
38 
39 /*
40  * chain of mirrors
41  */
42 typedef struct mm_unit_list {
43 	struct mm_unit_list	*next;	/* next in chain */
44 	mdname_t		*namep;	/* mirror name */
45 	mm_pass_num_t		pass;	/* pass number */
46 	uint_t			done;	/* resync done */
47 } mm_unit_list_t;
48 
49 /*
50  * resync mirror
51  * meta_lock for this set should be held on entry.
52  */
53 int
54 meta_mirror_resync(
55 	mdsetname_t		*sp,
56 	mdname_t		*mirnp,
57 	daddr_t			size,
58 	md_error_t		*ep,
59 	md_resync_cmd_t		cmd	/* Start/Block/Unblock/Kill */
60 )
61 {
62 	char			*miscname;
63 	md_resync_ioctl_t	ri;
64 
65 	/* should have a set */
66 	assert(sp != NULL);
67 	assert(sp->setno == MD_MIN2SET(meta_getminor(mirnp->dev)));
68 
69 	/* make sure we have a mirror */
70 	if ((miscname = metagetmiscname(mirnp, ep)) == NULL)
71 		return (-1);
72 	if (strcmp(miscname, MD_MIRROR) != 0) {
73 		return (mdmderror(ep, MDE_NOT_MM, meta_getminor(mirnp->dev),
74 		    mirnp->cname));
75 	}
76 
77 	/* start resync */
78 	(void) memset(&ri, 0, sizeof (ri));
79 	MD_SETDRIVERNAME(&ri, MD_MIRROR, sp->setno);
80 	ri.ri_mnum = meta_getminor(mirnp->dev);
81 	ri.ri_copysize = size;
82 	switch (cmd) {
83 	case MD_RESYNC_FORCE_MNSTART:
84 		ri.ri_flags |= MD_RI_RESYNC_FORCE_MNSTART;
85 		break;
86 	case MD_RESYNC_START:
87 		ri.ri_flags = 0;
88 		break;
89 	case MD_RESYNC_BLOCK:
90 		ri.ri_flags = MD_RI_BLOCK;
91 		break;
92 	case MD_RESYNC_UNBLOCK:
93 		ri.ri_flags = MD_RI_UNBLOCK;
94 		break;
95 	case MD_RESYNC_KILL:
96 		ri.ri_flags = MD_RI_KILL;
97 		break;
98 	case MD_RESYNC_KILL_NO_WAIT:
99 		ri.ri_flags = MD_RI_KILL | MD_RI_NO_WAIT;
100 		break;
101 	default:
102 		/* TODO: Add new error MDE_BAD_RESYNC_FLAGS */
103 		return (mderror(ep, MDE_BAD_RESYNC_OPT, mirnp->cname));
104 	}
105 
106 	if (metaioctl(MD_IOCSETSYNC, &ri, &ri.mde, mirnp->cname) != 0)
107 		return (mdstealerror(ep, &ri.mde));
108 
109 	/* return success */
110 	return (0);
111 }
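
Since the command dispatch above maps each md_resync_cmd_t onto ri_flags for a single MD_IOCSETSYNC ioctl, a caller can pause and later resume an individual mirror's resync with two calls to the same entry point. A minimal illustrative sketch, not part of this file: do_maintenance() is a hypothetical placeholder, size 0 follows the usage in meta_mirror_resync_process() below, and the set lock is assumed held as the header comment requires.

extern void do_maintenance(void);	/* hypothetical maintenance hook */

static int
pause_resync_around_maintenance(mdsetname_t *sp, mdname_t *mirnp,
    md_error_t *ep)
{
	/* freeze the resync thread for this mirror */
	if (meta_mirror_resync(sp, mirnp, 0, ep, MD_RESYNC_BLOCK) != 0)
		return (-1);

	do_maintenance();

	/* resume the resync from the point at which it was blocked */
	return (meta_mirror_resync(sp, mirnp, 0, ep, MD_RESYNC_UNBLOCK));
}
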
112 
113 /*
114  * free units
115  */
116 static void
117 free_units(
118 	mm_unit_list_t	*mirrors[MD_PASS_MAX + 1]
119 )
120 {
121 	uint_t		i;
122 
123 	for (i = 0; (i < (MD_PASS_MAX + 1)); ++i) {
124 		mm_unit_list_t	*p, *n;
125 
126 		for (p = mirrors[i], n = NULL; (p != NULL); p = n) {
127 			n = p->next;
128 			Free(p);
129 		}
130 		mirrors[i] = NULL;
131 	}
132 }
133 
134 /*
135  * setup_units:	build lists of units for each pass
136  */
137 static int
138 setup_units(
139 	mdsetname_t	*sp,
140 	mm_unit_list_t	*mirrors[MD_PASS_MAX + 1],
141 	md_error_t	*ep
142 )
143 {
144 	mdnamelist_t	*mirrornlp = NULL;
145 	mdnamelist_t	*p;
146 	int		rval = 0;
147 
148 	/* should have a set */
149 	assert(sp != NULL);
150 
151 	/* for each mirror */
152 	if (meta_get_mirror_names(sp, &mirrornlp, 0, ep) < 0)
153 		return (-1);
154 	for (p = mirrornlp; (p != NULL); p = p->next) {
155 		md_mirror_t	*mirrorp;
156 		mm_unit_list_t	*lp;
157 
158 		/* get unit structure */
159 		if ((mirrorp = meta_get_mirror(sp, p->namep, ep)) == NULL) {
160 			rval = -1;	/* record, but ignore errors */
161 			continue;
162 		}
163 
164 		/* save info */
165 		lp = Zalloc(sizeof (*lp));
166 		lp->namep = p->namep;
167 		lp->pass = mirrorp->pass_num;
168 		if ((lp->pass < 0) || (lp->pass > MD_PASS_MAX))
169 			lp->pass = MD_PASS_MAX;
170 
171 		/* put on list */
172 		lp->next = mirrors[lp->pass];
173 		mirrors[lp->pass] = lp;
174 	}
175 
176 	/* cleanup, return error */
177 	metafreenamelist(mirrornlp);
178 	return (rval);
179 }
180 
181 /*
182  * resync all mirrors (in background)
183  */
184 int
185 meta_mirror_resync_all(
186 	mdsetname_t	*sp,
187 	daddr_t		size,
188 	md_error_t	*ep
189 )
190 {
191 	mm_unit_list_t	*mirrors[MD_PASS_MAX + 1];
192 	mm_pass_num_t	pass, max_pass;
193 	int		rval = 0, fval;
194 
195 	/* should have a set */
196 	assert(sp != NULL);
197 
198 	/* get mirrors */
199 	(void) memset(mirrors, 0, sizeof (mirrors));
200 	if (setup_units(sp, mirrors, ep) != 0)
201 		return (-1);
202 
203 	/* fork a process */
204 	if ((fval = md_daemonize(sp, ep)) != 0) {
205 		/*
206 		 * md_daemonize will fork off a process.  This is the
207 		 * parent, or an error occurred.
208 		 */
209 		if (fval > 0) {
210 			free_units(mirrors);
211 			return (0);
212 		}
213 		mdclrerror(ep);
214 	}
215 	/*
216 	 * Close stdin/out/err here.
217 	 * If this was called through rsh, the calling process on the other
218 	 * side will then know that it doesn't have to wait until all the
219 	 * resyncs have finished.
220 	 * Also initialise the rpc client pool so that this process uses a
221 	 * unique pool of clients. If we don't do this, all of the forked
222 	 * clients end up sharing the same pool of clients, which can result
223 	 * in hung clients.
224 	 */
225 	if (meta_is_mn_set(sp, ep)) {
226 		(void) close(0);
227 		(void) close(1);
228 		(void) close(2);
229 		mdmn_clients = NULL;
230 	}
231 	assert((fval == 0) || (fval == -1));
232 
233 	/*
234 	 * Determine which pass level is the highest that contains mirrors to
235 	 * resync. We only need to wait for completion of earlier levels below
236 	 * this high watermark. If all mirrors are at the same pass level
237 	 * there is no requirement to wait for completion.
238 	 */
239 
240 	max_pass = 1;
241 	for (pass = MD_PASS_MAX; pass > 1; --pass) {
242 		if (mirrors[pass] != NULL) {
243 			max_pass = pass;
244 			break;
245 		}
246 	}
247 
248 	/*
249 	 * max_pass now contains the highest pass-level with resyncable mirrors
250 	 */
251 
252 	/* do passes */
253 	for (pass = 1; (pass <= MD_PASS_MAX); ++pass) {
254 		int			dispatched = 0;
255 		unsigned		howlong = 1;
256 		mm_unit_list_t		*lp;
257 
258 		/* skip empty passes */
259 		if (mirrors[pass] == NULL)
260 			continue;
261 
262 		/* dispatch all resyncs in pass */
263 		for (lp = mirrors[pass]; (lp != NULL); lp = lp->next) {
264 			if (meta_is_mn_set(sp, ep)) {
265 				if (meta_mn_send_setsync(sp, lp->namep,
266 				    size, ep) != 0) {
267 					rval = -1;
268 					lp->done = 1;
269 				} else {
270 					++dispatched;
271 				}
272 			} else {
273 				if (meta_mirror_resync(sp, lp->namep, size, ep,
274 				    MD_RESYNC_START) != 0) {
275 					rval = -1;
276 					lp->done = 1;
277 				} else {
278 					++dispatched;
279 				}
280 			}
281 		}
282 
283 		/*
284 		 * Wait for them to finish iff we are at a level lower than
285 		 * max_pass. This orders the resyncs into distinct levels.
286 		 * I.e. level 2 resyncs won't start until all level 1 ones
287 		 * have completed.
288 		 */
289 		if (pass == max_pass)
290 			continue;
291 
292 		howlong = 1;
293 		while (dispatched > 0) {
294 
295 			/* wait a while */
296 			(void) sleep(howlong);
297 
298 			/* see if any finished */
299 			for (lp = mirrors[pass]; lp != NULL; lp = lp->next) {
300 				md_resync_ioctl_t	ri;
301 
302 				if (lp->done)
303 					continue;
304 
305 				(void) memset(&ri, '\0', sizeof (ri));
306 				ri.ri_mnum = meta_getminor(lp->namep->dev);
307 				MD_SETDRIVERNAME(&ri, MD_MIRROR, sp->setno);
308 				if (metaioctl(MD_IOCGETSYNC, &ri, &ri.mde,
309 				    lp->namep->cname) != 0) {
310 					(void) mdstealerror(ep, &ri.mde);
311 					rval = -1;
312 					lp->done = 1;
313 					--dispatched;
314 				} else if (! (ri.ri_flags & MD_RI_INPROGRESS)) {
315 					lp->done = 1;
316 					--dispatched;
317 				}
318 			}
319 
320 			/* wait a little longer next time */
321 			if (howlong < 10)
322 				++howlong;
323 		}
324 	}
325 
326 	/* cleanup, return success */
327 	free_units(mirrors);
328 	if (fval == 0)  /* we are the child process so exit */
329 		exit(0);
330 	return (rval);
331 }
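
The wait loop above (and the matching loop in meta_mn_mirror_resync_all() below) decides completion by issuing MD_IOCGETSYNC and testing MD_RI_INPROGRESS. A hedged sketch of that check factored into a standalone helper, using only calls already seen in this file; it is illustrative and not part of the original source.

/*
 * Illustrative sketch: return 1 while a resync is still running on the
 * given mirror, 0 once it has finished, or -1 (with ep filled in) if the
 * status ioctl fails.
 */
static int
resync_in_progress(mdsetname_t *sp, mdname_t *mirnp, md_error_t *ep)
{
	md_resync_ioctl_t	ri;

	(void) memset(&ri, 0, sizeof (ri));
	MD_SETDRIVERNAME(&ri, MD_MIRROR, sp->setno);
	ri.ri_mnum = meta_getminor(mirnp->dev);
	if (metaioctl(MD_IOCGETSYNC, &ri, &ri.mde, mirnp->cname) != 0)
		return (mdstealerror(ep, &ri.mde));
	return ((ri.ri_flags & MD_RI_INPROGRESS) ? 1 : 0);
}
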
332 
333 /*
334  * meta_mn_mirror_resync_all:
335  * -------------------------
336  * Resync all mirrors associated with the given set (arg). Called when the
337  * master node is adding a node to a diskset.  We only want to initiate the
338  * resync on the current node.
339  */
340 void *
341 meta_mn_mirror_resync_all(void *arg)
342 {
343 	set_t		setno = *((set_t *)arg);
344 	mdsetname_t	*sp;
345 	mm_unit_list_t	*mirrors[MD_PASS_MAX + 1];
346 	mm_pass_num_t	pass, max_pass;
347 	md_error_t	mde = mdnullerror;
348 	int		fval;
349 
350 
351 	/* should have a set */
352 	assert(setno != 0);
353 
354 	if ((sp = metasetnosetname(setno, &mde)) == NULL) {
355 		mde_perror(&mde, "");
356 		return (NULL);
357 	}
358 
359 	if (!(meta_is_mn_set(sp, &mde))) {
360 		mde_perror(&mde, "");
361 		return (NULL);
362 	}
363 
364 	/* fork a process */
365 	if ((fval = md_daemonize(sp, &mde)) != 0) {
366 		/*
367 		 * md_daemonize will fork off a process.  This is the
368 		 * parent, or an error occurred.
369 		 */
370 		if (fval > 0) {
371 			return (NULL);
372 		}
373 		mde_perror(&mde, "");
374 		return (NULL);
375 	}
376 	/*
377 	 * The child process should never return to rpc.metad, but
378 	 * should exit instead.
379 	 * Flush all internally cached data inherited from parent process
380 	 * since cached data will be cleared when parent process RPC request
381 	 * has completed (which is possibly before this child process
382 	 * can complete).
383 	 * Child process can retrieve and cache its own copy of data from
384 	 * rpc.metad that won't be changed by the parent process.
385 	 *
386 	 * Reset md_in_daemon since this child will be a client of rpc.metad
387 	 * not part of the rpc.metad daemon itself.
388 	 * md_in_daemon is used by rpc.metad so that libmeta can tell if
389 	 * this thread is rpc.metad or any other thread.  (If this thread
390 	 * was rpc.metad it could use some short circuit code to get data
391 	 * directly from rpc.metad instead of doing an RPC call to rpc.metad).
392 	 */
393 	md_in_daemon = 0;
394 	metaflushsetname(sp);
395 	sr_cache_flush_setno(setno);
396 	if ((sp = metasetnosetname(setno, &mde)) == NULL) {
397 		mde_perror(&mde, "");
398 		md_exit(sp, 1);
399 	}
400 
401 	if (meta_lock(sp, TRUE, &mde) != 0) {
402 		mde_perror(&mde, "");
403 		md_exit(sp, 1);
404 	}
405 
406 	/*
407 	 * Closing stdin/out/err here.
408 	 */
409 	(void) close(0);
410 	(void) close(1);
411 	(void) close(2);
412 	assert(fval == 0);
413 
414 	/* get mirrors */
415 	(void) memset(mirrors, 0, sizeof (mirrors));
416 	if (setup_units(sp, mirrors, &mde) != 0) {
417 		(void) meta_unlock(sp, &mde);
418 		md_exit(sp, 1);
419 	}
420 
421 	/*
422 	 * Determine which pass level is the highest that contains mirrors to
423 	 * resync. We only need to wait for completion of earlier levels below
424 	 * this high watermark. If all mirrors are at the same pass level
425 	 * there is no requirement to wait for completion.
426 	 */
427 	max_pass = 1;
428 	for (pass = MD_PASS_MAX; pass > 1; --pass) {
429 		if (mirrors[pass] != NULL) {
430 			max_pass = pass;
431 			break;
432 		}
433 	}
434 
435 	/*
436 	 * max_pass now contains the highest pass-level with resyncable mirrors
437 	 */
438 	/* do passes */
439 	for (pass = 1; (pass <= MD_PASS_MAX); ++pass) {
440 		int			dispatched = 0;
441 		unsigned		howlong = 1;
442 		mm_unit_list_t		*lp;
443 
444 		/* skip empty passes */
445 		if (mirrors[pass] == NULL)
446 			continue;
447 
448 		/* dispatch all resyncs in pass */
449 		for (lp = mirrors[pass]; (lp != NULL); lp = lp->next) {
450 			if (meta_mirror_resync(sp, lp->namep, 0, &mde,
451 			    MD_RESYNC_FORCE_MNSTART) != 0) {
452 				mdclrerror(&mde);
453 				lp->done = 1;
454 			} else {
455 				++dispatched;
456 			}
457 		}
458 
459 		/*
460 		 * Wait for them to finish iff we are at a level lower than
461 		 * max_pass. This orders the resyncs into distinct levels.
462 		 * I.e. level 2 resyncs won't start until all level 1 ones
463 		 * have completed.
464 		 */
465 		if (pass == max_pass)
466 			continue;
467 
468 		howlong = 1;
469 		while (dispatched > 0) {
470 
471 			/* wait a while */
472 			(void) sleep(howlong);
473 
474 			/* see if any finished */
475 			for (lp = mirrors[pass]; lp != NULL; lp = lp->next) {
476 				md_resync_ioctl_t	ri;
477 
478 				if (lp->done)
479 					continue;
480 
481 				(void) memset(&ri, '\0', sizeof (ri));
482 				ri.ri_mnum = meta_getminor(lp->namep->dev);
483 				MD_SETDRIVERNAME(&ri, MD_MIRROR, sp->setno);
484 				if (metaioctl(MD_IOCGETSYNC, &ri, &ri.mde,
485 				    lp->namep->cname) != 0) {
486 					mdclrerror(&mde);
487 					lp->done = 1;
488 					--dispatched;
489 				} else if (! (ri.ri_flags & MD_RI_INPROGRESS)) {
490 					lp->done = 1;
491 					--dispatched;
492 				}
493 			}
494 
495 			/* wait a little longer next time */
496 			if (howlong < 10)
497 				++howlong;
498 		}
499 	}
500 
501 	/* cleanup, return success */
502 	free_units(mirrors);
503 	(void) meta_unlock(sp, &mde);
504 	md_exit(sp, 0);
505 	/*NOTREACHED*/
506 	return (NULL);
507 }
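
meta_mn_mirror_resync_all() is shaped as a thread start routine (void *(*)(void *)) and the file includes <thread.h>, so a caller such as rpc.metad can run it asynchronously. A minimal sketch of one way to launch it, assuming the set_t storage passed as arg outlives the thread; the actual caller may do this differently.

static set_t	resync_setno;	/* must stay valid for the detached thread */

static int
start_mn_resync_thread(set_t setno)
{
	resync_setno = setno;
	/* returns 0 on success, an error number otherwise */
	return (thr_create(NULL, 0, meta_mn_mirror_resync_all,
	    (void *)&resync_setno, THR_DETACHED, NULL));
}
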
508 
509 /*
510  * meta_mirror_resync_process:
511  * --------------------------
512  * Modify any resync that is in progress on this node for the given set.
513  *
514  * Input Parameters:
515  *	sp	setname to scan for mirrors
516  *	cmd	action to take:
517  *		MD_RESYNC_KILL	- kill all resync threads
518  *		MD_RESYNC_BLOCK	- block all resync threads
519  *		MD_RESYNC_UNBLOCK - resume all resync threads
520  * Output Parameters
521  *	ep	error return structure
522  *
523  * meta_lock for this set should be held on entry.
524  */
525 static void
526 meta_mirror_resync_process(mdsetname_t *sp, md_error_t *ep, md_resync_cmd_t cmd)
527 {
528 	mm_unit_list_t	*mirrors[MD_PASS_MAX + 1];
529 	mm_pass_num_t	pass;
530 
531 	/* Grab all the mirrors from the set (if any) */
532 	(void) memset(mirrors, 0, sizeof (mirrors));
533 	if (setup_units(sp, mirrors, ep) != 0)
534 		return;
535 
536 	/* do passes */
537 	for (pass = 1; (pass <= MD_PASS_MAX); ++pass) {
538 		mm_unit_list_t		*lp;
539 
540 		/* skip empty passes */
541 		if (mirrors[pass] == NULL)
542 			continue;
543 
544 		/* Process all resyncs in pass */
545 		for (lp = mirrors[pass]; (lp != NULL); lp = lp->next) {
546 			(void) meta_mirror_resync(sp, lp->namep, 0, ep,
547 			    cmd);
548 		}
549 	}
550 
551 	/* Clear up mirror units */
552 	free_units(mirrors);
553 }
554 
555 /*
556  * meta_mirror_resync_process_all:
557  * ------------------------------
558  * Issue the given resync command to all mirrors contained in all multi-node
559  * sets.
560  *
561  * Input Parameters:
562  *	cmd	- MD_RESYNC_KILL, MD_RESYNC_BLOCK, MD_RESYNC_UNBLOCK
563  */
564 static void
565 meta_mirror_resync_process_all(md_resync_cmd_t cmd)
566 {
567 	set_t		setno, max_sets;
568 	md_error_t	mde = mdnullerror;
569 	mdsetname_t	*this_sp;
570 	md_set_desc	*sd;
571 
572 	/*
573 	 * Traverse all sets looking for multi-node capable ones.
574 	 */
575 	max_sets = get_max_sets(&mde);
576 	for (setno = 1; setno < max_sets; setno++) {
577 		mde = mdnullerror;
578 		if (this_sp = metasetnosetname(setno, &mde)) {
579 			if ((sd = metaget_setdesc(this_sp, &mde)) == NULL)
580 				continue;
581 			if (!MD_MNSET_DESC(sd))
582 				continue;
583 
584 			if (meta_lock(this_sp, TRUE, &mde)) {
585 				continue;
586 			}
587 			meta_mirror_resync_process(this_sp, &mde, cmd);
588 			(void) meta_unlock(this_sp, &mde);
589 		}
590 	}
591 }
592 
593 /*
594  * meta_mirror_resync_kill_all:
595  * ---------------------------
596  * Abort any resync that is in progress on this node. Scan all sets for all
597  * mirrors.
598  * Note: this routine is provided for future use. For example, it could be
599  *	 used to kill all resyncs on a node, as long as the
600  *	 mddoors / rpc.mdcommd tuple is running on all members of the cluster.
601  */
602 void
603 meta_mirror_resync_kill_all(void)
604 {
605 	meta_mirror_resync_process_all(MD_RESYNC_KILL);
606 }
607 
608 /*
609  * meta_mirror_resync_block_all:
610  * ----------------------------
611  * Block all resyncs that are in progress. This causes the resync state to
612  * freeze on this machine, and can be resumed by calling
613  * meta_mirror_resync_unblock_all.
614  */
615 void
616 meta_mirror_resync_block_all(void)
617 {
618 	meta_mirror_resync_process_all(MD_RESYNC_BLOCK);
619 }
620 
621 /*
622  * meta_mirror_resync_unblock_all:
623  * ------------------------------
624  * Unblock all previously blocked resync threads on this node.
625  */
626 void
627 meta_mirror_resync_unblock_all(void)
628 {
629 	meta_mirror_resync_process_all(MD_RESYNC_UNBLOCK);
630 }
631 
632 /*
633  * meta_mirror_resync_unblock:
634  * --------------------------
635  * Unblock any previously blocked resync threads for the given set.
636  * meta_lock for this set should be held on entry.
637  */
638 void
639 meta_mirror_resync_unblock(mdsetname_t *sp)
640 {
641 	md_error_t	mde = mdnullerror;
642 
643 	meta_mirror_resync_process(sp, &mde, MD_RESYNC_UNBLOCK);
644 }
645 
646 /*
647  * meta_mirror_resync_kill:
648  * -----------------------
649  * Kill any resync threads running on mirrors in the given set.
650  * Called when releasing a set (meta_set_prv.c`halt_set)
651  */
652 void
653 meta_mirror_resync_kill(mdsetname_t *sp)
654 {
655 	md_error_t	mde = mdnullerror;
656 
657 	meta_mirror_resync_process(sp, &mde, MD_RESYNC_KILL);
658 }
659