xref: /titanic_41/usr/src/uts/common/io/lvm/mirror/mirror_resync.c (revision f4b369f8ed5a4eeed9892045fdeadf3b5addb08d)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
24  */
25 
26 #include <sys/param.h>
27 #include <sys/systm.h>
28 #include <sys/conf.h>
29 #include <sys/file.h>
30 #include <sys/user.h>
31 #include <sys/uio.h>
32 #include <sys/t_lock.h>
33 #include <sys/buf.h>
34 #include <sys/dkio.h>
35 #include <sys/vtoc.h>
36 #include <sys/kmem.h>
37 #include <vm/page.h>
38 #include <sys/cmn_err.h>
39 #include <sys/sysmacros.h>
40 #include <sys/types.h>
41 #include <sys/mkdev.h>
42 #include <sys/stat.h>
43 #include <sys/open.h>
44 #include <sys/disp.h>
45 #include <sys/lvm/md_mirror.h>
46 #include <sys/modctl.h>
47 #include <sys/ddi.h>
48 #include <sys/sunddi.h>
49 #include <sys/debug.h>
50 #include <sys/callb.h>
51 
52 #include <sys/sysevent/eventdefs.h>
53 #include <sys/sysevent/svm.h>
54 #include <sys/lvm/mdmn_commd.h>
55 
56 extern int		md_status;
57 extern kmutex_t		md_status_mx;
58 extern kmutex_t		md_mx;
59 
60 extern unit_t		md_nunits;
61 extern set_t		md_nsets;
62 extern md_set_t		md_set[];
63 extern major_t		md_major;
64 
65 extern md_ops_t		mirror_md_ops;
66 extern kmem_cache_t	*mirror_child_cache; /* mirror child memory pool */
67 extern mdq_anchor_t	md_mto_daemon;
68 extern daemon_request_t	mirror_timeout;
69 extern md_resync_t	md_cpr_resync;
70 extern clock_t		md_hz;
71 extern int		md_mtioctl_cnt;
72 
73 extern kmem_cache_t	*mirror_parent_cache;
74 #ifdef DEBUG
75 extern int		mirror_debug_flag;
76 #endif
77 
78 /*
79  * Tunable resync thread timeout. This is used as the time interval for updating
80  * the resync progress to the mddb. This allows restartable resyncs to be
81  * continued across a system reboot.
82  * Default is to update the resync progress every 5 minutes.
83  */
84 int md_mirror_resync_update_intvl = MD_DEF_MIRROR_RESYNC_INTVL;
85 
86 /*
87  * Settable mirror resync buffer size.  Specified in 512 byte
88  * blocks.  This is set to MD_DEF_RESYNC_BUF_SIZE by default.
89  */
90 int md_resync_bufsz = MD_DEF_RESYNC_BUF_SIZE;
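
/*
 * A sketch of how these tunables are typically changed on a running
 * system (the md_mirror module name and the values shown are
 * assumptions, not requirements): add lines such as
 *
 *	set md_mirror:md_mirror_resync_update_intvl = 10
 *	set md_mirror:md_resync_bufsz = 2048
 *
 * to /etc/system and reboot for the new values to take effect.
 */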
91 
92 /*
93  * Tunables for dirty region processing when
94  * closing down a mirror.
95  *
96  * Dirty region processing during close of a
97  * mirror is basically monitoring the state
98  * of the resync region bitmaps and the number
99  * of outstanding i/o's per submirror to
100  * determine that there are no more dirty
101  * regions left over.
102  *
103  * The approach taken is a retry logic over
104  * md_mirror_rr_cleans iterations to monitor
105  * the progress.
106  *
107  * There are two methods of polling the progress
108  * on dirty bitmap processing: busy-waits and
109  * non-busy-waits.
110  *
111  * Busy-waits are used at the beginning to
112  * determine the final state as quickly as
113  * possible; md_mirror_rr_polls defines the
114  * number of busy-waits.
115  *
116  * If the busy-waits are exhausted with
117  * dirty regions still left over, the retry logic
118  * switches over to non-busy-waits, thus giving
119  * relief to an obviously heavily loaded system.
120  * The timeout value is defined by the tunable
121  * md_mirror_rr_sleep_timo in seconds.
122  *
123  * The number of non-busy-waits is given by:
124  * md_mirror_rr_cleans - md_mirror_rr_polls.
125  *
126  * The values were found by testing on a
127  * 'typical' system and may require tuning
128  * to meet a specific customer's requirements.
129  */
130 
131 int md_mirror_rr_cleans = 13;
132 int md_mirror_rr_polls = 3;
133 int md_mirror_rr_sleep_timo = 1;
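
/*
 * For example, with the defaults above mirror_process_unit_resync()
 * below busy-waits for the first md_mirror_rr_polls (3) iterations and
 * then sleeps md_mirror_rr_sleep_timo (1) second between each of the
 * remaining md_mirror_rr_cleans - md_mirror_rr_polls (13 - 3 = 10)
 * iterations, so a mirror whose dirty regions never drain is given up
 * on after roughly ten seconds of non-busy waiting.
 */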
134 
135 /*
136  * The value is not #defined because it will be computed
137  * in the future.
138  */
139 int md_max_xfer_bufsz = 2048;
140 
141 /*
142  * mirror_generate_rr_bitmap:
143  * -------------------
144  * Generate a compressed bitmap md_mn_msg_rr_clean_t for the given clean
145  * bitmap associated with mirror 'un'
146  *
147  * Input:
148  *      un      - mirror unit to get bitmap data from
149  *      *msgp   - location to return newly allocated md_mn_msg_rr_clean_t
150  *      *activep- location to return # of active i/os
151  *
152  * Returns:
153  *      1 => dirty bits cleared from un_dirty_bm and DRL flush required
154  *          *msgp contains bitmap of to-be-cleared bits
155  *      0 => no bits cleared
156  *          *msgp == NULL
157  */
158 static int
159 mirror_generate_rr_bitmap(mm_unit_t *un, md_mn_msg_rr_clean_t **msgp,
160     int *activep)
161 {
162 	unsigned int	i, next_bit, data_bytes, start_bit;
163 	int		cleared_dirty = 0;
164 
165 	/* Skip any initial 0s. */
166 retry_dirty_scan:
167 	if ((start_bit = un->un_rr_clean_start_bit) >= un->un_rrd_num)
168 		un->un_rr_clean_start_bit = start_bit = 0;
169 
170 	/*
171 	 * Handle case where NO bits are set in PERNODE_DIRTY but the
172 	 * un_dirty_bm[] map does have entries set (after a 1st resync)
173 	 */
174 	for (; start_bit < un->un_rrd_num &&
175 	    !IS_PERNODE_DIRTY(md_mn_mynode_id, start_bit, un) &&
176 	    (un->un_pernode_dirty_sum[start_bit] != (uchar_t)0); start_bit++)
177 		;
178 
179 	if (start_bit >= un->un_rrd_num) {
180 		if (un->un_rr_clean_start_bit == 0) {
181 			return (0);
182 		} else {
183 			un->un_rr_clean_start_bit = 0;
184 			goto retry_dirty_scan;
185 		}
186 	}
187 
188 	/* how much to fit into this message */
189 	data_bytes = MIN(howmany(un->un_rrd_num - start_bit, NBBY),
190 	    MDMN_MSG_RR_CLEAN_DATA_MAX_BYTES);
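	/*
	 * For example, 1001 remaining regions need howmany(1001, NBBY) =
	 * 126 bytes of bitmap data; anything beyond
	 * MDMN_MSG_RR_CLEAN_DATA_MAX_BYTES is left for a later message,
	 * since un_rr_clean_start_bit is advanced to next_bit below.
	 */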
191 
192 	(*msgp) = kmem_zalloc(MDMN_MSG_RR_CLEAN_SIZE_DATA(data_bytes),
193 	    KM_SLEEP);
194 
195 	(*msgp)->rr_nodeid = md_mn_mynode_id;
196 	(*msgp)->rr_mnum = MD_SID(un);
197 	MDMN_MSG_RR_CLEAN_START_SIZE_SET(*msgp, start_bit, data_bytes);
198 
199 	next_bit = MIN(start_bit + data_bytes * NBBY, un->un_rrd_num);
200 
201 	for (i = start_bit; i < next_bit; i++) {
202 		if (un->c.un_status & MD_UN_KEEP_DIRTY && IS_KEEPDIRTY(i, un)) {
203 			continue;
204 		}
205 		if (!IS_REGION_DIRTY(i, un)) {
206 			continue;
207 		}
208 		if (un->un_outstanding_writes[i] != 0) {
209 			(*activep)++;
210 			continue;
211 		}
212 
213 		/*
214 		 * Handle the case where a resync has completed and we still
215 		 * have the un_dirty_bm[] entries marked as dirty (these are
216 		 * the most recent DRL re-read from the replica). They need
217 		 * to be cleared from our un_dirty_bm[] but they will not have
218 		 * corresponding un_pernode_dirty[] entries set unless (and
219 		 * until) further write()s have been issued to the area.
220 		 * This handles the case where only the un_dirty_bm[] entry is
221 		 * set. Without this we'd not clear this region until a local
222 		 * write is issued to the affected area.
223 		 */
224 		if (IS_PERNODE_DIRTY(md_mn_mynode_id, i, un) ||
225 		    (un->un_pernode_dirty_sum[i] == (uchar_t)0)) {
226 			if (!IS_GOING_CLEAN(i, un)) {
227 				SET_GOING_CLEAN(i, un);
228 				(*activep)++;
229 				continue;
230 			}
231 			/*
232 			 * Now we've got a flagged pernode_dirty, _or_ a clean
233 			 * bitmap entry to process. Update the bitmap to flush
234 			 * the REGION_DIRTY / GOING_CLEAN bits when we send the
235 			 * cross-cluster message.
236 			 */
237 			cleared_dirty++;
238 			setbit(MDMN_MSG_RR_CLEAN_DATA(*msgp), i - start_bit);
239 		} else {
240 			/*
241 			 * Not marked as active in the pernode bitmap, so skip
242 			 * any update to this. We just increment the 0 count
243 			 * and adjust the active count by any outstanding
244 			 * un_pernode_dirty_sum[] entries. This means we don't
245 			 * leave the mirror permanently dirty.
246 			 */
247 			(*activep) += (int)un->un_pernode_dirty_sum[i];
248 		}
249 	}
250 	if (!cleared_dirty) {
251 		kmem_free(*msgp, MDMN_MSG_RR_CLEAN_SIZE_DATA(data_bytes));
252 		*msgp = NULL;
253 	}
254 	un->un_rr_clean_start_bit = next_bit;
255 	return (cleared_dirty);
256 }
257 
258 /*
259  * There are three paths into here:
260  *
261  * md_daemon -> check_resync_regions -> prr
262  * mirror_internal_close -> mirror_process_unit_resync -> prr
263  * mirror_set_capability -> mirror_process_unit_resync -> prr
264  *
265  * The first one is a kernel daemon, the other two result from system calls.
266  * Thus, only the first case needs to deal with kernel CPR activity.  This
267  * is indicated by the cprinfop being non-NULL for kernel daemon calls, and
268  * NULL for system call paths.
269  */
270 static int
271 process_resync_regions_non_owner(mm_unit_t *un, callb_cpr_t *cprinfop)
272 {
273 	int			i, start, end;
274 	int			cleared_dirty = 0;
275 	/* Number of reasons why we can not proceed shutting down the mirror. */
276 	int			active = 0;
277 	set_t			setno = MD_UN2SET(un);
278 	md_mn_msg_rr_clean_t	*rmsg;
279 	md_mn_kresult_t		*kres;
280 	int			rval;
281 	minor_t			mnum = MD_SID(un);
282 	mdi_unit_t		*ui = MDI_UNIT(mnum);
283 	md_mn_nodeid_t		owner_node;
284 
285 	/*
286 	 * We drop the readerlock here to assist lock ordering with
287 	 * update_resync.  Once we have the un_rrp_inflight_mx, we
288 	 * can re-acquire it.
289 	 */
290 	md_unit_readerexit(ui);
291 
292 	/*
293 	 * Resync region processing must be single threaded. We can't use
294 	 * un_resync_mx for this purpose since this mutex gets released
295 	 * when blocking on un_resync_cv.
296 	 */
297 	mutex_enter(&un->un_rrp_inflight_mx);
298 
299 	(void) md_unit_readerlock(ui);
300 
301 	mutex_enter(&un->un_resync_mx);
302 
303 	rw_enter(&un->un_pernode_dirty_mx[md_mn_mynode_id - 1], RW_READER);
304 	cleared_dirty = mirror_generate_rr_bitmap(un, &rmsg, &active);
305 	rw_exit(&un->un_pernode_dirty_mx[md_mn_mynode_id - 1]);
306 
307 	if (cleared_dirty) {
308 		owner_node = un->un_mirror_owner;
309 		mutex_exit(&un->un_resync_mx);
310 
311 		/*
312 		 * Transmit the 'to-be-cleared' bitmap to all cluster nodes.
313 		 * Receipt of the message will cause the mirror owner to
314 		 * update the on-disk DRL.
315 		 */
316 
317 		kres = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
318 
319 		/* release readerlock before sending message */
320 		md_unit_readerexit(ui);
321 
322 		if (cprinfop) {
323 			mutex_enter(&un->un_prr_cpr_mx);
324 			CALLB_CPR_SAFE_BEGIN(cprinfop);
325 		}
326 
327 		rval = mdmn_ksend_message(setno, MD_MN_MSG_RR_CLEAN,
328 		    MD_MSGF_NO_LOG|MD_MSGF_BLK_SIGNAL|MD_MSGF_KSEND_NORETRY|
329 		    MD_MSGF_DIRECTED, un->un_mirror_owner,
330 		    (char *)rmsg, MDMN_MSG_RR_CLEAN_MSG_SIZE(rmsg), kres);
331 
332 		if (cprinfop) {
333 			CALLB_CPR_SAFE_END(cprinfop, &un->un_prr_cpr_mx);
334 			mutex_exit(&un->un_prr_cpr_mx);
335 		}
336 
337 		/* reacquire readerlock after message */
338 		(void) md_unit_readerlock(ui);
339 
340 		if ((!MDMN_KSEND_MSG_OK(rval, kres)) &&
341 		    (kres->kmmr_comm_state != MDMNE_NOT_JOINED)) {
342 			/* if commd is gone, no point in printing a message */
343 			if (md_mn_is_commd_present())
344 				mdmn_ksend_show_error(rval, kres, "RR_CLEAN");
345 			kmem_free(kres, sizeof (md_mn_kresult_t));
346 			kmem_free(rmsg, MDMN_MSG_RR_CLEAN_MSG_SIZE(rmsg));
347 			mutex_exit(&un->un_rrp_inflight_mx);
348 			return (active);
349 		}
350 		kmem_free(kres, sizeof (md_mn_kresult_t));
351 
352 		/*
353 		 * If ownership changed while we were sending, we probably
354 		 * sent the message to the wrong node.  Leave fixing that for
355 		 * the next cycle.
356 		 */
357 		if (un->un_mirror_owner != owner_node) {
358 			mutex_exit(&un->un_rrp_inflight_mx);
359 			return (active);
360 		}
361 
362 		/*
363 		 * Now that we've sent the message, clear them from the
364 		 * pernode_dirty arrays.  These are ONLY cleared on a
365 		 * successful send, and failure has no impact.
366 		 */
367 		cleared_dirty = 0;
368 		start = MDMN_MSG_RR_CLEAN_START_BIT(rmsg);
369 		end = start + MDMN_MSG_RR_CLEAN_DATA_BYTES(rmsg) * NBBY;
370 		mutex_enter(&un->un_resync_mx);
371 		rw_enter(&un->un_pernode_dirty_mx[md_mn_mynode_id - 1],
372 		    RW_READER);
373 		for (i = start; i < end; i++) {
374 			if (isset(MDMN_MSG_RR_CLEAN_DATA(rmsg),
375 			    i - start)) {
376 				if (IS_PERNODE_DIRTY(md_mn_mynode_id, i, un)) {
377 					un->un_pernode_dirty_sum[i]--;
378 					CLR_PERNODE_DIRTY(md_mn_mynode_id, i,
379 					    un);
380 				}
381 				if (IS_REGION_DIRTY(i, un)) {
382 					cleared_dirty++;
383 					CLR_REGION_DIRTY(i, un);
384 					CLR_GOING_CLEAN(i, un);
385 				}
386 			}
387 		}
388 		rw_exit(&un->un_pernode_dirty_mx[md_mn_mynode_id - 1]);
389 
390 		kmem_free(rmsg, MDMN_MSG_RR_CLEAN_MSG_SIZE(rmsg));
391 	}
392 	mutex_exit(&un->un_resync_mx);
393 
394 	mutex_exit(&un->un_rrp_inflight_mx);
395 
396 	return (active);
397 }
398 
399 static int
400 process_resync_regions_owner(mm_unit_t *un)
401 {
402 	int			i, start, end;
403 	int			cleared_dirty = 0;
404 	/* Number of reasons why we can not proceed shutting down the mirror. */
405 	int			active = 0;
406 	set_t			setno = MD_UN2SET(un);
407 	int			mnset = MD_MNSET_SETNO(setno);
408 	md_mn_msg_rr_clean_t	*rmsg;
409 	minor_t			mnum = MD_SID(un);
410 	mdi_unit_t		*ui = MDI_UNIT(mnum);
411 
412 	/*
413 	 * We drop the readerlock here to assist lock ordering with
414 	 * update_resync.  Once we have the un_rrp_inflight_mx, we
415 	 * can re-acquire it.
416 	 */
417 	md_unit_readerexit(ui);
418 
419 	/*
420 	 * Resync region processing must be single threaded. We can't use
421 	 * un_resync_mx for this purpose since this mutex gets released
422 	 * when blocking on un_resync_cv.
423 	 */
424 	mutex_enter(&un->un_rrp_inflight_mx);
425 
426 	(void) md_unit_readerlock(ui);
427 
428 	mutex_enter(&un->un_resync_mx);
429 	un->un_waiting_to_clear++;
430 	while (un->un_resync_flg & MM_RF_STALL_CLEAN)
431 		cv_wait(&un->un_resync_cv, &un->un_resync_mx);
432 	un->un_waiting_to_clear--;
433 
434 	if (mnset) {
435 		rw_enter(&un->un_pernode_dirty_mx[md_mn_mynode_id - 1],
436 		    RW_READER);
437 		cleared_dirty = mirror_generate_rr_bitmap(un, &rmsg, &active);
438 
439 		if (cleared_dirty) {
440 			/*
441 			 * Clear the bits from the pernode_dirty arrays.
442 			 * If that results in any being cleared from the
443 			 * un_dirty_bm, commit it.
444 			 */
445 			cleared_dirty = 0;
446 			start = MDMN_MSG_RR_CLEAN_START_BIT(rmsg);
447 			end = start + MDMN_MSG_RR_CLEAN_DATA_BYTES(rmsg) * NBBY;
448 			for (i = start; i < end; i++) {
449 				if (isset(MDMN_MSG_RR_CLEAN_DATA(rmsg),
450 				    i - start)) {
451 					if (IS_PERNODE_DIRTY(md_mn_mynode_id, i,
452 					    un)) {
453 						un->un_pernode_dirty_sum[i]--;
454 						CLR_PERNODE_DIRTY(
455 						    md_mn_mynode_id, i, un);
456 					}
457 					if (un->un_pernode_dirty_sum[i] == 0) {
458 						cleared_dirty++;
459 						CLR_REGION_DIRTY(i, un);
460 						CLR_GOING_CLEAN(i, un);
461 					}
462 				}
463 			}
464 			kmem_free(rmsg, MDMN_MSG_RR_CLEAN_MSG_SIZE(rmsg));
465 		}
466 		rw_exit(&un->un_pernode_dirty_mx[md_mn_mynode_id - 1]);
467 	} else {
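		/*
		 * Non multi-node set: walk the dirty bitmap directly. A
		 * dirty region with no outstanding writes is first marked
		 * GOING_CLEAN (and still counted as active); it is only
		 * cleared on a later pass, giving in-flight writes a chance
		 * to re-dirty it first.
		 */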
468 		for (i = 0; i < un->un_rrd_num; i++) {
469 			if (un->c.un_status & MD_UN_KEEP_DIRTY)
470 				if (IS_KEEPDIRTY(i, un))
471 					continue;
472 
473 			if (!IS_REGION_DIRTY(i, un))
474 				continue;
475 			if (un->un_outstanding_writes[i] != 0) {
476 				active++;
477 				continue;
478 			}
479 
480 			if (!IS_GOING_CLEAN(i, un)) {
481 				SET_GOING_CLEAN(i, un);
482 				active++;
483 				continue;
484 			}
485 			CLR_REGION_DIRTY(i, un);
486 			CLR_GOING_CLEAN(i, un);
487 			cleared_dirty++;
488 		}
489 	}
490 
491 	if (cleared_dirty) {
492 		un->un_resync_flg |= MM_RF_GATECLOSED;
493 		mutex_exit(&un->un_resync_mx);
494 		mddb_commitrec_wrapper(un->un_rr_dirty_recid);
495 		mutex_enter(&un->un_resync_mx);
496 		un->un_resync_flg &= ~MM_RF_GATECLOSED;
497 
498 		if (un->un_waiting_to_mark != 0 ||
499 		    un->un_waiting_to_clear != 0) {
500 			active++;
501 			cv_broadcast(&un->un_resync_cv);
502 		}
503 	}
504 	mutex_exit(&un->un_resync_mx);
505 
506 	mutex_exit(&un->un_rrp_inflight_mx);
507 
508 	return (active);
509 }
510 
511 static int
512 process_resync_regions(mm_unit_t *un, callb_cpr_t *cprinfop)
513 {
514 	int	mnset = MD_MNSET_SETNO(MD_UN2SET(un));
515 	/*
516 	 * For a mirror we can only update the on-disk resync-record if we
517 	 * currently own the mirror. If we are called and there is no owner we
518 	 * bail out before scanning the outstanding_writes[] array.
519 	 * NOTE: we only need to check here (before scanning the array) as we
520 	 * 	are called with the readerlock held. This means that a change
521 	 * 	of ownership away from us will block until this resync check
522 	 * 	has completed.
523 	 */
524 	if (mnset && (MD_MN_NO_MIRROR_OWNER(un) ||
525 	    (!MD_MN_MIRROR_OWNER(un) && !md_mn_is_commd_present_lite()))) {
526 		return (0);
527 	} else if (mnset && !MD_MN_MIRROR_OWNER(un)) {
528 		return (process_resync_regions_non_owner(un, cprinfop));
529 	} else {
530 		return (process_resync_regions_owner(un));
531 	}
532 }
533 
534 /*
535  * Function that is callable from other modules to provide
536  * the ability to clean up the dirty region bitmap on demand. Used
537  * on last close of a unit to avoid massive device resyncs
538  * when coming back after rolling large amounts of data to
539  * a mirror (e.g. at umount with logging).
540  */
541 
542 void
543 mirror_process_unit_resync(mm_unit_t *un)
544 {
545 	int	cleans = 0;
546 
547 	while (process_resync_regions(un, NULL)) {
548 
549 		cleans++;
550 		if (cleans >= md_mirror_rr_cleans) {
551 			cmn_err(CE_NOTE,
552 			    "Could not clean resync regions\n");
553 			break;
554 		}
555 		if (cleans > md_mirror_rr_polls) {
556 			/*
557 			 * We did not make it with md_mirror_rr_polls
558 			 * iterations. Give the system relief and
559 			 * switch over to non-busy-wait.
560 			 */
561 			delay(md_mirror_rr_sleep_timo * md_hz);
562 		}
563 	}
564 }
565 
566 static void
567 check_resync_regions(daemon_request_t *timeout)
568 {
569 	mdi_unit_t	*ui;
570 	mm_unit_t	*un;
571 	md_link_t	*next;
572 	callb_cpr_t	cprinfo;
573 
574 	rw_enter(&mirror_md_ops.md_link_rw.lock, RW_READER);
575 	for (next = mirror_md_ops.md_head; next != NULL; next = next->ln_next) {
576 
577 		if (md_get_setstatus(next->ln_setno) & MD_SET_STALE)
578 			continue;
579 
580 		un = MD_UNIT(next->ln_id);
581 
582 		/*
583 		 * Register this resync thread with the CPR mechanism. This
584 		 * allows us to detect when the system is suspended and so
585 		 * keep track of the RPC failure condition.
586 		 */
587 		CALLB_CPR_INIT(&cprinfo, &un->un_prr_cpr_mx, callb_md_mrs_cpr,
588 		    "check_resync_regions");
589 
590 		ui = MDI_UNIT(next->ln_id);
591 		(void) md_unit_readerlock(ui);
592 
593 		/*
594 		 * Do not clean up resync regions if it is an ABR
595 		 * mirror, or if a submirror is offline (we will use the resync
596 		 * region to resync when back online) or if there is only one
597 		 * submirror.
598 		 */
599 		if ((ui->ui_tstate & MD_ABR_CAP) ||
600 		    (un->c.un_status & MD_UN_OFFLINE_SM) || (un->un_nsm < 2)) {
601 			md_unit_readerexit(ui);
602 			/* Remove this thread from the CPR callback table. */
603 			mutex_enter(&un->un_prr_cpr_mx);
604 			CALLB_CPR_EXIT(&cprinfo);
605 			continue;
606 		}
607 
608 		(void) process_resync_regions(un, &cprinfo);
609 
610 		md_unit_readerexit(ui);
611 
612 		/* Remove this thread from the CPR callback table. */
613 		mutex_enter(&un->un_prr_cpr_mx);
614 		CALLB_CPR_EXIT(&cprinfo);
615 	}
616 
617 	rw_exit(&mirror_md_ops.md_link_rw.lock);
618 
619 	/* We are done */
620 	mutex_enter(&mirror_timeout.dr_mx);
621 	timeout->dr_pending = 0;
622 	mutex_exit(&mirror_timeout.dr_mx);
623 }
624 
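/*
 * md_mirror_timeout:
 * -----------------
 * Periodic timeout handler: queue check_resync_regions() to the mirror
 * daemon (unless a request is already pending) and re-arm the timeout
 * every MD_MDELAY seconds for as long as any mirror units exist.
 */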
625 static void
626 md_mirror_timeout(void *throwaway)
627 {
628 
629 	mutex_enter(&mirror_timeout.dr_mx);
630 	if (!mirror_timeout.dr_pending) {
631 		mirror_timeout.dr_pending = 1;
632 		daemon_request(&md_mto_daemon, check_resync_regions,
633 		    (daemon_queue_t *)&mirror_timeout, REQ_OLD);
634 	}
635 
636 	if (mirror_md_ops.md_head != NULL)
637 		mirror_timeout.dr_timeout_id = timeout(md_mirror_timeout,
638 		    throwaway, (int)MD_MDELAY*hz);
639 	else
640 		mirror_timeout.dr_timeout_id = 0;
641 
642 	mutex_exit(&mirror_timeout.dr_mx);
643 }
644 
645 void
646 resync_start_timeout(set_t setno)
647 {
648 	if (md_get_setstatus(setno) & MD_SET_STALE)
649 		return;
650 
651 	mutex_enter(&mirror_timeout.dr_mx);
652 	if (mirror_timeout.dr_timeout_id == 0)
653 		mirror_timeout.dr_timeout_id = timeout(md_mirror_timeout,
654 		    (void *)NULL, (int)MD_MDELAY*hz);
655 	mutex_exit(&mirror_timeout.dr_mx);
656 }
657 
658 static void
659 offlined_to_attached(mm_unit_t *un)
660 {
661 	int		i;
662 	int		changed = 0;
663 
664 	if (md_get_setstatus(MD_UN2SET(un)) & MD_SET_STALE)
665 		return;
666 
667 	for (i = 0; i < NMIRROR; i++) {
668 		if (SMS_BY_INDEX_IS(un, i, SMS_OFFLINE)) {
669 			mirror_set_sm_state(&un->un_sm[i],
670 			    &un->un_smic[i], SMS_ATTACHED, 1);
671 			changed++;
672 		}
673 		if (SMS_BY_INDEX_IS(un, i, SMS_OFFLINE_RESYNC)) {
674 			mirror_set_sm_state(&un->un_sm[i],
675 			    &un->un_smic[i], SMS_ATTACHED_RESYNC, 1);
676 			changed++;
677 		}
678 	}
679 
680 	if (changed != 0) {
681 		un->c.un_status &= ~MD_UN_OFFLINE_SM;
682 		mddb_setrecprivate(un->c.un_record_id, MD_PRV_PENDCOM);
683 	}
684 }
685 
686 static void
687 get_unit_resync(mm_unit_t *un)
688 {
689 	mddb_recstatus_t	status;
690 	struct optim_resync	*orp;
691 
692 	if (un->un_rr_dirty_recid == 0) {
693 		offlined_to_attached(un);
694 		return;
695 	}
696 
697 	status = mddb_getrecstatus(un->un_rr_dirty_recid);
698 	if ((status == MDDB_NORECORD) || (status == MDDB_NODATA)) {
699 		un->un_rr_dirty_recid = 0;
700 		offlined_to_attached(un);
701 		return;
702 	}
703 
704 	mddb_setrecprivate(un->un_rr_dirty_recid, MD_PRV_GOTIT);
705 	orp = (struct optim_resync *)mddb_getrecaddr(un->un_rr_dirty_recid);
706 	un->un_dirty_bm = orp->or_rr;
707 }
708 
709 static int
710 create_unit_resync(mm_unit_t *un, int snarfing)
711 {
712 	diskaddr_t	tb;
713 	int		i;
714 	int		blksize;	/* rr size in blocks */
715 	int		num_rr;
716 	mddb_recid_t	recid;
717 	size_t		size;	/* bitmap size */
718 	optim_resync_t	*orp;
719 	mddb_type_t	typ1;
720 	set_t		setno;
721 
722 	tb = un->c.un_total_blocks;
723 
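	/*
	 * Size the resync regions: small metadevices use fixed
	 * MD_MIN_RR_SIZE-block regions, larger ones scale the region size
	 * up so that roughly MD_DEF_NUM_RR regions cover the whole device.
	 */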
724 	if (((tb + MD_MIN_RR_SIZE)/ MD_MIN_RR_SIZE) > MD_DEF_NUM_RR) {
725 		blksize = (int)(tb / MD_DEF_NUM_RR);
726 		num_rr = (int)((tb + (blksize)) / (blksize));
727 	} else {
728 		blksize = MD_MIN_RR_SIZE;
729 		num_rr = (int)((tb + MD_MIN_RR_SIZE) / MD_MIN_RR_SIZE);
730 	}
731 
732 	size = howmany(num_rr, NBBY) + sizeof (*orp) - sizeof (orp->or_rr);
733 
734 	setno = MD_UN2SET(un);
735 
736 	typ1 = (mddb_type_t)md_getshared_key(setno,
737 	    mirror_md_ops.md_driver.md_drivername);
738 
739 	recid =  mddb_createrec(size, typ1, RESYNC_REC,
740 	    MD_CRO_OPTIMIZE|MD_CRO_32BIT, setno);
741 	if (recid < 0) {
742 		if (snarfing && !(md_get_setstatus(setno) & MD_SET_STALE)) {
743 			md_set_setstatus(setno, MD_SET_STALE);
744 			cmn_err(CE_WARN, "md: state database is stale");
745 		}
746 		return (-1);
747 	}
748 
749 	un->un_rr_dirty_recid = recid;
750 	orp = (optim_resync_t *)mddb_getrecaddr(recid);
751 	orp->or_magic = OR_MAGIC;
752 	orp->or_blksize = blksize;
753 	orp->or_num = num_rr;
754 
755 	un->un_rrd_blksize = blksize;
756 	un->un_rrd_num  = num_rr;
757 	un->un_dirty_bm = orp->or_rr;
758 
759 	if (snarfing)
760 		for (i = 0; i < howmany(num_rr, NBBY); i++)
761 			orp->or_rr[i] = 0xFF;
762 
763 	if (!snarfing) {
764 		mddb_commitrec_wrapper(recid);
765 		mirror_commit(un, NO_SUBMIRRORS, 0);
766 		return (0);
767 	}
768 	mddb_setrecprivate(recid, MD_PRV_PENDCOM);
769 	mddb_setrecprivate(un->c.un_record_id, MD_PRV_PENDCOM);
770 	return (0);
771 }
772 
773 int
774 unit_setup_resync(mm_unit_t *un, int snarfing)
775 {
776 	int err;
777 	int syncable;
778 	int i;
779 	mdi_unit_t	*ui = MDI_UNIT(MD_SID(un));
780 	int nonABR = 1;		/* only set if ABR marked in ui_tstate */
781 
782 	un->un_dirty_bm = NULL;
783 	un->un_rs_buffer = NULL;
784 
785 	mutex_init(&un->un_rrp_inflight_mx, "rrp mx", MUTEX_DEFAULT, NULL);
786 
787 	mutex_init(&un->un_resync_mx, NULL, MUTEX_DEFAULT, NULL);
788 	cv_init(&un->un_resync_cv, NULL, CV_DEFAULT, NULL);
789 	un->un_resync_flg = 0;
790 	un->un_waiting_to_mark = 0;
791 	un->un_waiting_to_commit = 0;
792 	un->un_waiting_to_clear = 0;
793 
794 	un->un_goingclean_bm = NULL;
795 	un->un_goingdirty_bm = NULL;
796 	un->un_outstanding_writes = NULL;
797 	un->un_resync_bm = NULL;
798 
799 	if (snarfing)
800 		get_unit_resync(un);
801 
802 	if (un->un_rr_dirty_recid == 0) {
803 		/*
804 		 * If a MN diskset and snarfing and this node is not the
805 		 * master, do not delete any records on snarf of the
806 		 * mirror records (create_unit_resync deletes records).
807 		 *
808 		 * Master node should have already handled this case.
809 		 */
810 		if (MD_MNSET_SETNO(MD_UN2SET(un)) && snarfing &&
811 		    md_set[MD_UN2SET(un)].s_am_i_master == 0) {
812 #ifdef DEBUG
813 			cmn_err(CE_NOTE, "unit_setup_resync: no rr for %s on"
814 			    " nodeid %d\n", md_shortname(MD_SID(un)),
815 			    md_set[MD_UN2SET(un)].s_nodeid);
816 #endif
817 			return (-1);
818 		}
819 		if ((err = create_unit_resync(un, snarfing)) != 0)
820 			return (err);
821 	}
822 
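	/*
	 * Allocate the in-core working state: the going-clean, going-dirty
	 * and resync bitmaps plus the per-region outstanding write counts.
	 */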
823 	un->un_goingclean_bm = (uchar_t *)kmem_zalloc((uint_t)(howmany(
824 	    un->un_rrd_num, NBBY)), KM_SLEEP);
825 	un->un_goingdirty_bm = (uchar_t *)kmem_zalloc((uint_t)(howmany(
826 	    un->un_rrd_num, NBBY)), KM_SLEEP);
827 	un->un_outstanding_writes = (short *)kmem_zalloc(
828 	    (uint_t)un->un_rrd_num * sizeof (short), KM_SLEEP);
829 	un->un_resync_bm = (uchar_t *)kmem_zalloc((uint_t)(howmany(
830 	    un->un_rrd_num, NBBY)), KM_SLEEP);
831 
832 	/*
833 	 * Allocate pernode bitmap for this node. All other nodes' maps will
834 	 * be created 'on-the-fly' in the ioctl message handler
835 	 */
836 	if (MD_MNSET_SETNO(MD_UN2SET(un))) {
837 		un->un_pernode_dirty_sum =
838 		    (uchar_t *)kmem_zalloc(un->un_rrd_num, KM_SLEEP);
839 		if (md_mn_mynode_id > 0) {
840 			un->un_pernode_dirty_bm[md_mn_mynode_id-1] = (uchar_t *)
841 			    kmem_zalloc((uint_t)(howmany(un->un_rrd_num, NBBY)),
842 			    KM_SLEEP);
843 		}
844 
845 		/*
846 		 * Allocate taskq to process deferred (due to locking) RR_CLEAN
847 		 * requests.
848 		 */
849 		un->un_drl_task = (ddi_taskq_t *)md_create_taskq(MD_UN2SET(un),
850 		    MD_SID(un));
851 	}
852 
853 	if (md_get_setstatus(MD_UN2SET(un)) & MD_SET_STALE)
854 		return (0);
855 
856 	/*
857 	 * Only mark mirror which has an associated DRL as requiring a resync.
858 	 * For ABR mirrors we need not set the resync record bitmap up.
859 	 */
860 	if (ui && (ui->ui_tstate & MD_ABR_CAP))
861 		nonABR = 0;
862 
863 	for (i = 0, syncable = 0; i < NMIRROR; i++) {
864 		if (nonABR) {
865 			if ((SUBMIRROR_IS_READABLE(un, i) ||
866 			    SMS_BY_INDEX_IS(un, i,
867 			    (SMS_OFFLINE | SMS_OFFLINE_RESYNC))))
868 				syncable++;
869 		}
870 	}
871 
872 	if (snarfing && un->un_pass_num && (syncable > 1)) {
873 		bcopy((caddr_t)un->un_dirty_bm, (caddr_t)un->un_resync_bm,
874 		    howmany(un->un_rrd_num, NBBY));
875 
876 		un->c.un_status |= (MD_UN_OPT_NOT_DONE | MD_UN_WAR);
877 		un->c.un_status &= ~MD_UN_OFFLINE_SM;
878 		for (i = 0; i < NMIRROR; i++) {
879 			if ((SUBMIRROR_IS_READABLE(un, i)) ||
880 			    SMS_BY_INDEX_IS(un, i, SMS_OFFLINE_RESYNC))
881 				un->un_sm[i].sm_flags |= MD_SM_RESYNC_TARGET;
882 
883 			if (SMS_BY_INDEX_IS(un, i, SMS_OFFLINE)) {
884 				un->un_sm[i].sm_flags |= MD_SM_RESYNC_TARGET;
885 				mirror_set_sm_state(&un->un_sm[i],
886 				    &un->un_smic[i], SMS_OFFLINE_RESYNC, 1);
887 				mddb_setrecprivate(un->c.un_record_id,
888 				    MD_PRV_PENDCOM);
889 			}
890 		}
891 	}
892 	return (0);
893 }
894 
895 /*
896  * resync_kill_pending:
897  * -------------------
898  * Determine if the resync thread has been requested to terminate.
899  * Block if MD_RI_BLOCK or MD_RI_BLOCK_OWNER is set in un->un_rs_thread_flags.
900  * MD_RI_BLOCK is only set as a result of a user-initiated ioctl via metasync.
901  * MD_RI_BLOCK_OWNER is set by the ownership change of a multi-node  mirror.
902  *
903  * Returns:
904  *	0	Kill not pending
905  *	1	Kill requested	(set MD_UN_RESYNC_CANCEL in un->c.un_status)
906  *
907  * Note: this routine may block
908  *	 the writerlock for <ui> will be dropped and reacquired if <mx_type>
909  *	 is set to MD_WRITER_HELD.
910  *	 the readerlock for <ui> will be dropped and reacquired if <mx_type>
911  *	 is set to MD_READER_HELD.
912  */
913 static int
914 resync_kill_pending(
915 	mm_unit_t *un,
916 	mdi_unit_t *ui,
917 	uint_t mx_type)
918 {
919 	int	retval = 0;
920 
921 	/* Ensure that we don't block with any mutex held */
922 	if (mx_type == MD_WRITER_HELD) {
923 		md_unit_writerexit(ui);
924 	} else if (mx_type == MD_READER_HELD) {
925 		md_unit_readerexit(ui);
926 	}
927 	mutex_enter(&un->un_rs_thread_mx);
928 	while (un->un_rs_thread_flags & (MD_RI_BLOCK|MD_RI_BLOCK_OWNER)) {
929 		cv_wait(&un->un_rs_thread_cv, &un->un_rs_thread_mx);
930 		if (un->un_rs_thread_flags & (MD_RI_KILL|MD_RI_SHUTDOWN))
931 			break;
932 	}
933 	/* Determine if we've been asked to abort or shutdown gracefully */
934 	if (un->un_rs_thread_flags & MD_RI_KILL) {
935 		un->c.un_status |= MD_UN_RESYNC_CANCEL;
936 		retval = 1;
937 	} else if (un->un_rs_thread_flags & MD_RI_SHUTDOWN) {
938 		retval = 1;
939 	}
940 	mutex_exit(&un->un_rs_thread_mx);
941 
942 	/* Reacquire mutex if dropped on entry */
943 	if (mx_type == MD_WRITER_HELD) {
944 		(void) md_unit_writerlock(ui);
945 	} else if (mx_type == MD_READER_HELD) {
946 		(void) md_unit_readerlock(ui);
947 	}
948 	return (retval);
949 }
950 
951 /*
952  * resync_read_buffer:
953  * ------------------
954  * Issue the resync source read for the specified start block and size.
955  * This will cause the mirror strategy routine to issue a write-after-read
956  * once this request completes successfully.
957  * If 'flag_err' is set we expect to see a write error flagged in the b_error
958  * field of the buffer created for this i/o request. If clear we do not expect
959  * to see the error flagged for write failures.
960  * Read failures will always set the B_ERROR bit which will stop the resync
961  * immediately.
962  */
963 static int
964 resync_read_buffer(mm_unit_t *un, diskaddr_t blk, size_t cnt, int flag_err)
965 {
966 	md_mcs_t	*sp;
967 	buf_t		*bp;
968 	int		ret = 0;
969 
970 	sp = kmem_cache_alloc(mirror_child_cache, MD_ALLOCFLAGS);
971 	mirror_child_init(sp);
972 
973 	bp = &sp->cs_buf;
974 	bp->b_edev = makedevice(md_major, MD_SID(un));
975 	bp->b_flags = B_READ;
976 	bp->b_lblkno = blk;
977 	bp->b_bcount = dbtob(cnt);
978 	bp->b_un.b_addr = un->un_rs_buffer;
979 	md_unit_readerexit(MDI_UNIT(MD_SID(un)));
980 
981 	(void) md_mirror_strategy(bp, MD_STR_NOTTOP | MD_STR_MAPPED |
982 	    MD_STR_WAR | (flag_err ? MD_STR_FLAG_ERR : 0), NULL);
983 
984 	(void) biowait(bp);
985 
986 	(void) md_unit_readerlock(MDI_UNIT(MD_SID(un)));
987 	if (bp->b_flags & B_ERROR) {
988 		ret = 1;
989 	}
990 	kmem_cache_free(mirror_child_cache, sp);
991 	return (ret);
992 }
993 
994 /*
995  * send_mn_resync_done_message
996  *
997  * At the end of a resync, send a message to all nodes to indicate that
998  * the resync is complete. The argument, flags, has the following values
999  *
1000  * RESYNC_ERR - if an error occurred that terminated the resync
1001  * CLEAR_OPT_NOT_DONE   - Just need to clear the OPT_NOT_DONE flag
1002  *
1003  * unit writerlock set on entry
1004  * Only send the message if the thread is not marked as shutting down:
1005  * [un_rs_thread_flags & MD_RI_SHUTDOWN] or being killed:
1006  * [un->c.un_status & MD_UN_RESYNC_CANCEL]
1007  * or if there has been an error that terminated the resync:
1008  *	flags & RESYNC_ERR
1009  *
1010  */
1011 static void
1012 send_mn_resync_done_message(
1013 	mm_unit_t	*un,
1014 	int		flags
1015 )
1016 {
1017 	md_mn_msg_resync_t	*rmsg = un->un_rs_msg;
1018 	set_t			setno;
1019 	mdi_unit_t		*ui = MDI_UNIT(MD_SID(un));
1020 	md_mn_kresult_t		*kres;
1021 	int			dont_send = 0;
1022 	int			rval;
1023 	int			nretries = 0;
1024 
1025 	rmsg = (md_mn_msg_resync_t *)un->un_rs_msg;
1026 
1027 	/*
1028 	 * Only send the message if this resync thread is still active. This
1029 	 * handles the case where ownership changes to different nodes during
1030 	 * a resync can cause multiple spurious resync_done messages to occur
1031 	 * when the resync completes. This happens because only one node is
1032 	 * the resync owner but other nodes will have their resync_unit thread
1033 	 * blocked in 'resync_kill_pending'
1034 	 */
1035 	mutex_enter(&un->un_rs_thread_mx);
1036 	dont_send = (un->un_rs_thread_flags & (MD_RI_KILL|MD_RI_SHUTDOWN)) ? 1
1037 	    : 0;
1038 	mutex_exit(&un->un_rs_thread_mx);
1039 	dont_send |= (un->c.un_status & MD_UN_RESYNC_CANCEL) ? 1 : 0;
1040 
1041 	/*
1042 	 * Always send a message if we've encountered an error that terminated
1043 	 * the resync.
1044 	 */
1045 	if (flags & RESYNC_ERR)
1046 		dont_send = 0;
1047 
1048 	if (dont_send) {
1049 #ifdef DEBUG
1050 		if (mirror_debug_flag) {
1051 			printf("Don't send resync done message, mnum = %x,"
1052 			    " type = %x, flags = %d\n", MD_SID(un),
1053 			    un->un_rs_type, flags);
1054 		}
1055 #endif  /* DEBUG */
1056 		return;
1057 	}
1058 
1059 #ifdef DEBUG
1060 	if (mirror_debug_flag) {
1061 		printf("send resync done message, mnum = %x, type = %x\n",
1062 		    MD_SID(un), un->un_rs_type);
1063 	}
1064 #endif
1065 
1066 	rmsg->msg_resync_mnum = MD_SID(un);
1067 	rmsg->msg_resync_type = un->un_rs_type;
1068 	rmsg->msg_originator = md_mn_mynode_id;
1069 	rmsg->msg_resync_flags = 0;
1070 	if (flags & RESYNC_ERR)
1071 		rmsg->msg_resync_flags |= MD_MN_RS_ERR;
1072 	if (flags & CLEAR_OPT_NOT_DONE)
1073 		rmsg->msg_resync_flags |= MD_MN_RS_CLEAR_OPT_NOT_DONE;
1074 
1075 	setno = MD_MIN2SET(MD_SID(un));
1076 	md_unit_writerexit(ui);
1077 	kres = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
1078 
1079 smrd_msg:
1080 	mutex_enter(&un->un_rs_cpr_mx);
1081 	CALLB_CPR_SAFE_BEGIN(&un->un_rs_cprinfo);
1082 
1083 	rval = mdmn_ksend_message(setno, MD_MN_MSG_RESYNC_PHASE_DONE,
1084 	    MD_MSGF_NO_LOG, 0, (char *)rmsg, sizeof (md_mn_msg_resync_t), kres);
1085 
1086 	CALLB_CPR_SAFE_END(&un->un_rs_cprinfo, &un->un_rs_cpr_mx);
1087 	mutex_exit(&un->un_rs_cpr_mx);
1088 
1089 	/* if the node hasn't yet joined, it's Ok. */
1090 	if ((!MDMN_KSEND_MSG_OK(rval, kres)) &&
1091 	    (kres->kmmr_comm_state !=  MDMNE_NOT_JOINED)) {
1092 		mdmn_ksend_show_error(rval, kres, "RESYNC_PHASE_DONE");
1093 		/* If we're shutting down already, pause things here. */
1094 		if (kres->kmmr_comm_state == MDMNE_RPC_FAIL) {
1095 			while (!md_mn_is_commd_present()) {
1096 				delay(md_hz);
1097 			}
1098 			/*
1099 			 * commd is now available again. Retry the message once.
1100 			 * If this fails we panic as the system is in an
1101 			 * unexpected state.
1102 			 */
1103 			if (nretries++ == 0)
1104 				goto smrd_msg;
1105 		}
1106 		cmn_err(CE_PANIC, "ksend_message failure: RESYNC_PHASE_DONE");
1107 	}
1108 	kmem_free(kres, sizeof (md_mn_kresult_t));
1109 	(void) md_unit_writerlock(ui);
1110 }
1111 
1112 /*
1113  * send_mn_resync_next_message
1114  *
1115  * Send a message to all nodes indicating the next region to be resynced.
1116  * The message contains the region to be resynced and the current position in
1117  * the resync as denoted by un_rs_resync_done and un_rs_resync_2_do.
1118  * On entry the unit readerlock is held.
1119  */
1120 static void
1121 send_mn_resync_next_message(
1122 	mm_unit_t	*un,
1123 	diskaddr_t	currentblk,
1124 	size_t		rsize,
1125 	int		flags
1126 )
1127 {
1128 	md_mn_msg_resync_t	*rmsg = un->un_rs_msg;
1129 	set_t			setno;
1130 	md_mn_kresult_t		*kres;
1131 	mdi_unit_t		*ui = MDI_UNIT(MD_SID(un));
1132 	int			rval;
1133 	md_mps_t		*ps;
1134 	mm_submirror_t		*sm;
1135 	int			smi;
1136 	int			nretries = 0;
1137 
1138 	ASSERT(rmsg != NULL);
1139 #ifdef DEBUG
1140 	if (mirror_debug_flag) {
1141 		printf("send resync next message, mnum = %x, start=%lld, "
1142 		    "size=%ld, type=%x, done=%lld, 2_do=%lld\n",
1143 		    MD_SID(un), currentblk, rsize, un->un_rs_type,
1144 		    un->un_rs_resync_done, un->un_rs_resync_2_do);
1145 	}
1146 #endif
1147 	rmsg->msg_resync_mnum = MD_SID(un);
1148 	rmsg->msg_resync_type = un->un_rs_type;
1149 	rmsg->msg_resync_start = currentblk;
1150 	rmsg->msg_resync_rsize = rsize;
1151 	rmsg->msg_resync_done = un->un_rs_resync_done;
1152 	rmsg->msg_resync_2_do = un->un_rs_resync_2_do;
1153 	rmsg->msg_originator = md_mn_mynode_id;
1154 	if (flags & MD_FIRST_RESYNC_NEXT)
1155 		rmsg->msg_resync_flags = MD_MN_RS_FIRST_RESYNC_NEXT;
1156 
1157 	/*
1158 	 * Copy current submirror state and flags into message. This provides
1159 	 * a means of keeping all nodes that are currently active in the cluster
1160 	 * synchronised with regards to their submirror state settings. If we
1161 	 * did not pass this information here, the only time every node gets
1162 	 * submirror state updated is at the end of a resync phase. This can be
1163 	 * a significant amount of time for large metadevices.
1164 	 */
1165 	for (smi = 0; smi < NMIRROR; smi++) {
1166 		sm = &un->un_sm[smi];
1167 		rmsg->msg_sm_state[smi] = sm->sm_state;
1168 		rmsg->msg_sm_flags[smi] = sm->sm_flags;
1169 	}
1170 	setno = MD_MIN2SET(MD_SID(un));
1171 	md_unit_readerexit(ui);
1172 	kres = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
1173 
1174 smrn_msg:
1175 	mutex_enter(&un->un_rs_cpr_mx);
1176 	CALLB_CPR_SAFE_BEGIN(&un->un_rs_cprinfo);
1177 
1178 	rval = mdmn_ksend_message(setno, MD_MN_MSG_RESYNC_NEXT, MD_MSGF_NO_LOG,
1179 	    0, (char *)rmsg, sizeof (md_mn_msg_resync_t), kres);
1180 
1181 	CALLB_CPR_SAFE_END(&un->un_rs_cprinfo, &un->un_rs_cpr_mx);
1182 	mutex_exit(&un->un_rs_cpr_mx);
1183 
1184 	if (!MDMN_KSEND_MSG_OK(rval, kres)) {
1185 		mdmn_ksend_show_error(rval, kres, "RESYNC_NEXT");
1186 		/* If we're shutting down already, pause things here. */
1187 		if (kres->kmmr_comm_state == MDMNE_RPC_FAIL) {
1188 			while (!md_mn_is_commd_present()) {
1189 				delay(md_hz);
1190 			}
1191 			/*
1192 			 * commd is now available again. Retry the message once.
1193 			 * If this fails we panic as the system is in an
1194 			 * unexpected state.
1195 			 */
1196 			if (nretries++ == 0)
1197 				goto smrn_msg;
1198 		}
1199 		cmn_err(CE_PANIC, "ksend_message failure: RESYNC_NEXT");
1200 	}
1201 	kmem_free(kres, sizeof (md_mn_kresult_t));
1202 	(void) md_unit_readerlock(ui);
1203 	ps = un->un_rs_prev_overlap;
1204 
1205 	/* Allocate previous overlap reference if needed */
1206 	if (ps == NULL) {
1207 		ps = kmem_cache_alloc(mirror_parent_cache, MD_ALLOCFLAGS);
1208 		ps->ps_un = un;
1209 		ps->ps_ui = ui;
1210 		ps->ps_firstblk = 0;
1211 		ps->ps_lastblk = 0;
1212 		ps->ps_flags = 0;
1213 		md_unit_readerexit(ui);
1214 		(void) md_unit_writerlock(ui);
1215 		un->un_rs_prev_overlap = ps;
1216 		md_unit_writerexit(ui);
1217 		(void) md_unit_readerlock(ui);
1218 	}
1219 
1220 	ps->ps_firstblk = currentblk;
1221 	ps->ps_lastblk = currentblk + rsize - 1;
1222 }
1223 
1224 static int
1225 resync_read_blk_range(
1226 	mm_unit_t *un,
1227 	diskaddr_t currentblk,
1228 	diskaddr_t stopbefore,
1229 	uint_t type,
1230 	int	flags
1231 )
1232 {
1233 	size_t copysize;	/* limited by max xfer buf size */
1234 	size_t rsize;		/* size of resync block (for MN) */
1235 	set_t		setno;
1236 	diskaddr_t	newstop;
1237 	diskaddr_t	rs_startblk;
1238 	uint_t		rs_type;
1239 	int		flags1 = flags & MD_FIRST_RESYNC_NEXT;
1240 
1241 	rs_type = un->un_rs_type;
1242 	rs_startblk = currentblk;
1243 	if (stopbefore > un->c.un_total_blocks)
1244 		stopbefore = un->c.un_total_blocks;
1245 	if (currentblk < un->un_resync_startbl)
1246 		currentblk = un->un_resync_startbl;
1247 
1248 	copysize = un->un_rs_copysize;
1249 	rsize = MD_DEF_RESYNC_BLK_SZ;
1250 
1251 	setno = MD_MIN2SET(MD_SID(un));
1252 	while (currentblk < stopbefore) {
1253 		/*
1254 		 * Split the block up into units of MD_DEF_RESYNC_BLK_SZ and
1255 		 * if a MN device and sendflag is set, send a RESYNC_MESSAGE
1256 		 * to all nodes.
1257 		 */
1258 		if ((currentblk + MD_DEF_RESYNC_BLK_SZ) > stopbefore)
1259 			rsize = stopbefore - currentblk;
1260 		if (MD_MNSET_SETNO(setno) && (flags & MD_SEND_MESS_XMIT)) {
1261 			un->un_resync_startbl = currentblk;
1262 			rs_startblk = currentblk;
1263 			send_mn_resync_next_message(un, currentblk, rsize,
1264 			    flags1);
1265 			if (flags1)
1266 				flags1 = 0;
1267 			/* check to see if we've been asked to terminate */
1268 			if (resync_kill_pending(un, MDI_UNIT(MD_SID(un)), type))
1269 				return ((un->c.un_status & MD_UN_RESYNC_CANCEL)
1270 				    ? 1:0);
1271 			/*
1272 			 * Check to see if another node has completed this
1273 			 * block, if so either the type or the resync region
1274 			 * will have changed. If the resync type has changed,
1275 			 * just exit.
1276 			 * If the resync region has changed, reset currentblk
1277 			 * to the start of the current resync region and
1278 			 * continue.
1279 			 */
1280 			if (un->un_rs_type != rs_type)
1281 				return (0);
1282 			if (un->un_rs_prev_overlap->ps_firstblk >
1283 			    rs_startblk) {
1284 				currentblk =
1285 				    un->un_rs_prev_overlap->ps_firstblk;
1286 				continue;
1287 			}
1288 		}
1289 		newstop = currentblk + rsize;
1290 		while (currentblk < newstop) {
1291 			if ((currentblk + copysize) > stopbefore)
1292 				copysize = (size_t)(stopbefore - currentblk);
1293 			if (resync_read_buffer(un, currentblk, copysize,
1294 			    (flags & MD_RESYNC_FLAG_ERR)))
1295 				return (1);
1296 
1297 			/* resync_read_buffer releases/grabs a new lock */
1298 			un = (mm_unit_t *)MD_UNIT(MD_SID(un));
1299 			currentblk += copysize;
1300 
1301 			/* check to see if we've been asked to terminate */
1302 			if (resync_kill_pending(un, MDI_UNIT(MD_SID(un)), type))
1303 				return ((un->c.un_status & MD_UN_RESYNC_CANCEL)
1304 				    ? 1:0);
1305 			if (MD_MNSET_SETNO(setno)) {
1306 				/*
1307 				 * Check to see if another node has completed
1308 				 * this block, see above
1309 				 */
1310 				if (un->un_rs_type != rs_type)
1311 					return (0);
1312 				if (un->un_rs_prev_overlap->ps_firstblk >
1313 				    rs_startblk)
1314 					currentblk =
1315 					    un->un_rs_prev_overlap->ps_firstblk;
1316 			}
1317 		}
1318 	}
1319 	return (0);
1320 }
1321 
1322 static void
1323 optimized_resync(mm_unit_t *un)
1324 {
1325 	mdi_unit_t	*ui;
1326 	minor_t		mnum;
1327 	int		rr, smi;
1328 	int		resync_regions;
1329 	uchar_t		*dirtyregions;
1330 	diskaddr_t	first, stopbefore;
1331 	int		err;
1332 	int		cnt;
1333 	sm_state_t	state;
1334 	int		broke_out = 0;
1335 	set_t		setno;
1336 	uint_t		old_rs_type = un->un_rs_type;
1337 	uint_t		old_rs_done;
1338 	uint_t		flags1 = MD_FIRST_RESYNC_NEXT|MD_RESYNC_FLAG_ERR;
1339 	size_t		start_rr;
1340 
1341 	mnum = MD_SID(un);
1342 	ui = MDI_UNIT(mnum);
1343 	setno = MD_UN2SET(un);
1344 
1345 	if (!(un->c.un_status & MD_UN_OPT_NOT_DONE)) {
1346 		/*
1347 		 * We aren't marked as needing a resync so for multi-node
1348 		 * sets we flag the completion so that all nodes see the same
1349 		 * metadevice state. This is a problem when a new node joins
1350 		 * an existing set as it has to perform a 'metasync -r' and
1351 		 * we have to step through all of the resync phases. If we
1352 		 * don't do this the nodes that were already in the set will
1353 		 * have the metadevices marked as 'Okay' but the joining node
1354 		 * will have 'Needs Maintenance' which is unclearable.
1355 		 */
1356 		if (MD_MNSET_SETNO(setno)) {
1357 			send_mn_resync_done_message(un, CLEAR_OPT_NOT_DONE);
1358 		}
1359 		return;
1360 	}
1361 
1362 	/*
1363 	 * No need for optimized resync if ABR set, clear rs_type and flags
1364 	 * and exit
1365 	 */
1366 	if (ui->ui_tstate & MD_ABR_CAP) {
1367 		un->un_rs_type = MD_RS_NONE;
1368 		un->c.un_status &= ~(MD_UN_OPT_NOT_DONE | MD_UN_WAR);
1369 		return;
1370 	}
1371 
1372 	un->un_rs_dropped_lock = 1;
1373 	un->c.un_status |= MD_UN_WAR;
1374 	resync_regions = un->un_rrd_num;
1375 	dirtyregions = un->un_resync_bm;
1376 	md_unit_writerexit(ui);
1377 
1378 	/* For MN sets, resync NOTIFY is done when processing resync messages */
1379 	if (!MD_MNSET_SETNO(setno)) {
1380 		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_START,
1381 		    SVM_TAG_METADEVICE, setno, MD_SID(un));
1382 	}
1383 	un = (mm_unit_t *)md_unit_readerlock(ui);
1384 
1385 	/* check to see if we've been asked to terminate */
1386 	if (resync_kill_pending(un, MDI_UNIT(MD_SID(un)), MD_READER_HELD)) {
1387 		if (un->c.un_status & MD_UN_RESYNC_CANCEL)
1388 			broke_out = RESYNC_ERR;
1389 	}
1390 	/*
1391 	 * Check that we are still performing an optimized
1392 	 * resync. If not, another node must have completed it
1393 	 * so we have no more work to do.
1394 	 */
1395 	if (un->un_rs_type != old_rs_type) {
1396 		md_unit_readerexit(ui);
1397 		(void) md_unit_writerlock(ui);
1398 		return;
1399 	}
1400 	/*
1401 	 * If rs_resync_done is non-zero, we must be completing an optimized
1402 	 * resync that has already been partially done on another node.
1403 	 * Therefore clear the bits in resync_bm for the resync regions
1404 	 * already done. If resync_startbl is zero, calculate 2_do.
1405 	 */
1406 	if (un->un_rs_resync_done > 0) {
1407 		BLK_TO_RR(start_rr, un->un_resync_startbl, un);
1408 		for (rr = 0; rr < start_rr && rr < resync_regions; rr++)
1409 			CLR_KEEPDIRTY(rr, un);
1410 	} else {
1411 		un->un_rs_resync_2_do = 0;
1412 		for (rr = 0; rr < resync_regions; rr++)
1413 			if (isset(dirtyregions, rr))
1414 				un->un_rs_resync_2_do++;
1415 	}
1416 
1417 	for (rr = 0; (rr < resync_regions) && (broke_out != RESYNC_ERR); rr++) {
1418 		if (isset(dirtyregions, rr)) {
1419 			RR_TO_BLK(first, rr, un);
1420 			RR_TO_BLK(stopbefore, rr+1, un);
1421 			old_rs_type = un->un_rs_type;
1422 			old_rs_done = un->un_rs_resync_done;
1423 			err = resync_read_blk_range(un, first, stopbefore,
1424 			    MD_READER_HELD, MD_SEND_MESS_XMIT | flags1);
1425 			flags1 = MD_RESYNC_FLAG_ERR;
1426 
1427 			/* resync_read_blk_range releases/grabs a new lock */
1428 			un = (mm_unit_t *)MD_UNIT(mnum);
1429 
1430 			if (err) {
1431 				broke_out = RESYNC_ERR;
1432 				break;
1433 			}
1434 
1435 			/*
1436 			 * Check that we are still performing an optimized
1437 			 * resync. If not, another node must have completed it
1438 			 * so we have no more work to do.
1439 			 */
1440 			if (un->un_rs_type != old_rs_type) {
1441 				md_unit_readerexit(ui);
1442 				(void) md_unit_writerlock(ui);
1443 				return;
1444 			}
1445 
1446 			/*
1447 			 * If resync_done has increased, we must have
1448 			 * blocked in resync_read_blk_range while another node
1449 			 * continued with the resync. Therefore clear resync_bm
1450 			 * for the blocks that have been resynced on another
1451 			 * node and update rr to the next RR to be done.
1452 			 */
1453 			if (old_rs_done < un->un_rs_resync_done) {
1454 				int i;
1455 				BLK_TO_RR(start_rr, un->un_resync_startbl - 1,
1456 				    un);
1457 				for (i = rr; i < start_rr; i++)
1458 					CLR_KEEPDIRTY(i, un);
1459 				rr = start_rr;
1460 			} else
1461 				un->un_rs_resync_done++;
1462 
1463 			for (smi = 0, cnt = 0; smi < NMIRROR; smi++)
1464 				if (SUBMIRROR_IS_WRITEABLE(un, smi) &&
1465 				    !(SMS_BY_INDEX_IS(un, smi, SMS_ALL_ERRED)))
1466 					cnt++;
1467 			if (cnt < 2) {
1468 				broke_out = RESYNC_ERR;
1469 				break;
1470 			}
1471 			CLR_KEEPDIRTY(rr, un);
1472 			/* Check to see if we've completed the resync cleanly */
1473 			if (un->un_rs_thread_flags & MD_RI_SHUTDOWN)
1474 				break;
1475 
1476 			/*
1477 			 * Check that we haven't exceeded un_rs_resync_2_do. If
1478 			 * we have we've completed the resync.
1479 			 */
1480 			if (un->un_rs_resync_done > un->un_rs_resync_2_do)
1481 				break;
1482 		}
1483 	}
1484 	md_unit_readerexit(ui);
1485 	un = (mm_unit_t *)md_unit_writerlock(ui);
1486 
1487 	/*
1488 	 * If MN set send message to all nodes to indicate resync
1489 	 * phase is complete. The processing of the message will update the
1490 	 * mirror state
1491 	 */
1492 	if (MD_MNSET_SETNO(setno)) {
1493 		send_mn_resync_done_message(un, broke_out);
1494 	} else {
1495 
1496 		if (!broke_out)
1497 			un->c.un_status &= ~MD_UN_WAR;
1498 
1499 		un->c.un_status &= ~MD_UN_KEEP_DIRTY;
1500 
1501 		setno = MD_UN2SET(un);
1502 		for (smi = 0; smi < NMIRROR; smi++) {
1503 			un->un_sm[smi].sm_flags &= ~MD_SM_RESYNC_TARGET;
1504 			if (SMS_BY_INDEX_IS(un, smi, SMS_OFFLINE_RESYNC)) {
1505 				state = (broke_out ? SMS_OFFLINE : SMS_RUNNING);
1506 				mirror_set_sm_state(&un->un_sm[smi],
1507 				    &un->un_smic[smi], state, broke_out);
1508 				mirror_commit(un, NO_SUBMIRRORS, 0);
1509 			}
1510 			if (SMS_BY_INDEX_IS(un, smi, SMS_OFFLINE))
1511 				un->c.un_status |= MD_UN_OFFLINE_SM;
1512 		}
1513 	}
1514 
1515 	/* For MN sets, resync NOTIFY is done when processing resync messages */
1516 	if (!MD_MNSET_SETNO(setno)) {
1517 		if (broke_out) {
1518 			SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_FAILED,
1519 			    SVM_TAG_METADEVICE, MD_UN2SET(un), MD_SID(un));
1520 		} else {
1521 			SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_DONE,
1522 			    SVM_TAG_METADEVICE, MD_UN2SET(un), MD_SID(un));
1523 		}
1524 	}
1525 }
1526 
1527 /*
1528  * recalc_resync_done
1529  *
1530  * This function deals with a change in value of un_rs_resync_2_do in a
1531  * component resync. This may change if we are restarting a component
1532  * resync on a single node having rebooted with a different value of
1533  * md_resync_bufsz or if we are running in a multi-node with nodes having
1534  * different values of md_resync_bufsz.
1535  * If there is a change in un_rs_resync_2_do, we need to recalculate
1536  * the value of un_rs_resync_done given the new value for resync_2_do.
1537  * We have to calculate a new value for resync_done to be either
1538  * if un_resync_startbl is set, (un_resync_startbl - initblock)/(blksize + skip)
1539  * or if it is not set, we need to calculate it from un_rs_resync_done,
1540  * (un_rs_resync_done/un_rs_resync_2_do) * resync_2_do
1541  * In addition we need to deal with the overflow case by using a factor to
1542  * prevent overflow
1543  */
1544 
1545 static void
1546 recalc_resync_done(mm_unit_t *un, size_t resync_2_do, diskaddr_t initblock,
1547     u_longlong_t blk_size, u_longlong_t skip)
1548 {
1549 	diskaddr_t		x;
1550 	uint_t			factor = 1;
1551 
1552 	/*
1553 	 * If resync_2_do has not yet been calculated, no need to modify
1554 	 * resync_done
1555 	 */
1556 	if (un->un_rs_resync_2_do == 0) {
1557 		return;
1558 	}
1559 	if (un->un_rs_resync_2_do == resync_2_do)
1560 		return; /* No change, so nothing to do */
1561 	/*
1562 	 * If un_rs_startbl is set, another node must have already started
1563 	 * this resync and hence we can calculate resync_done from
1564 	 * resync_startbl
1565 	 */
1566 	if (un->un_resync_startbl) {
1567 		un->un_rs_resync_done = (un->un_resync_startbl - initblock) /
1568 		    (blk_size + skip);
1569 		return;
1570 	}
1571 	/*
1572 	 * un_resync_startbl is not set so we must calculate it from
1573 	 * un_rs_resync_done.
1574 	 * If the larger of the two values of resync_2_do is greater than 32
1575 	 * bits, calculate a factor to divide by to ensure that we don't
1576 	 * overflow 64 bits when calculating the new value for resync_done
1577 	 */
1578 	x = (un->un_rs_resync_2_do > resync_2_do) ? un->un_rs_resync_2_do :
1579 	    resync_2_do;
1580 	while (x > INT32_MAX) {
1581 		x = x >> 1;
1582 		factor = factor << 1;
1583 	}
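	/*
	 * Scale both terms down by 'factor' before multiplying so the
	 * product cannot overflow. As an illustration only: resync_done of
	 * 100 with an old resync_2_do of 400 and a new resync_2_do of 800
	 * gives a new resync_done of roughly 200, i.e. the same fraction
	 * of the work is recorded as done.
	 */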
1584 	un->un_rs_resync_done = ((un->un_rs_resync_done/factor) *
1585 	    (resync_2_do/factor)) /
1586 	    ((un->un_rs_resync_2_do + (factor * factor) - 1)/
1587 	    (factor * factor));
1588 }
1589 
1590 static void
1591 check_comp_4_resync(mm_unit_t *un, int smi, int ci)
1592 {
1593 	mdi_unit_t		*ui;
1594 	minor_t			mnum;
1595 	mm_submirror_t		*sm;
1596 	mm_submirror_ic_t	*smic;
1597 	size_t			count;
1598 	u_longlong_t		skip;
1599 	u_longlong_t		size;
1600 	u_longlong_t		blk_size;
1601 	diskaddr_t		initblock;
1602 	diskaddr_t		block;
1603 	diskaddr_t		frag = 0;
1604 	md_m_shared_t		*shared;
1605 	int			err;
1606 	set_t			setno;
1607 	int			broke_out = 0;
1608 	int			blks;
1609 	uint_t			old_rs_type = un->un_rs_type;
1610 	diskaddr_t		old_rs_done;
1611 	uint_t			flags1 = MD_FIRST_RESYNC_NEXT;
1612 	diskaddr_t		resync_2_do;
1613 
1614 	mnum = MD_SID(un);
1615 	ui = MDI_UNIT(mnum);
1616 	sm = &un->un_sm[smi];
1617 	smic = &un->un_smic[smi];
1618 	setno = MD_UN2SET(un);
1619 
1620 	shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
1621 	    (sm->sm_dev, sm, ci);
1622 
1623 	if (shared->ms_state != CS_RESYNC) {
1624 		SET_RS_TYPE_NONE(un->un_rs_type);
1625 		return;
1626 	}
1627 
1628 	if (shared->ms_flags & MDM_S_RS_TRIED) {
1629 		SET_RS_TYPE_NONE(un->un_rs_type);
1630 		return;
1631 	}
1632 
1633 	(void) (*(smic->sm_get_bcss))
1634 	    (sm->sm_dev, sm, ci, &initblock, &count, &skip, &size);
1635 
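	/*
	 * A component that is a single contiguous extent (count == 1,
	 * skip == 0) is resynced in un_rs_copysize-block chunks; a
	 * non-zero remainder ('frag') adds one extra iteration.
	 */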
1636 	if ((count == 1) && (skip == 0)) {
1637 		count = (size_t)(size / un->un_rs_copysize);
1638 		if ((frag = (size - (count * un->un_rs_copysize))) != 0)
1639 			count++;
1640 		size = (u_longlong_t)un->un_rs_copysize;
1641 	}
1642 	blk_size = size; /* Save block size for this resync */
1643 
1644 	ASSERT(count >= 1);
1645 	resync_2_do = count;
1646 	/*
1647 	 * If part way through a resync, un_rs_resync_done/un_rs_resync_2_do
1648 	 * gives the proportion of the resync that has already been done.
1649 	 * If un_rs_copysize has changed since this previous partial resync,
1650 	 * either because this node has been rebooted with a different value
1651 	 * for md_resync_bufsz or because another node with a different value
1652 	 * for md_resync_bufsz performed the previous resync, we need to
1653 	 * recalculate un_rs_resync_done as a proportion of our value of
1654 	 * resync_2_do.
1655 	 */
1656 	recalc_resync_done(un, resync_2_do, initblock, blk_size, skip);
1657 
1658 	/*
1659 	 * For MN mirrors we need to send a message to all nodes indicating
1660 	 * the next region to be resynced. For a component resync, the size of
1661 	 * the contiguous region that is processed by resync_read_blk_range()
1662  * may be as small as the interleave size.
1663 	 * Therefore, rather than sending the message within
1664 	 * resync_read_blk_range(), we will send a message every
1665 	 * MD_DEF_RESYNC_BLK_SZ blocks. Calculate the frequency in terms of
1666 	 * the number of blocks. Then, if we are restarting a resync, round
1667 	 * un_rs_resync_done down to the previous resync region boundary. This
1668 	 * ensures that we send a RESYNC_NEXT message before resyncing any
1669 	 * blocks
1670 	 */
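	/*
	 * As an illustration only: if blk_size + skip is 32 blocks and
	 * MD_DEF_RESYNC_BLK_SZ is 2048 blocks, a RESYNC_NEXT message is
	 * sent every blks = 64 iterations, and a restarted resync rounds
	 * un_rs_resync_done down to a multiple of 64.
	 */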
1671 	if (MD_MNSET_SETNO(setno)) {
1672 		blks = ((MD_DEF_RESYNC_BLK_SZ + blk_size + skip - 1)/
1673 		    (blk_size + skip));
1674 		un->un_rs_resync_done = (un->un_rs_resync_done/blks) * blks;
1675 	}
1676 	/*
1677 	 * un_rs_resync_done is the number of ('size' + 'skip') increments
1678 	 * already resynced from the base 'block'
1679 	 * un_rs_resync_2_do is the number of iterations in
1680 	 * this component resync.
1681 	 */
1682 	ASSERT(count >= un->un_rs_resync_done);
1683 	un->un_rs_resync_2_do = (diskaddr_t)count;
1684 
1685 	un->c.un_status |= MD_UN_WAR;
1686 	sm->sm_flags |= MD_SM_RESYNC_TARGET;
1687 	md_unit_writerexit(ui);
1688 
1689 	/* For MN sets, resync NOTIFY is done when processing resync messages */
1690 	if (!MD_MNSET_SETNO(setno)) {
1691 		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_START,
1692 		    SVM_TAG_METADEVICE, setno, MD_SID(un));
1693 	}
1694 	un = (mm_unit_t *)md_unit_readerlock(ui);
1695 
1696 	/* check to see if we've been asked to terminate */
1697 	if (resync_kill_pending(un, MDI_UNIT(MD_SID(un)), MD_READER_HELD)) {
1698 		if (un->c.un_status & MD_UN_RESYNC_CANCEL)
1699 			broke_out = RESYNC_ERR;
1700 	}
1701 	/*
1702 	 * Check that we are still performing the same component
1703 	 * resync. If not, another node must have completed it
1704 	 * so we have no more work to do.
1705 	 */
1706 	if (un->un_rs_type != old_rs_type) {
1707 		md_unit_readerexit(ui);
1708 		(void) md_unit_writerlock(ui);
1709 		return;
1710 	}
1711 	/*
1712 	 * Adjust resync_done, resync_2_do, start of resync area and count to
1713 	 * skip already resync'd data. We need to recalculate resync_done as
1714 	 * we have dropped the unit lock above and may have lost ownership to
1715 	 * another node, with a different resync buffer size and it may have
1716 	 * sent us new values of resync_done and resync_2_do based on its
1717 	 * resync buffer size
1718 	 */
1719 	recalc_resync_done(un, resync_2_do, initblock, blk_size, skip);
1720 	un->un_rs_resync_2_do = resync_2_do;
1721 	count -= un->un_rs_resync_done;
1722 	block = initblock + ((blk_size + skip) * (int)un->un_rs_resync_done);
1723 
1724 	un->un_rs_dropped_lock = 1;
1725 	while ((count > 0) && (broke_out != RESYNC_ERR)) {
1726 		old_rs_done = un->un_rs_resync_done;
1727 		/*
1728 		 * For MN mirrors send a message to the other nodes. This
1729 		 * message includes the size of the region that must be blocked
1730 		 * for all writes
1731 		 */
1732 		if (MD_MNSET_SETNO(setno)) {
1733 			if ((un->un_rs_resync_done%blks == 0)) {
1734 				un->un_resync_startbl = block;
1735 				send_mn_resync_next_message(un, block,
1736 				    (blk_size+skip)*blks, flags1);
1737 				flags1 = 0;
1738 				/*
1739 				 * check to see if we've been asked to
1740 				 * terminate
1741 				 */
1742 				if (resync_kill_pending(un,
1743 				    MDI_UNIT(MD_SID(un)), MD_READER_HELD)) {
1744 					if (un->c.un_status &
1745 					    MD_UN_RESYNC_CANCEL) {
1746 						broke_out = RESYNC_ERR;
1747 						break;
1748 					}
1749 				}
1750 
1751 				/*
1752 				 * Check that we are still performing the same
1753 				 * component resync. If not, another node must
1754 				 * have completed it so we have no more work to
1755 				 * do. Also reset count to remaining resync as
1756 				 * we may have lost ownership in
1757 				 * send_mn_resync_next_message while another
1758 				 * node continued with the resync and
1759 				 * incremented resync_done.
1760 				 */
1761 				if (un->un_rs_type != old_rs_type) {
1762 					md_unit_readerexit(ui);
1763 					(void) md_unit_writerlock(ui);
1764 					return;
1765 				}
1766 				/*
1767 				 * recalculate resync_done, resync_2_do
1768 				 * We need to recalculate resync_done as
1769 				 * we have dropped the unit lock in
1770 				 * send_mn_resync_next_message above and may
1771 				 * have lost ownership to another node, with a
1772 				 * different resync buffer size and it may have
1773 				 * sent us new values of resync_done and
1774 				 * resync_2_do based on its resync buffer size
1775 				 */
1776 				recalc_resync_done(un, resync_2_do, initblock,
1777 				    blk_size, skip);
1778 				un->un_rs_resync_2_do = resync_2_do;
1779 				count = un->un_rs_resync_2_do -
1780 				    un->un_rs_resync_done;
1781 				/*
1782 				 * Adjust start of resync area to skip already
1783 				 * resync'd data
1784 				 */
1785 				block = initblock + ((blk_size + skip) *
1786 				    (int)un->un_rs_resync_done);
1787 				old_rs_done = un->un_rs_resync_done;
1788 			}
1789 		}
1790 		err = resync_read_blk_range(un, block, block + size,
1791 		    MD_READER_HELD, MD_RESYNC_FLAG_ERR);
1792 
1793 		/* resync_read_blk_range releases/grabs a new lock */
1794 		un = (mm_unit_t *)MD_UNIT(mnum);
1795 
1796 		if (err) {
1797 			broke_out = RESYNC_ERR;
1798 			break;
1799 		}
1800 		/*
1801 		 * If we are no longer resyncing this component, return as
1802 		 * another node has progressed the resync.
1803 		 */
1804 		if (un->un_rs_type != old_rs_type) {
1805 			md_unit_readerexit(ui);
1806 			(void) md_unit_writerlock(ui);
1807 			return;
1808 		}
1809 
1810 		/*
1811 		 * recalculate resync_done, resync_2_do. We need to recalculate
1812 		 * resync_done as we have dropped the unit lock in
1813 		 * resync_read_blk_range above and may have lost ownership to
1814 		 * another node, with a different resync buffer size and it may
1815 		 * have sent us new values of resync_done and resync_2_do based
1816 		 * on its resync buffer size
1817 		 */
1818 		recalc_resync_done(un, resync_2_do, initblock, blk_size, skip);
1819 		un->un_rs_resync_2_do = resync_2_do;
1820 
1821 		/*
1822 		 * Reset count to remaining resync as we may have blocked in
1823 		 * resync_read_blk_range while another node continued
1824 		 * with the resync and incremented resync_done. Also adjust
1825 		 * start of resync area to skip already resync'd data.
1826 		 */
1827 		count = un->un_rs_resync_2_do - un->un_rs_resync_done;
1828 		block = initblock + ((blk_size + skip) *
1829 		    (int)un->un_rs_resync_done);
1830 
1831 		/*
1832 		 * If we are picking up from another node, we retry the last
1833 		 * block otherwise step on to the next block
1834 		 */
1835 		if (old_rs_done == un->un_rs_resync_done) {
1836 			block += blk_size + skip;
1837 			un->un_rs_resync_done++;
1838 			count--;
1839 		}
1840 
1841 		if ((count == 1) && frag)
1842 			size = frag;
1843 		if (shared->ms_state == CS_ERRED) {
1844 			err = 1;
1845 			broke_out = RESYNC_ERR;
1846 			break;
1847 		}
1848 
1849 		/* Check to see if we've completed the resync cleanly */
1850 		if (un->un_rs_thread_flags & MD_RI_SHUTDOWN)
1851 			break;
1852 	}
1853 
1854 	md_unit_readerexit(ui);
1855 	un = (mm_unit_t *)md_unit_writerlock(ui);
1856 
1857 	/*
1858 	 * If MN set send message to all nodes to indicate resync
1859 	 * phase is complete. The processing of the message will update the
1860 	 * mirror state
1861 	 */
1862 	if (MD_MNSET_SETNO(setno)) {
1863 		send_mn_resync_done_message(un, broke_out);
1864 	} else {
1865 		un->c.un_status &= ~MD_UN_WAR;
1866 		sm->sm_flags &= ~MD_SM_RESYNC_TARGET;
1867 
1868 		if (err)
1869 			shared->ms_flags |= MDM_S_RS_TRIED;
1870 		else
1871 			/*
1872 			 * As we don't transmit the changes,
1873 			 * no need to drop the lock.
1874 			 */
1875 			set_sm_comp_state(un, smi, ci, CS_OKAY, 0,
1876 			    MD_STATE_NO_XMIT, (IOLOCK *)NULL);
1877 	}
1878 
1879 	/* For MN sets, resync NOTIFY is done when processing resync messages */
1880 	if (!MD_MNSET_SETNO(setno)) {
1881 		if (broke_out) {
1882 			SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_FAILED,
1883 			    SVM_TAG_METADEVICE, setno, MD_SID(un));
1884 		} else {
1885 			SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_DONE,
1886 			    SVM_TAG_METADEVICE, setno, MD_SID(un));
1887 		}
1888 		SET_RS_TYPE_NONE(un->un_rs_type);
1889 	}
1890 }
1891 
1892 static void
1893 submirror_resync(mm_unit_t *un)
1894 {
1895 	mdi_unit_t		*ui;
1896 	minor_t			mnum;
1897 	mm_submirror_t		*sm;
1898 	mm_submirror_ic_t	*smic;
1899 	int			smi;
1900 	diskaddr_t		chunk;
1901 	diskaddr_t		curblk;
1902 	int			err;
1903 	int			cnt;
1904 	set_t			setno;
1905 	int			broke_out = 0;
1906 	int			i;
1907 	int			flags1 = MD_FIRST_RESYNC_NEXT;
1908 	int			compcnt;
1909 
1910 	mnum = MD_SID(un);
1911 	ui = MDI_UNIT(mnum);
1912 	setno = MD_UN2SET(un);
1913 
1914 	/*
1915 	 * If the submirror_index is non-zero, we are continuing a resync
1916 	 * so restart resync from last submirror marked as being resynced.
1917 	 */
1918 	if (RS_SMI(un->un_rs_type) != 0) {
1919 		smi = RS_SMI(un->un_rs_type);
1920 		sm = &un->un_sm[smi];
1921 		smic = &un->un_smic[smi];
1922 		if (!SMS_IS(sm, SMS_ATTACHED_RESYNC)) {
1923 			for (smi = 0; smi < NMIRROR; smi++) {
1924 				sm = &un->un_sm[smi];
1925 				smic = &un->un_smic[smi];
1926 				if (SMS_IS(sm, SMS_ATTACHED_RESYNC))
1927 					break;
1928 			}
1929 		}
1930 	} else {
1931 		for (smi = 0; smi < NMIRROR; smi++) {
1932 			sm = &un->un_sm[smi];
1933 			smic = &un->un_smic[smi];
1934 			if (SMS_IS(sm, SMS_ATTACHED_RESYNC))
1935 				break;
1936 		}
1937 	}
1938 	if (smi == NMIRROR) {
1939 		SET_RS_TYPE_NONE(un->un_rs_type);
1940 		return;
1941 	}
1942 
1943 	/*
1944 	 * If we've only got one component we can fail on a resync write
1945 	 * if an error is encountered. This stops an unnecessary read of the
1946 	 * whole mirror on a target write error.
1947 	 */
1948 	compcnt = (*(smic->sm_get_component_count))(sm->sm_dev, sm);
1949 	if (compcnt == 1)
1950 		flags1 |= MD_RESYNC_FLAG_ERR;
1951 
1952 	un->c.un_status |= MD_UN_WAR;
1953 	sm->sm_flags |= MD_SM_RESYNC_TARGET;
1954 	SET_RS_SMI(un->un_rs_type, smi);
1955 	md_unit_writerexit(ui);
1956 
1957 	/* For MN sets, resync NOTIFY is done when processing resync messages */
1958 	if (!MD_MNSET_SETNO(setno)) {
1959 		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_START,
1960 		    SVM_TAG_METADEVICE, setno, MD_SID(un));
1961 	}
1962 	un = (mm_unit_t *)md_unit_readerlock(ui);
1963 
1964 	un->un_rs_dropped_lock = 1;
1965 
1966 	/* check to see if we've been asked to terminate */
1967 	if (resync_kill_pending(un, MDI_UNIT(MD_SID(un)), MD_READER_HELD)) {
1968 		if (un->c.un_status & MD_UN_RESYNC_CANCEL)
1969 			broke_out = RESYNC_ERR;
1970 	}
1971 	/*
1972 	 * Check that we are still performing the same submirror
1973 	 * resync. If not, another node must have completed it
1974 	 * so we have no more work to do.
1975 	 */
1976 	if (RS_TYPE(un->un_rs_type) != MD_RS_SUBMIRROR) {
1977 		md_unit_readerexit(ui);
1978 		(void) md_unit_writerlock(ui);
1979 		return;
1980 	}
1981 
1982 	/* if > 1TB mirror, increase percent done granularity */
1983 	if (un->c.un_total_blocks > MD_MAX_BLKS_FOR_SMALL_DEVS)
1984 		chunk = un->c.un_total_blocks / 1000;
1985 	else
1986 		chunk = un->c.un_total_blocks / 100;
1987 	if (chunk == 0)
1988 		chunk = un->c.un_total_blocks;
1989 	/*
1990 	 * If a MN set, round the chunk size up to a multiple of
1991 	 * MD_DEF_RESYNC_BLK_SZ
1992 	 */
1993 	if (MD_MNSET_SETNO(setno)) {
1994 		chunk = ((chunk + MD_DEF_RESYNC_BLK_SZ)/MD_DEF_RESYNC_BLK_SZ)
1995 		    * MD_DEF_RESYNC_BLK_SZ;
1996 		if (chunk > un->c.un_total_blocks)
1997 			chunk = un->c.un_total_blocks;
1998 	}
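	/*
	 * Worked example (assumed sizes, for illustration only): for a
	 * 20971520-block (10 GB) mirror, which is below the 1TB threshold
	 * above, chunk starts at 20971520 / 100 = 209715 blocks, i.e. one
	 * percent of the device per progress step. On a MN set the chunk is
	 * then rounded up to a multiple of MD_DEF_RESYNC_BLK_SZ so that each
	 * chunk lines up with the RESYNC_NEXT region size; with an assumed
	 * region size of 1024 blocks:
	 *
	 *	chunk = ((209715 + 1024) / 1024) * 1024 = 209920 blocks
	 */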
1999 	/*
2000 	 * Handle restartable resyncs that continue from where the previous
2001 	 * resync left off. The new resync range is from un_rs_resync_done ..
2002 	 * un_rs_resync_2_do
2003 	 */
2004 	curblk = 0;
2005 	if (un->un_rs_resync_done == 0) {
2006 		un->un_rs_resync_2_do = un->c.un_total_blocks;
2007 	} else {
2008 		curblk = un->un_rs_resync_done;
2009 	}
2010 	while ((curblk != un->c.un_total_blocks) && (broke_out != RESYNC_ERR)) {
2011 		diskaddr_t	rs_done;
2012 
2013 		rs_done = un->un_rs_resync_done;
2014 		err = resync_read_blk_range(un, curblk, curblk + chunk,
2015 		    MD_READER_HELD, MD_SEND_MESS_XMIT | flags1);
2016 		flags1 = (compcnt == 1 ? MD_RESYNC_FLAG_ERR : 0);
2017 
2018 		/* resync_read_blk_range releases/grabs a new lock */
2019 		un = (mm_unit_t *)MD_UNIT(mnum);
2020 
2021 		if (err) {
2022 			broke_out = RESYNC_ERR;
2023 			break;
2024 		}
2025 
2026 		/*
2027 		 * If we are no longer executing a submirror resync, return
2028 		 * as another node has completed the submirror resync.
2029 		 */
2030 		if (RS_TYPE(un->un_rs_type) != MD_RS_SUBMIRROR) {
2031 			md_unit_readerexit(ui);
2032 			(void) md_unit_writerlock(ui);
2033 			return;
2034 		}
2035 		/*
2036 		 * If resync_done has changed, we must have blocked
2037 		 * in resync_read_blk_range while another node
2038 		 * continued with the resync so restart from resync_done.
2039 		 */
2040 		if (rs_done != un->un_rs_resync_done) {
2041 			curblk = un->un_rs_resync_done;
2042 		} else {
2043 			curblk += chunk;
2044 			un->un_rs_resync_done = curblk;
2045 		}
2046 
2047 		if ((curblk + chunk) > un->c.un_total_blocks)
2048 			chunk = un->c.un_total_blocks - curblk;
2049 		for (i = 0, cnt = 0; i < NMIRROR; i++)
2050 			if (SUBMIRROR_IS_WRITEABLE(un, i) &&
2051 			    !SMS_BY_INDEX_IS(un, i, SMS_ALL_ERRED) &&
2052 			    (un->un_sm[i].sm_flags & MD_SM_RESYNC_TARGET))
2053 				cnt++;
2054 		if (cnt == 0) {
2055 			broke_out = RESYNC_ERR;
2056 			break;
2057 		}
2058 
2059 		/* Check to see if we've completed the resync cleanly */
2060 		if (un->un_rs_thread_flags & MD_RI_SHUTDOWN)
2061 			break;
2062 	}
2063 	md_unit_readerexit(ui);
2064 	un = (mm_unit_t *)md_unit_writerlock(ui);
2065 
2066 	/*
2067 	 * If MN set send message to all nodes to indicate resync
2068 	 * phase is complete. The processing of the message will update the
2069 	 * mirror state
2070 	 */
2071 	if (MD_MNSET_SETNO(setno)) {
2072 		send_mn_resync_done_message(un, broke_out);
2073 	} else {
2074 		sm->sm_flags &= ~MD_SM_RESYNC_TARGET;
2075 		if (err) {
2076 			mirror_set_sm_state(sm, smic, SMS_ATTACHED, 1);
2077 		} else {
2078 			mirror_set_sm_state(sm, smic, SMS_RUNNING, 0);
2079 		}
2080 		un->c.un_status &= ~MD_UN_WAR;
2081 		mirror_commit(un, SMI2BIT(smi), 0);
2082 	}
2083 
2084 	/* For MN sets, resync NOTIFY is done when processing resync messages */
2085 	if (!MD_MNSET_SETNO(setno)) {
2086 		if (broke_out) {
2087 			SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_FAILED,
2088 			    SVM_TAG_METADEVICE, setno, MD_SID(un));
2089 		} else {
2090 			SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_DONE,
2091 			    SVM_TAG_METADEVICE, setno, MD_SID(un));
2092 		}
2093 	}
2094 }
2095 
2096 static void
2097 component_resync(mm_unit_t *un)
2098 {
2099 	mm_submirror_t		*sm;
2100 	mm_submirror_ic_t	*smic;
2101 	int			ci;
2102 	int			i;
2103 	int			compcnt;
2104 
2105 	/*
2106 	 * Handle the case where we are picking up a partially complete
2107 	 * component resync. In this case un_rs_type contains the submirror
2108 	 * and component index of where we should restart the resync.
2109 	 */
2110 	while (un->un_rs_type != MD_RS_COMPONENT) {
2111 		i = RS_SMI(un->un_rs_type);
2112 		ci = RS_CI(un->un_rs_type);
2113 		check_comp_4_resync(un, i, ci);
2114 		if (resync_kill_pending(un, MDI_UNIT(MD_SID(un)),
2115 		    MD_WRITER_HELD))
2116 			return;
2117 		/*
2118 		 * If we have no current resync, continue to scan submirrors and
2119 		 * components. If the resync has moved on to another component,
2120 		 * restart it and if the resync is no longer a component
2121 		 * resync, just exit
2122 		 */
2123 		if (RS_TYPE(un->un_rs_type) == MD_RS_NONE)
2124 			break;
2125 		if (RS_TYPE(un->un_rs_type) != MD_RS_COMPONENT)
2126 			return;
2127 	}
2128 	/* Now continue scanning _all_ submirrors and components */
2129 	for (i = 0; i < NMIRROR; i++) {
2130 		sm = &un->un_sm[i];
2131 		smic = &un->un_smic[i];
2132 		if (!SMS_IS(sm, SMS_RUNNING | SMS_LIMPING))
2133 			continue;
2134 		compcnt = (*(smic->sm_get_component_count))(sm->sm_dev, sm);
2135 		for (ci = 0; ci < compcnt; ci++) {
2136 			SET_RS_SMI(un->un_rs_type, i);
2137 			SET_RS_CI(un->un_rs_type, ci);
2138 			SET_RS_TYPE(un->un_rs_type, MD_RS_COMPONENT);
2139 			check_comp_4_resync(un, i, ci);
2140 			/* Bail out if we've been asked to abort/shutdown */
2141 			if (resync_kill_pending(un, MDI_UNIT(MD_SID(un)),
2142 			    MD_WRITER_HELD))
2143 				return;
2144 			/*
2145 			 * Now check if another node has continued with the
2146 			 * resync, if we are no longer in component resync,
2147 			 * exit, otherwise update to the current component - 1
2148 			 * so that the next call of check_comp_4_resync() will
2149 			 * resync the current component.
2150 			 */
2151 			if ((RS_TYPE(un->un_rs_type) != MD_RS_NONE) &&
2152 			    (RS_TYPE(un->un_rs_type) != MD_RS_COMPONENT))
2153 				return;
2154 			else {
2155 				if (RS_SMI(un->un_rs_type) != i) {
2156 					i = RS_SMI(un->un_rs_type);
2157 					ci = RS_CI(un->un_rs_type) - 1;
2158 				} else if (RS_CI(un->un_rs_type) != ci)
2159 					ci = RS_CI(un->un_rs_type) - 1;
2160 			}
2161 		}
2162 	}
2163 }
2164 
2165 static void
2166 reset_comp_flags(mm_unit_t *un)
2167 {
2168 	mm_submirror_t		*sm;
2169 	mm_submirror_ic_t	*smic;
2170 	md_m_shared_t		*shared;
2171 	int			ci;
2172 	int			i;
2173 	int			compcnt;
2174 
2175 	for (i = 0; i < NMIRROR; i++) {
2176 		sm = &un->un_sm[i];
2177 		smic = &un->un_smic[i];
2178 		if (!SMS_IS(sm, SMS_INUSE))
2179 			continue;
2180 		compcnt = (*(smic->sm_get_component_count))(sm->sm_dev, sm);
2181 		for (ci = 0; ci < compcnt; ci++) {
2182 			shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
2183 			    (sm->sm_dev, sm, ci);
2184 			shared->ms_flags &= ~MDM_S_RS_TRIED;
2185 		}
2186 	}
2187 }
2188 
2189 /*
2190  * resync_progress_thread:
2191  * ----------------------
2192  * Thread started on first resync of a unit which simply blocks until woken up
2193  * by a cv_signal, and then updates the mddb for the mirror unit record. This
2194  * saves the resync progress information (un_rs_resync_done, un_rs_resync_2_do)
2195  * so that an aborted resync can be continued after an intervening reboot.
2196  */
2197 static void
2198 resync_progress_thread(minor_t mnum)
2199 {
2200 	mm_unit_t	*un = MD_UNIT(mnum);
2201 	mdi_unit_t	*ui = MDI_UNIT(mnum);
2202 	set_t		setno = MD_MIN2SET(mnum);
2203 
2204 	while (un->c.un_status & MD_UN_RESYNC_ACTIVE) {
2205 		mutex_enter(&un->un_rs_progress_mx);
2206 		cv_wait(&un->un_rs_progress_cv, &un->un_rs_progress_mx);
2207 		mutex_exit(&un->un_rs_progress_mx);
2208 		if (un->un_rs_progress_flags & MD_RI_KILL)
2209 			break;
2210 
2211 		/*
2212 		 * Commit mirror unit if we're the Master node in a multi-node
2213 		 * environment
2214 		 */
2215 		if (MD_MNSET_SETNO(setno) && md_set[setno].s_am_i_master) {
2216 			(void) md_unit_readerlock(ui);
2217 			mirror_commit(un, NO_SUBMIRRORS, 0);
2218 			md_unit_readerexit(ui);
2219 		}
2220 	}
2221 	thread_exit();
2222 }
2223 
2224 /*
2225  * resync_progress:
2226  * ---------------
2227  * Timeout handler for updating the progress of the resync thread.
2228  * Simply wake up the resync progress daemon which will then mirror_commit() the
2229  * unit structure to the mddb. This snapshots the current progress of the resync
2230  */
2231 static void
2232 resync_progress(void *arg)
2233 {
2234 	mm_unit_t	*un = (mm_unit_t *)arg;
2235 	mdi_unit_t	*ui = MDI_UNIT(MD_SID(un));
2236 	uint_t		active;
2237 
2238 	mutex_enter(&un->un_rs_progress_mx);
2239 	cv_signal(&un->un_rs_progress_cv);
2240 	mutex_exit(&un->un_rs_progress_mx);
2241 
2242 	/* schedule the next timeout if the resync is still marked active */
2243 	(void) md_unit_readerlock(ui);
2244 	active = un->c.un_status & MD_UN_RESYNC_ACTIVE ? 1 : 0;
2245 	md_unit_readerexit(ui);
2246 	if (active) {
2247 		un->un_rs_resync_to_id = timeout(resync_progress, un,
2248 		    (clock_t)(drv_usectohz(60000000) *
2249 		    md_mirror_resync_update_intvl));
2250 	}
2251 }
2252 
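/*
 * Illustrative note: the timeout armed above is expressed in clock ticks for
 * md_mirror_resync_update_intvl minutes, so assuming the documented 5-minute
 * default the rearm delay is
 *
 *	(clock_t)(drv_usectohz(60000000) * 5)
 *
 * i.e. five minutes worth of ticks, which is how often the resync progress
 * record is pushed to the mddb while the resync stays active.
 */
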
2253 /*
2254  * resync_unit:
2255  * -----------
2256  * Resync thread which drives all forms of resync (optimized, component,
2257  * submirror). Must handle thread suspension and kill to allow multi-node
2258  * resync to run without undue ownership changes.
2259  *
2260  * For a MN set, the resync mechanism is as follows:
2261  *
2262  * When a resync is started, either via metattach, metaonline, metareplace,
2263  * metasync or by a hotspare kicking in, a message is sent to all nodes, which
2264  * calls mirror_resync_thread. If there is currently no mirror owner, the
2265  * master node sends a CHOOSE_OWNER message to the handler on the master. This
2266  * chooses a mirror owner and sends a CHANGE_OWNER message requesting the
2267  * selected node to become the owner.
2268  * If this node is not the owner it sets itself to block in resync_kill_pending
2269  * and if there is no owner all nodes will block until the chosen owner is
2270  * selected, in which case it will unblock itself. So, on entry to this
2271  * function only one node will continue past resync_kill_pending().
2272  * Once the resync thread is started, it basically cycles through the optimized,
2273  * component and submirror resyncs until there is no more work to do.
2274  *
2275  * For an ABR mirror, once a mirror owner is chosen it will complete the resync
2276  * unless the node dies, in which case a new owner will be chosen and it will
2277  * have to complete the resync from the point at which the previous owner died.
2278  * To do this we broadcast a RESYNC_NEXT message before each region to be
2279  * resynced and this message contains the address and length of the region
2280  * being resynced and the current progress through the resync. The size of
2281  * this region is MD_DEF_RESYNC_BLK_SZ blocks. It is larger than the resync
2282  * block size to limit the amount of inter node traffic. The RESYNC_NEXT
2283  * message also indicates to all other nodes that all writes to this block
2284  * must be blocked until the next RESYNC_NEXT message is received. This ensures
2285  * that no node can write to a block that is being resynced. For all MN
2286  * mirrors we also block the whole resync region on the resync owner node so
2287  * that all writes to the resync region are blocked on all nodes. There is a
2288  * difference here between a MN set and a regular set in that for a MN set
2289  * we protect the mirror from writes to the current resync block by blocking
2290  * a larger region. For a regular set we just block writes to the current
2291  * resync block.
2292  *
2293  * For a non-ABR mirror the same RESYNC_NEXT message is sent with an
2294  * additional purpose. In this case, there is only one mirror owner at a time
2295  * and rather than continually switching ownership between the chosen mirror
2296  * owner and the node that is writing to the mirror, we move the resync to the
2297  * mirror owner. When we switch ownership, we block the old owner and unblock
2298  * the resync thread on the new owner. To enable the new owner to continue the
2299  * resync, all nodes need to have the latest resync status. Then, following each
2300  * resync write, we check to see if the resync state has changed and if it
2301  * has this must be because we have lost ownership to another node(s) for a
2302  * period and then have become owner again later in the resync process. If we
2303  * are still dealing with the same resync, we just adjust addresses and counts
2304  * and then continue. If the resync has moved on to a different type, for
2305  * example from an optimized to a submirror resync, we move on to process the
2306  * resync described by rs_type and continue from the position described by
2307  * resync_done and resync_startbl.
2308  *
2309  * Note that for non-ABR mirrors it is possible for a write to be made on a
2310  * non resync-owner node without a change of ownership. This is the case when
2311  * the mirror has a soft part created on it and a write in ABR mode is made
2312  * to that soft part. Therefore we still need to block writes to the resync
2313  * region on all nodes.
2314  *
2315  * Sending the latest resync state to all nodes also enables them to continue
2316  * a resync in the event that the mirror owner dies. If a mirror owner for
2317  * a non-ABR mirror has died, there will be dirty resync regions. Therefore,
2318  * regardless of whether another type of resync was in progress, we must first
2319  * do an optimized resync to clean up the dirty regions before continuing
2320  * with the interrupted resync.
2321  *
2322  * The resync status is held in the unit structure
2323  * On disk
2324  * un_rs_resync_done	The number of contiguous resync blocks done so far
2325  * un_rs_resync_2_do	The total number of contiguous resync blocks
2326  * un_rs_type		The resync type (inc submirror and component numbers)
2327  * In core
2328  * un_resync_startbl	The address of the current resync block being processed
2329  *
2330  * In the event that the whole cluster fails we need to just use
2331  * un_rs_resync_done to restart the resync and to ensure that this is
2332  * periodically written to disk, we have a thread which writes the record
2333  * to disk every 5 minutes. As the granularity of un_rs_resync_done is
2334  * usually coarse (for an optimized resync 1001 is the max value), there is
2335  * little point in writing this more frequently.
2336  */
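/*
 * Illustrative sketch (assumed helper, not part of the driver): the on-disk
 * counters described above are sufficient to report restartable progress;
 * for example, a monitoring routine could derive a rough percent-complete as
 *
 *	uint64_t done = un->un_rs_resync_done;
 *	uint64_t todo = un->un_rs_resync_2_do;
 *	uint_t   pct  = (todo == 0) ? 0 : (uint_t)((done * 100) / todo);
 *
 * which is also why the coarse granularity (at most ~1001 steps for an
 * optimized resync) makes more frequent mddb updates unnecessary.
 */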
2337 static void
2338 resync_unit(minor_t mnum)
2339 {
2340 	mdi_unit_t	*ui;
2341 	mm_unit_t	*un;
2342 	md_error_t	mde = mdnullerror;
2343 	int		mn_resync = 0;
2344 	int		resync_finish = 0;
2345 	set_t		setno = MD_MIN2SET(mnum);
2346 	uint_t		old_rs_type = MD_RS_NONE;
2347 	uint_t		old_rs_done = 0, old_rs_2_do = 0;
2348 	uint_t		old_rs_startbl = 0;
2349 	int		block_resync = 1;
2350 	char		cpr_name[23];	/* Unique CPR name */
2351 	int		rs_copysize;
2352 	char		*rs_buffer;
2353 	int		nretries = 0;
2354 
2355 resync_restart:
2356 #ifdef DEBUG
2357 	if (mirror_debug_flag)
2358 		printf("Resync started (mnum = %x)\n", mnum);
2359 #endif
2360 	/*
2361 	 * increment the mirror resync count
2362 	 */
2363 	mutex_enter(&md_cpr_resync.md_resync_mutex);
2364 	md_cpr_resync.md_mirror_resync++;
2365 	mutex_exit(&md_cpr_resync.md_resync_mutex);
2366 
2367 	ui = MDI_UNIT(mnum);
2368 	un = MD_UNIT(mnum);
2369 
2370 	rs_copysize = un->un_rs_copysize;
2371 	if (rs_copysize == 0) {
2372 		/*
2373 		 * Don't allow buffer size to fall outside the
2374 		 * range 0 < bufsize <= md_max_xfer_bufsz.
2375 		 */
2376 		if (md_resync_bufsz <= 0)
2377 			md_resync_bufsz = MD_DEF_RESYNC_BUF_SIZE;
2378 		rs_copysize = MIN(md_resync_bufsz, md_max_xfer_bufsz);
2379 	}
2380 	rs_buffer = kmem_zalloc(dbtob(rs_copysize), KM_SLEEP);
2381 	un = md_unit_writerlock(ui);
2382 	un->un_rs_copysize = rs_copysize;
2383 	un->un_rs_buffer = rs_buffer;
2384 
2385 	if (MD_MNSET_SETNO(setno)) {
2386 		/*
2387 		 * Register this resync thread with the CPR mechanism. This
2388 		 * allows us to detect when the system is suspended and so
2389 		 * keep track of the RPC failure condition.
2390 		 */
2391 		(void) snprintf(cpr_name, sizeof (cpr_name),
2392 		    "mirror_resync%x", mnum);
2393 		CALLB_CPR_INIT(&un->un_rs_cprinfo, &un->un_rs_cpr_mx,
2394 		    callb_md_mrs_cpr, cpr_name);
2395 
2396 		if (ui->ui_tstate & MD_RESYNC_NOT_DONE) {
2397 			/*
2398 			 * If this is the first resync following the initial
2399 			 * snarf (MD_RESYNC_NOT_DONE still set) and we've
2400 			 * been started outside a reconfig step (e.g. by being
2401 			 * added to an existing set) we need to query the
2402 			 * existing submirror state for this mirror.
2403 			 * The set_status flags will have MD_SET_MN_MIR_STATE_RC
2404 			 * set if we've been through a step4 reconfig, so only
2405 			 * query the master if this isn't (yet) set. In this
2406 			 * case we must continue the resync thread as there is
2407 			 * not guaranteed to be a currently running resync on
2408 			 * any of the other nodes. Worst case is that we will
2409 			 * initiate an ownership change to this node and then
2410 			 * find that there is no resync to perform. However, we
2411 			 * will then have correct status across the cluster.
2412 			 */
2413 			if (!md_set[setno].s_am_i_master) {
2414 				if (!(md_get_setstatus(setno) &
2415 				    MD_SET_MN_MIR_STATE_RC)) {
2416 					mirror_get_status(un, NULL);
2417 					block_resync = 0;
2418 #ifdef DEBUG
2419 					if (mirror_debug_flag) {
2420 						mm_submirror_t *sm;
2421 						int i;
2422 						for (i = 0; i < NMIRROR; i++) {
2423 							sm = &un->un_sm[i];
2424 							printf(
2425 							    "sm[%d] state=%4x"
2426 							    " flags=%4x\n", i,
2427 							    sm->sm_state,
2428 							    sm->sm_flags);
2429 						}
2430 					}
2431 #endif
2432 				}
2433 			}
2434 			ui->ui_tstate &= ~MD_RESYNC_NOT_DONE;
2435 		}
2436 		/*
2437 		 * For MN set, if we have an owner, then start the resync on it.
2438 		 * If there is no owner the master must send a message to
2439 		 * choose the owner. This message will contain the current
2440 		 * resync count and it will only be sent to the master, where
2441 		 * the resync count will be used to choose the next node to
2442 		 * perform a resync, by cycling through the nodes in the set.
2443 		 * The message handler will then send a CHANGE_OWNER message to
2444 		 * all nodes, and on receipt of that message, the chosen owner
2445 		 * will issue a SET_OWNER ioctl to become the owner. This ioctl
2446 		 * will be requested to spawn a thread to issue the
2447 		 * REQUEST_OWNER message to become the owner which avoids the
2448 		 * need for concurrent ioctl requests.
2449 		 * After sending the message, we will block waiting for one
2450 		 * of the nodes to become the owner and start the resync
2451 		 */
2452 		if (MD_MN_NO_MIRROR_OWNER(un)) {
2453 			/*
2454 			 * There is no owner, block and then the master will
2455 			 * choose the owner. Only perform this if 'block_resync'
2456 			 * is set.
2457 			 */
2458 			if (block_resync) {
2459 				mutex_enter(&un->un_rs_thread_mx);
2460 				un->un_rs_thread_flags |= MD_RI_BLOCK_OWNER;
2461 				mutex_exit(&un->un_rs_thread_mx);
2462 			}
2463 			if (md_set[setno].s_am_i_master) {
2464 				md_unit_writerexit(ui);
2465 				(void) mirror_choose_owner(un, NULL);
2466 				(void) md_unit_writerlock(ui);
2467 			}
2468 		} else {
2469 			/* There is an owner, block if we are not it */
2470 			if (!MD_MN_MIRROR_OWNER(un)) {
2471 				mutex_enter(&un->un_rs_thread_mx);
2472 				un->un_rs_thread_flags |= MD_RI_BLOCK_OWNER;
2473 				mutex_exit(&un->un_rs_thread_mx);
2474 			}
2475 		}
2476 	}
2477 	/*
2478 	 * Start a timeout chain to update the resync progress to the mddb.
2479 	 * This will run every md_mirror_resync_update_intvl minutes and allows
2480 	 * a resync to be continued over a reboot.
2481 	 */
2482 	ASSERT(un->un_rs_resync_to_id == 0);
2483 	un->un_rs_resync_to_id = timeout(resync_progress, un,
2484 	    (clock_t)(drv_usectohz(60000000) * md_mirror_resync_update_intvl));
2485 
2486 	/*
2487 	 * Handle resync restart from the last logged position. The contents
2488 	 * of un_rs_resync_2_do and un_rs_resync_done are dependent on the
2489 	 * type of resync that was in progress.
2490 	 */
2491 	if (MD_MNSET_SETNO(setno)) {
2492 		switch ((uint_t)RS_TYPE(un->un_rs_type)) {
2493 		case MD_RS_NONE:
2494 		case MD_RS_OPTIMIZED:
2495 		case MD_RS_COMPONENT:
2496 		case MD_RS_SUBMIRROR:
2497 		case MD_RS_ABR:
2498 			break;
2499 		default:
2500 			un->un_rs_type = MD_RS_NONE;
2501 		}
2502 		/* Allocate a resync message, if required */
2503 		if (un->un_rs_msg == NULL) {
2504 			un->un_rs_msg = (md_mn_msg_resync_t *)kmem_zalloc(
2505 			    sizeof (md_mn_msg_resync_t), KM_SLEEP);
2506 		}
2507 		mn_resync = 1;
2508 	}
2509 
2510 	/* Check to see if we've been requested to block/kill */
2511 	if (resync_kill_pending(un, ui, MD_WRITER_HELD)) {
2512 		goto bail_out;
2513 	}
2514 
2515 	do {
2516 		un->un_rs_dropped_lock = 0;
2517 		/*
2518 		 * Always perform an optimized resync first as this will bring
2519 		 * the mirror into an available state in the shortest time.
2520 		 * If we are resuming an interrupted resync, other than an
2521 		 * optimized resync, we save the type and amount done so that
2522 		 * we can resume the appropriate resync after the optimized
2523 		 * resync has completed.
2524 		 */
2525 		if ((RS_TYPE(un->un_rs_type) != MD_RS_NONE) &&
2526 		    (RS_TYPE(un->un_rs_type) != MD_RS_OPTIMIZED)) {
2527 			old_rs_type = un->un_rs_type;
2528 			old_rs_done = un->un_rs_resync_done;
2529 			old_rs_2_do = un->un_rs_resync_2_do;
2530 			old_rs_startbl = un->un_resync_startbl;
2531 		}
2532 		SET_RS_TYPE(un->un_rs_type, MD_RS_OPTIMIZED);
2533 		/*
2534 		 * If we are continuing a resync that is not an
2535 		 * OPTIMIZED one, then we start from the beginning when
2536 		 * doing this optimized resync
2537 		 */
2538 		if (RS_TYPE(old_rs_type) != MD_RS_OPTIMIZED) {
2539 			un->un_rs_resync_done = 0;
2540 			un->un_rs_resync_2_do = 0;
2541 			un->un_resync_startbl = 0;
2542 		}
2543 		optimized_resync(un);
2544 		/* Check to see if we've been requested to block/kill */
2545 		if (resync_kill_pending(un, ui, MD_WRITER_HELD)) {
2546 			goto bail_out;
2547 		}
2548 		un = (mm_unit_t *)MD_UNIT(mnum);
2549 		/*
2550 		 * If another node has moved the resync on, we must
2551 		 * restart the correct resync
2552 		 */
2553 		if (mn_resync &&
2554 		    (RS_TYPE(un->un_rs_type) != MD_RS_NONE)) {
2555 			old_rs_type = un->un_rs_type;
2556 			old_rs_done = un->un_rs_resync_done;
2557 			old_rs_2_do = un->un_rs_resync_2_do;
2558 			old_rs_startbl = un->un_resync_startbl;
2559 		}
2560 
2561 		/*
2562 		 * Restore previous resync progress or move onto a
2563 		 * component resync.
2564 		 */
2565 		if (RS_TYPE(old_rs_type) != MD_RS_NONE) {
2566 			un->un_rs_type = old_rs_type;
2567 			un->un_rs_resync_done = old_rs_done;
2568 			un->un_rs_resync_2_do = old_rs_2_do;
2569 			un->un_resync_startbl = old_rs_startbl;
2570 		} else {
2571 			un->un_rs_type = MD_RS_COMPONENT;
2572 			un->un_rs_resync_done = 0;
2573 			un->un_rs_resync_2_do = 0;
2574 			un->un_resync_startbl = 0;
2575 		}
2576 
2577 		if (RS_TYPE(un->un_rs_type) == MD_RS_COMPONENT) {
2578 			component_resync(un);
2579 			/* Check to see if we've been requested to block/kill */
2580 			if (resync_kill_pending(un, ui, MD_WRITER_HELD)) {
2581 				goto bail_out;
2582 			}
2583 			un = (mm_unit_t *)MD_UNIT(mnum);
2584 			/*
2585 			 * If we have moved on from a component resync, another
2586 			 * node must have completed it and started a submirror
2587 			 * resync, so leave the resync state alone. For non
2588 			 * multi-node sets we move onto the submirror resync.
2589 			 */
2590 			if (mn_resync) {
2591 				if (RS_TYPE(un->un_rs_type) == MD_RS_NONE) {
2592 					un->un_rs_type = MD_RS_SUBMIRROR;
2593 					un->un_rs_resync_done =
2594 					    un->un_rs_resync_2_do = 0;
2595 					un->un_resync_startbl = 0;
2596 				}
2597 			} else {
2598 				un->un_rs_type = MD_RS_SUBMIRROR;
2599 				un->un_rs_resync_done = 0;
2600 				un->un_rs_resync_2_do = 0;
2601 				un->un_resync_startbl = 0;
2602 			}
2603 		}
2604 		if (RS_TYPE(un->un_rs_type) == MD_RS_SUBMIRROR) {
2605 			submirror_resync(un);
2606 			/* Check to see if we've been requested to block/kill */
2607 			if (resync_kill_pending(un, ui, MD_WRITER_HELD)) {
2608 				goto bail_out;
2609 			}
2610 			un = (mm_unit_t *)MD_UNIT(mnum);
2611 			/*
2612 			 * If we have moved on from a submirror resync, another
2613 			 * node must have completed it and started a different
2614 			 * resync, so leave the resync state alone
2615 			 */
2616 			if (mn_resync) {
2617 				if (RS_TYPE(un->un_rs_type) == MD_RS_NONE) {
2618 					un->un_rs_resync_done =
2619 					    un->un_rs_resync_2_do = 0;
2620 					un->un_resync_startbl = 0;
2621 				}
2622 			} else {
2623 				/* If non-MN mirror, reinitialize state */
2624 				un->un_rs_type = MD_RS_NONE;
2625 				un->un_rs_resync_done = 0;
2626 				un->un_rs_resync_2_do = 0;
2627 				un->un_resync_startbl = 0;
2628 			}
2629 		}
2630 	} while (un->un_rs_dropped_lock);
2631 	mutex_enter(&un->un_rs_thread_mx);
2632 	un->un_rs_thread_flags |= MD_RI_SHUTDOWN;
2633 	mutex_exit(&un->un_rs_thread_mx);
2634 
2635 	resync_finish = 1;
2636 bail_out:
2637 #ifdef DEBUG
2638 	if (mirror_debug_flag)
2639 		printf("Resync stopped (mnum = %x), resync_finish = %d\n",
2640 		    mnum, resync_finish);
2641 #endif
2642 	kmem_free(un->un_rs_buffer, dbtob(un->un_rs_copysize));
2643 
2644 	mutex_enter(&un->un_rs_progress_mx);
2645 	un->un_rs_progress_flags |= MD_RI_KILL;
2646 	cv_signal(&un->un_rs_progress_cv);
2647 	mutex_exit(&un->un_rs_progress_mx);
2648 
2649 	/*
2650 	 * For MN Set, send a RESYNC_FINISH if this node completed the resync.
2651 	 * There is no need to grow unit here, it will be done in the
2652 	 * handler for the RESYNC_FINISH message together with resetting
2653 	 * MD_UN_RESYNC_ACTIVE.
2654 	 */
2655 	if (mn_resync) {
2656 		if (resync_finish) {
2657 			/*
2658 			 * Normal resync completion. Issue a RESYNC_FINISH
2659 			 * message if we're part of a multi-node set.
2660 			 */
2661 			md_mn_kresult_t	*kres;
2662 			md_mn_msg_resync_t *rmsg;
2663 			int		rval;
2664 
2665 			rmsg = (md_mn_msg_resync_t *)un->un_rs_msg;
2666 			md_unit_writerexit(ui);
2667 
2668 			rmsg->msg_resync_mnum = mnum;
2669 			rmsg->msg_resync_type = 0;
2670 			rmsg->msg_resync_done = 0;
2671 			rmsg->msg_resync_2_do = 0;
2672 			rmsg->msg_originator = md_mn_mynode_id;
2673 
2674 			kres = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
2675 
2676 smrf_msg:
2677 			mutex_enter(&un->un_rs_cpr_mx);
2678 			CALLB_CPR_SAFE_BEGIN(&un->un_rs_cprinfo);
2679 
2680 			rval = mdmn_ksend_message(setno,
2681 			    MD_MN_MSG_RESYNC_FINISH, MD_MSGF_NO_LOG, 0,
2682 			    (char *)rmsg, sizeof (md_mn_msg_resync_t), kres);
2683 
2684 			CALLB_CPR_SAFE_END(&un->un_rs_cprinfo,
2685 			    &un->un_rs_cpr_mx);
2686 			mutex_exit(&un->un_rs_cpr_mx);
2687 
2688 			if (!MDMN_KSEND_MSG_OK(rval, kres)) {
2689 				mdmn_ksend_show_error(rval, kres,
2690 				    "RESYNC_FINISH");
2691 				/* If we're shutting down, pause things here. */
2692 				if (kres->kmmr_comm_state == MDMNE_RPC_FAIL) {
2693 					while (!md_mn_is_commd_present()) {
2694 						delay(md_hz);
2695 					}
2696 					/*
2697 					 * commd is now available again. Retry
2698 					 * the message once. If this fails we
2699 					 * panic as the system is in an
2700 					 * unexpected state.
2701 					 */
2702 					if (nretries++ == 0)
2703 						goto smrf_msg;
2704 				}
2705 				cmn_err(CE_PANIC,
2706 				    "ksend_message failure: RESYNC_FINISH");
2707 			}
2708 			kmem_free(kres, sizeof (md_mn_kresult_t));
2709 			(void) md_unit_writerlock(ui);
2710 		}
2711 		/*
2712 		 * If the resync has been cancelled, clear flags, reset owner
2713 		 * for ABR mirror and release the resync region parent
2714 		 * structure.
2715 		 */
2716 		if (un->c.un_status & MD_UN_RESYNC_CANCEL) {
2717 			md_mps_t	*ps;
2718 
2719 			if (ui->ui_tstate & MD_ABR_CAP) {
2720 				/* Resync finished, if ABR set owner to NULL */
2721 				mutex_enter(&un->un_owner_mx);
2722 				un->un_mirror_owner = 0;
2723 				mutex_exit(&un->un_owner_mx);
2724 			}
2725 
2726 			un->c.un_status &= ~(MD_UN_RESYNC_CANCEL |
2727 			    MD_UN_RESYNC_ACTIVE);
2728 			ps = un->un_rs_prev_overlap;
2729 			if (ps != NULL) {
2730 				/* Remove previous overlap resync region */
2731 				if (ps->ps_flags & MD_MPS_ON_OVERLAP)
2732 				mirror_overlap_tree_remove(ps);
2733 				/*
2734 				 * Release the overlap range reference
2735 				 */
2736 				un->un_rs_prev_overlap = NULL;
2737 				kmem_cache_free(mirror_parent_cache,
2738 				    ps);
2739 			}
2740 		}
2741 
2742 		/*
2743 		 * Release resync message buffer. This will be reallocated on
2744 		 * the next invocation of the resync_unit thread.
2745 		 */
2746 		if (un->un_rs_msg) {
2747 			kmem_free(un->un_rs_msg, sizeof (md_mn_msg_resync_t));
2748 			un->un_rs_msg = NULL;
2749 		}
2750 	} else {
2751 		/* For non-MN sets deal with any pending grows */
2752 		un->c.un_status &= ~MD_UN_RESYNC_ACTIVE;
2753 		if (un->c.un_status & MD_UN_GROW_PENDING) {
2754 			if ((mirror_grow_unit(un, &mde) != 0) ||
2755 			    (! mdismderror(&mde, MDE_GROW_DELAYED))) {
2756 				un->c.un_status &= ~MD_UN_GROW_PENDING;
2757 			}
2758 		}
2759 	}
2760 
2761 	reset_comp_flags(un);
2762 	un->un_resync_completed = 0;
2763 	mirror_commit(un, NO_SUBMIRRORS, 0);
2764 	md_unit_writerexit(ui);
2765 
2766 	/*
2767 	 * Stop the resync progress thread.
2768 	 */
2769 	if (un->un_rs_resync_to_id != 0) {
2770 		(void) untimeout(un->un_rs_resync_to_id);
2771 		un->un_rs_resync_to_id = 0;
2772 	}
2773 
2774 	/*
2775 	 * Calling mirror_internal_close() makes further reference to un / ui
2776 	 * dangerous. If we are the only consumer of the mirror it is possible
2777 	 * for a metaclear to be processed after completion of the m_i_c()
2778 	 * routine. As we need to handle the case where another resync has been
2779 	 * scheduled for the mirror, we raise the open count on the device
2780 	 * which protects against the close / metaclear / lock => panic scenario
2781 	 */
2782 	(void) md_unit_incopen(MD_SID(un), FREAD|FWRITE, OTYP_LYR);
2783 	(void) mirror_internal_close(MD_SID(un), OTYP_LYR, 0, (IOLOCK *)NULL);
2784 
2785 	/*
2786 	 * decrement the mirror resync count
2787 	 */
2788 	mutex_enter(&md_cpr_resync.md_resync_mutex);
2789 	md_cpr_resync.md_mirror_resync--;
2790 	mutex_exit(&md_cpr_resync.md_resync_mutex);
2791 
2792 	/*
2793 	 * Remove the thread reference as we're about to exit. This allows a
2794 	 * subsequent mirror_resync_unit() to start a new thread.
2795 	 * If RESYNC_ACTIVE is set, mirror_resync_unit() must have been
2796 	 * called to start a new resync, so reopen the mirror and go back to
2797 	 * the start.
2798 	 */
2799 	(void) md_unit_writerlock(ui);
2800 	mutex_enter(&un->un_rs_thread_mx);
2801 	un->un_rs_thread_flags &= ~(MD_RI_KILL|MD_RI_SHUTDOWN);
2802 	mutex_exit(&un->un_rs_thread_mx);
2803 	if (un->c.un_status & MD_UN_RESYNC_ACTIVE) {
2804 		md_unit_writerexit(ui);
2805 		if (mirror_internal_open(MD_SID(un), (FREAD|FWRITE),
2806 		    OTYP_LYR, 0, (IOLOCK *)NULL) == 0) {
2807 			/* Release the reference grabbed above */
2808 			(void) mirror_internal_close(MD_SID(un), OTYP_LYR, 0,
2809 			    (IOLOCK *)NULL);
2810 			goto resync_restart;
2811 		}
2812 		(void) md_unit_writerlock(ui);
2813 		cmn_err(CE_NOTE,
2814 		    "Could not open metadevice (%x) for resync\n",
2815 		    MD_SID(un));
2816 	}
2817 	un->un_rs_thread = NULL;
2818 	md_unit_writerexit(ui);
2819 
2820 	/*
2821 	 * Check for hotspares once we've cleared the resync thread reference.
2822 	 * If there are any errored units a poke_hotspares() will result in
2823 	 * a call to mirror_resync_unit() which we need to allow to start.
2824 	 */
2825 	(void) poke_hotspares();
2826 
2827 	/*
2828 	 * Remove this thread from the CPR callback table.
2829 	 */
2830 	if (mn_resync) {
2831 		mutex_enter(&un->un_rs_cpr_mx);
2832 		CALLB_CPR_EXIT(&un->un_rs_cprinfo);
2833 	}
2834 
2835 	/*
2836 	 * Remove the extra reference to the unit we generated above. After
2837 	 * this call it is *unsafe* to reference either ui or un as they may
2838 	 * no longer be allocated.
2839 	 */
2840 	(void) mirror_internal_close(MD_SID(un), OTYP_LYR, 0, (IOLOCK *)NULL);
2841 
2842 	thread_exit();
2843 }
2844 
2845 /*
2846  * mirror_resync_unit:
2847  * ------------------
2848  * Start a resync for the given mirror metadevice. Save the resync thread ID in
2849  * un->un_rs_thread for later manipulation.
2850  *
2851  * Returns:
2852  *	0	Success
2853  *	!=0	Error
2854  */
2855 /*ARGSUSED*/
2856 int
2857 mirror_resync_unit(
2858 	minor_t			mnum,
2859 	md_resync_ioctl_t	*ri,
2860 	md_error_t		*ep,
2861 	IOLOCK			*lockp
2862 )
2863 {
2864 	mdi_unit_t		*ui;
2865 	mm_unit_t		*un;
2866 	set_t			setno = MD_MIN2SET(mnum);
2867 
2868 	ui = MDI_UNIT(mnum);
2869 
2870 	if (md_get_setstatus(setno) & MD_SET_STALE)
2871 		return (mdmddberror(ep, MDE_DB_STALE, mnum, setno));
2872 
2873 	if (mirror_internal_open(mnum, (FREAD|FWRITE), OTYP_LYR, 0, lockp)) {
2874 		return (mdmderror(ep, MDE_MIRROR_OPEN_FAILURE, mnum));
2875 	}
2876 	if (lockp) {
2877 		un = (mm_unit_t *)md_ioctl_writerlock(lockp, ui);
2878 	} else {
2879 		un = (mm_unit_t *)md_unit_writerlock(ui);
2880 	}
2881 
2882 	/*
2883 	 * Check to see if we're attempting to start a resync while one is
2884 	 * already running.
2885 	 */
2886 	if (un->c.un_status & MD_UN_RESYNC_ACTIVE ||
2887 	    un->un_rs_thread != NULL) {
2888 		/*
2889 		 * Ensure RESYNC_ACTIVE is set; it may not be if the resync thread
2890 		 * is in the process of terminating. Setting the flag will
2891 		 * cause the resync thread to return to the beginning
2892 		 */
2893 		un->c.un_status |= MD_UN_RESYNC_ACTIVE;
2894 		if (lockp) {
2895 			md_ioctl_writerexit(lockp);
2896 		} else {
2897 			md_unit_writerexit(ui);
2898 		}
2899 		(void) mirror_internal_close(mnum, OTYP_LYR, 0, lockp);
2900 		return (0);
2901 	}
2902 	un->c.un_status |= MD_UN_RESYNC_ACTIVE;
2903 	un->c.un_status &= ~MD_UN_RESYNC_CANCEL;
2904 	if ((ri) && (ri->ri_copysize > 0) &&
2905 	    (ri->ri_copysize <= md_max_xfer_bufsz))
2906 		un->un_rs_copysize = ri->ri_copysize;
2907 	else
2908 		un->un_rs_copysize = 0;
2909 
2910 	/* Start the resync progress thread off */
2911 	un->un_rs_progress_flags = 0;
2912 	(void) thread_create(NULL, 0, resync_progress_thread,
2913 	    (caddr_t)(uintptr_t)mnum, 0, &p0, TS_RUN, minclsyspri);
2914 
2915 	/*
2916 	 * We have to store the thread ID in the unit structure so do not
2917 	 * drop writerlock until the thread is active. This means resync_unit
2918 	 * may spin on its first md_unit_readerlock(), but deadlock won't occur.
2919 	 */
2920 	mutex_enter(&un->un_rs_thread_mx);
2921 	un->un_rs_thread_flags &= ~(MD_RI_KILL|MD_RI_SHUTDOWN);
2922 	mutex_exit(&un->un_rs_thread_mx);
2923 	un->un_rs_thread = thread_create(NULL, 0, resync_unit,
2924 	    (caddr_t)(uintptr_t)mnum, 0, &p0, TS_RUN, 60);
2925 	if (un->un_rs_thread == (kthread_id_t)NULL) {
2926 		un->c.un_status &= ~MD_UN_RESYNC_ACTIVE;
2927 		if (lockp) {
2928 			md_ioctl_writerexit(lockp);
2929 		} else {
2930 			md_unit_writerexit(ui);
2931 		}
2932 		(void) mirror_internal_close(mnum, OTYP_LYR, 0, lockp);
2933 		return (mdmderror(ep, MDE_MIRROR_THREAD_FAILURE, mnum));
2934 	} else {
2935 		if (lockp) {
2936 			md_ioctl_writerexit(lockp);
2937 		} else {
2938 			md_unit_writerexit(ui);
2939 		}
2940 	}
2941 
2942 	return (0);
2943 }
2944 
2945 /*
2946  * mirror_ioctl_resync:
2947  * -------------------
2948  * Called as a result of an MD_IOCSETSYNC ioctl. Either start, block, unblock
2949  * or kill the resync thread associated with the specified unit.
2950  * Can return with locks held since mdioctl will free any locks
2951  * that are marked in lock->l_flags.
2952  *
2953  * Returns:
2954  *	0	Success
2955  *	!=0	Error Code
2956  */
2957 int
2958 mirror_ioctl_resync(
2959 	md_resync_ioctl_t	*ri,
2960 	IOLOCK			*lock
2961 )
2962 {
2963 	minor_t			mnum = ri->ri_mnum;
2964 	mm_unit_t		*un;
2965 	uint_t			bits;
2966 	mm_submirror_t		*sm;
2967 	mm_submirror_ic_t	*smic;
2968 	int			smi;
2969 	kt_did_t		tid;
2970 	set_t			setno = MD_MIN2SET(mnum);
2971 
2972 	mdclrerror(&ri->mde);
2973 
2974 	if ((setno >= md_nsets) ||
2975 	    (MD_MIN2UNIT(mnum) >= md_nunits)) {
2976 		return (mdmderror(&ri->mde, MDE_INVAL_UNIT, mnum));
2977 	}
2978 
2979 	/* RD_LOCK flag grabs the md_ioctl_readerlock */
2980 	un = mirror_getun(mnum, &ri->mde, RD_LOCK, lock);
2981 
2982 	if (un == NULL) {
2983 		return (mdmderror(&ri->mde, MDE_UNIT_NOT_SETUP, mnum));
2984 	}
2985 	if (un->c.un_type != MD_METAMIRROR) {
2986 		return (mdmderror(&ri->mde, MDE_NOT_MM, mnum));
2987 	}
2988 	if (un->un_nsm < 2) {
2989 		return (0);
2990 	}
2991 
2992 	/*
2993 	 * Determine the action to take based on the ri_flags field:
2994 	 * 	MD_RI_BLOCK:	Block current resync thread
2995 	 *	MD_RI_UNBLOCK:	Unblock resync thread
2996 	 *	MD_RI_KILL:	Abort resync thread
2997 	 *	MD_RI_RESYNC_FORCE_MNSTART: Directly start resync thread
2998 	 *		without using rpc.mdcommd messages.
2999 	 *	any other:	Start resync thread
3000 	 */
3001 	switch (ri->ri_flags & (MD_RI_BLOCK|MD_RI_UNBLOCK|MD_RI_KILL)) {
3002 
3003 	case MD_RI_BLOCK:
3004 		/* Halt resync thread by setting flag in un_rs_flags */
3005 		if (!(un->c.un_status & MD_UN_RESYNC_ACTIVE)) {
3006 			return (0);
3007 		}
3008 		mutex_enter(&un->un_rs_thread_mx);
3009 		un->un_rs_thread_flags |= MD_RI_BLOCK;
3010 		mutex_exit(&un->un_rs_thread_mx);
3011 		return (0);
3012 
3013 	case MD_RI_UNBLOCK:
3014 		/*
3015 		 * Restart resync thread by clearing flag in un_rs_flags and
3016 		 * cv_signal'ing the blocked thread.
3017 		 */
3018 		if (!(un->c.un_status & MD_UN_RESYNC_ACTIVE)) {
3019 			return (0);
3020 		}
3021 		mutex_enter(&un->un_rs_thread_mx);
3022 		un->un_rs_thread_flags &= ~MD_RI_BLOCK;
3023 		cv_signal(&un->un_rs_thread_cv);
3024 		mutex_exit(&un->un_rs_thread_mx);
3025 		return (0);
3026 
3027 	case MD_RI_KILL:
3028 		/* Abort resync thread. */
3029 		if (!(un->c.un_status & MD_UN_RESYNC_ACTIVE)) {
3030 			return (0);
3031 		}
3032 		mutex_enter(&un->un_rs_thread_mx);
3033 		tid = un->un_rs_thread ? (un->un_rs_thread)->t_did : 0;
3034 		un->un_rs_thread_flags &= ~(MD_RI_BLOCK|MD_RI_BLOCK_OWNER);
3035 		un->un_rs_thread_flags |= MD_RI_KILL;
3036 		cv_signal(&un->un_rs_thread_cv);
3037 		mutex_exit(&un->un_rs_thread_mx);
3038 		if (tid != 0) {
3039 			if (!(ri->ri_flags & MD_RI_NO_WAIT)) {
3040 				md_ioctl_readerexit(lock);
3041 				thread_join(tid);
3042 				un->un_rs_thread_flags &= ~MD_RI_KILL;
3043 				un->un_rs_thread = NULL;
3044 				cmn_err(CE_WARN, "md: %s: Resync cancelled\n",
3045 				    md_shortname(MD_SID(un)));
3046 			}
3047 		}
3048 		return (0);
3049 	}
3050 
3051 	md_ioctl_readerexit(lock);
3052 
3053 	bits = 0;
3054 	for (smi = 0; smi < NMIRROR; smi++) {
3055 		sm = &un->un_sm[smi];
3056 		smic = &un->un_smic[smi];
3057 		if (!SMS_IS(sm, SMS_ATTACHED))
3058 			continue;
3059 		mirror_set_sm_state(sm, smic, SMS_ATTACHED_RESYNC, 1);
3060 		bits |= SMI2BIT(smi);
3061 	}
3062 	if (bits != 0)
3063 		mirror_commit(un, bits, 0);
3064 
3065 	/*
3066 	 * If we are resyncing a mirror in a MN set and the rpc.mdcommd
3067 	 * can be used, we do not start the resync at this point.
3068 	 * Instead, the metasync command that issued the ioctl
3069 	 * will send a RESYNC_STARTING message to start the resync thread. The
3070 	 * reason we do it this way is to ensure that the metasync ioctl is
3071 	 * executed on all nodes before the resync thread is started.
3072 	 *
3073 	 * If a MN set and the MD_RI_RESYNC_FORCE_MNSTART flag is set, then
3074 	 * don't use rpc.mdcommd, but just start the resync thread.  This
3075 	 * flag is set on a node when it is being added to a diskset
3076 	 * so that the resync threads are started on the newly added node.
3077 	 */
3078 	if ((!(MD_MNSET_SETNO(setno))) ||
3079 	    (ri->ri_flags & MD_RI_RESYNC_FORCE_MNSTART)) {
3080 		return (mirror_resync_unit(mnum, ri, &ri->mde, lock));
3081 	} else {
3082 		return (0);
3083 	}
3084 }
3085 
3086 int
3087 mirror_mark_resync_region_non_owner(struct mm_unit *un,
3088 	diskaddr_t startblk, diskaddr_t endblk, md_mn_nodeid_t source_node)
3089 {
3090 	int			no_change;
3091 	size_t			start_rr;
3092 	size_t			current_rr;
3093 	size_t			end_rr;
3094 	md_mn_msg_rr_dirty_t	*rr;
3095 	md_mn_kresult_t		*kres;
3096 	set_t			setno = MD_UN2SET(un);
3097 	int			rval;
3098 	md_mn_nodeid_t		node_idx = source_node - 1;
3099 	mdi_unit_t		*ui = MDI_UNIT(MD_SID(un));
3100 	md_mn_nodeid_t		owner_node;
3101 	minor_t			mnum = MD_SID(un);
3102 
3103 	if (un->un_nsm < 2)
3104 		return (0);
3105 
3106 	/*
3107 	 * Check to see if we have a un_pernode_dirty_bm[] entry allocated. If
3108 	 * not, allocate it and then fill the [start..end] entries.
3109 	 * Update un_pernode_dirty_sum if we've gone 0->1.
3110 	 * Update un_dirty_bm if the corresponding entries are clear.
3111 	 */
3112 	rw_enter(&un->un_pernode_dirty_mx[node_idx], RW_WRITER);
3113 	if (un->un_pernode_dirty_bm[node_idx] == NULL) {
3114 		un->un_pernode_dirty_bm[node_idx] =
3115 		    (uchar_t *)kmem_zalloc(
3116 		    (uint_t)howmany(un->un_rrd_num, NBBY), KM_SLEEP);
3117 	}
3118 	rw_exit(&un->un_pernode_dirty_mx[node_idx]);
3119 
3120 	BLK_TO_RR(end_rr, endblk, un);
3121 	BLK_TO_RR(start_rr, startblk, un);
3122 
3123 	no_change = 1;
3124 
3125 	mutex_enter(&un->un_resync_mx);
3126 	rw_enter(&un->un_pernode_dirty_mx[node_idx], RW_READER);
3127 	for (current_rr = start_rr; current_rr <= end_rr; current_rr++) {
3128 		un->un_outstanding_writes[current_rr]++;
3129 		if (!IS_PERNODE_DIRTY(source_node, current_rr, un)) {
3130 			un->un_pernode_dirty_sum[current_rr]++;
3131 			SET_PERNODE_DIRTY(source_node, current_rr, un);
3132 		}
3133 		CLR_GOING_CLEAN(current_rr, un);
3134 		if (!IS_REGION_DIRTY(current_rr, un)) {
3135 			no_change = 0;
3136 			SET_REGION_DIRTY(current_rr, un);
3137 			SET_GOING_DIRTY(current_rr, un);
3138 		} else if (IS_GOING_DIRTY(current_rr, un))
3139 			no_change = 0;
3140 	}
3141 	rw_exit(&un->un_pernode_dirty_mx[node_idx]);
3142 	mutex_exit(&un->un_resync_mx);
3143 
3144 	if (no_change) {
3145 		return (0);
3146 	}
3147 
3148 	/*
3149 	 * If we have dirty regions to commit, send a
3150 	 * message to the owning node so that the
3151 	 * in-core bitmap gets updated appropriately.
3152 	 * TODO: make this a kmem_cache pool to improve
3153 	 * alloc/free performance ???
3154 	 */
3155 	kres = (md_mn_kresult_t *)kmem_alloc(sizeof (md_mn_kresult_t),
3156 	    KM_SLEEP);
3157 	rr = (md_mn_msg_rr_dirty_t *)kmem_alloc(sizeof (md_mn_msg_rr_dirty_t),
3158 	    KM_SLEEP);
3159 
3160 resend_mmrr:
3161 	owner_node = un->un_mirror_owner;
3162 
3163 	rr->rr_mnum = mnum;
3164 	rr->rr_nodeid = md_mn_mynode_id;
3165 	rr->rr_range = (ushort_t)start_rr << 16;
3166 	rr->rr_range |= (ushort_t)end_rr & 0xFFFF;
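	/*
	 * The two region indices are packed into the single rr_range field
	 * above; the receiving node recovers them as in this small sketch
	 * (variable names are assumptions for illustration):
	 *
	 *	uint_t start = (rr->rr_range >> 16) & 0xFFFF;
	 *	uint_t end   = rr->rr_range & 0xFFFF;
	 */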
3167 
3168 	/* release readerlock before sending message */
3169 	md_unit_readerexit(ui);
3170 
3171 	rval = mdmn_ksend_message(setno, MD_MN_MSG_RR_DIRTY,
3172 	    MD_MSGF_NO_LOG|MD_MSGF_BLK_SIGNAL|MD_MSGF_DIRECTED,
3173 	    un->un_mirror_owner, (char *)rr,
3174 	    sizeof (md_mn_msg_rr_dirty_t), kres);
3175 
3176 	/* reacquire readerlock on message completion */
3177 	(void) md_unit_readerlock(ui);
3178 
3179 	/* if the message send failed, note it, and pass an error back up */
3180 	if (!MDMN_KSEND_MSG_OK(rval, kres)) {
3181 		/* if commd is gone, no point in printing a message */
3182 		if (md_mn_is_commd_present())
3183 			mdmn_ksend_show_error(rval, kres, "RR_DIRTY");
3184 		kmem_free(kres, sizeof (md_mn_kresult_t));
3185 		kmem_free(rr, sizeof (md_mn_msg_rr_dirty_t));
3186 		return (1);
3187 	}
3188 
3189 	/*
3190 	 * if the owner changed while we were sending the message, and it's
3191 	 * not us, the new mirror owner won't yet have done the right thing
3192 	 * with our data.  Let him know.  If we became the owner, we'll
3193 	 * deal with that differently below.  Note that receiving a message
3194 	 * about another node twice won't hurt anything.
3195 	 */
3196 	if (un->un_mirror_owner != owner_node && !MD_MN_MIRROR_OWNER(un))
3197 		goto resend_mmrr;
3198 
3199 	kmem_free(kres, sizeof (md_mn_kresult_t));
3200 	kmem_free(rr, sizeof (md_mn_msg_rr_dirty_t));
3201 
3202 	mutex_enter(&un->un_resync_mx);
3203 
3204 	/*
3205 	 * If we became the owner while we were sending the message,
3206 	 * we have dirty bits in the un_pernode_bm that aren't yet reflected
3207 	 * in the un_dirty_bm, as it was re-read from disk, and our bits
3208 	 * are also not reflected in the on-disk DRL.  Fix that now.
3209 	 */
3210 	if (MD_MN_MIRROR_OWNER(un)) {
3211 		rw_enter(&un->un_pernode_dirty_mx[node_idx], RW_WRITER);
3212 		mirror_copy_rr(howmany(un->un_rrd_num, NBBY),
3213 		    un->un_pernode_dirty_bm[node_idx], un->un_dirty_bm);
3214 		rw_exit(&un->un_pernode_dirty_mx[node_idx]);
3215 
3216 		un->un_resync_flg |= MM_RF_COMMITING | MM_RF_GATECLOSED;
3217 
3218 		mutex_exit(&un->un_resync_mx);
3219 		mddb_commitrec_wrapper(un->un_rr_dirty_recid);
3220 		mutex_enter(&un->un_resync_mx);
3221 
3222 		un->un_resync_flg &= ~(MM_RF_COMMITING | MM_RF_GATECLOSED);
3223 		cv_broadcast(&un->un_resync_cv);
3224 	}
3225 
3226 	for (current_rr = start_rr; current_rr <= end_rr; current_rr++)
3227 		CLR_GOING_DIRTY(current_rr, un);
3228 
3229 	mutex_exit(&un->un_resync_mx);
3230 
3231 	return (0);
3232 }
3233 
3234 int
3235 mirror_mark_resync_region_owner(struct mm_unit *un,
3236 	diskaddr_t startblk, diskaddr_t endblk, md_mn_nodeid_t source_node)
3237 {
3238 	int			no_change;
3239 	size_t			start_rr;
3240 	size_t			current_rr;
3241 	size_t			end_rr;
3242 	int			mnset = MD_MNSET_SETNO(MD_UN2SET(un));
3243 	md_mn_nodeid_t		node_idx = source_node - 1;
3244 
3245 	if (un->un_nsm < 2)
3246 		return (0);
3247 
3248 	/*
3249 	 * Check to see if we have a un_pernode_dirty_bm[] entry allocated. If
3250 	 * not, allocate it and then fill the [start..end] entries.
3251 	 * Update un_pernode_dirty_sum if we've gone 0->1.
3252 	 * Update un_dirty_bm if the corresponding entries are clear.
3253 	 */
3254 	if (mnset) {
3255 		rw_enter(&un->un_pernode_dirty_mx[node_idx], RW_WRITER);
3256 		if (un->un_pernode_dirty_bm[node_idx] == NULL) {
3257 			un->un_pernode_dirty_bm[node_idx] =
3258 			    (uchar_t *)kmem_zalloc(
3259 			    (uint_t)howmany(un->un_rrd_num, NBBY), KM_SLEEP);
3260 		}
3261 		rw_exit(&un->un_pernode_dirty_mx[node_idx]);
3262 	}
3263 
3264 	mutex_enter(&un->un_resync_mx);
3265 
3266 	if (mnset)
3267 		rw_enter(&un->un_pernode_dirty_mx[node_idx], RW_READER);
3268 
3269 	no_change = 1;
3270 	BLK_TO_RR(end_rr, endblk, un);
3271 	BLK_TO_RR(start_rr, startblk, un);
3272 	for (current_rr = start_rr; current_rr <= end_rr; current_rr++) {
3273 		if (!mnset || source_node == md_mn_mynode_id)
3274 			un->un_outstanding_writes[current_rr]++;
3275 		if (mnset) {
3276 			if (!IS_PERNODE_DIRTY(source_node, current_rr, un))
3277 				un->un_pernode_dirty_sum[current_rr]++;
3278 			SET_PERNODE_DIRTY(source_node, current_rr, un);
3279 		}
3280 		CLR_GOING_CLEAN(current_rr, un);
3281 		if (!IS_REGION_DIRTY(current_rr, un))
3282 			no_change = 0;
3283 		if (IS_GOING_DIRTY(current_rr, un))
3284 			no_change = 0;
3285 	}
3286 
3287 	if (mnset)
3288 		rw_exit(&un->un_pernode_dirty_mx[node_idx]);
3289 
3290 	if (no_change) {
3291 		mutex_exit(&un->un_resync_mx);
3292 		return (0);
3293 	}
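	/*
	 * At least one region needs a clean->dirty transition.  Register
	 * as a waiting marker and stall while a DRL commit has the gate
	 * closed; once the gate opens, re-check which regions still need
	 * to be marked dirty.
	 */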
3294 	un->un_waiting_to_mark++;
3295 	while (un->un_resync_flg & MM_RF_GATECLOSED) {
3296 		if (panicstr)
3297 			return (1);
3298 		cv_wait(&un->un_resync_cv, &un->un_resync_mx);
3299 	}
3300 	un->un_waiting_to_mark--;
3301 
3302 	no_change = 1;
3303 	for (current_rr = start_rr; current_rr <= end_rr; current_rr++) {
3304 		if (!IS_REGION_DIRTY(current_rr, un)) {
3305 			SET_REGION_DIRTY(current_rr, un);
3306 			SET_GOING_DIRTY(current_rr, un);
3307 			no_change = 0;
3308 		} else {
3309 			if (IS_GOING_DIRTY(current_rr, un))
3310 				no_change = 0;
3311 		}
3312 	}
3313 	if (no_change) {
3314 		if (un->un_waiting_to_mark == 0 || un->un_waiting_to_clear != 0)
3315 			cv_broadcast(&un->un_resync_cv);
3316 		mutex_exit(&un->un_resync_mx);
3317 		return (0);
3318 	}
3319 
3320 	un->un_resync_flg |= MM_RF_COMMIT_NEEDED;
3321 	un->un_waiting_to_commit++;
3322 	while (un->un_waiting_to_mark != 0 &&
3323 	    !(un->un_resync_flg & MM_RF_GATECLOSED)) {
3324 		if (panicstr)
3325 			return (1);
3326 		cv_wait(&un->un_resync_cv, &un->un_resync_mx);
3327 	}
3328 
3329 	if (un->un_resync_flg & MM_RF_COMMIT_NEEDED) {
3330 		un->un_resync_flg |= MM_RF_COMMITING | MM_RF_GATECLOSED;
3331 		un->un_resync_flg &= ~MM_RF_COMMIT_NEEDED;
3332 
3333 		mutex_exit(&un->un_resync_mx);
3334 		mddb_commitrec_wrapper(un->un_rr_dirty_recid);
3335 		mutex_enter(&un->un_resync_mx);
3336 
3337 		un->un_resync_flg &= ~MM_RF_COMMITING;
3338 		cv_broadcast(&un->un_resync_cv);
3339 	}
3340 	while (un->un_resync_flg & MM_RF_COMMITING) {
3341 		if (panicstr)
3342 			return (1);
3343 		cv_wait(&un->un_resync_cv, &un->un_resync_mx);
3344 	}
3345 
3346 	for (current_rr = start_rr; current_rr <= end_rr; current_rr++)
3347 		CLR_GOING_DIRTY(current_rr, un);
3348 
3349 	if (--un->un_waiting_to_commit == 0) {
3350 		un->un_resync_flg &= ~MM_RF_GATECLOSED;
3351 		cv_broadcast(&un->un_resync_cv);
3352 	}
3353 	mutex_exit(&un->un_resync_mx);
3354 
3355 	return (0);
3356 }
3357 
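/*
 * mirror_mark_resync_region:
 * -------------------------
 * Entry point for dirtying the resync regions covering [startblk..endblk]
 * ahead of a write.  In a multi-node set a non-owner node forwards the
 * request to the DRL/mirror owner; the owner (and the traditional
 * single-node case) updates the bitmaps directly.
 */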
3358 int
3359 mirror_mark_resync_region(struct mm_unit *un,
3360 	diskaddr_t startblk, diskaddr_t endblk, md_mn_nodeid_t source_node)
3361 {
3362 	int	mnset = MD_MNSET_SETNO(MD_UN2SET(un));
3363 
3364 	if (mnset && !MD_MN_MIRROR_OWNER(un)) {
3365 		return (mirror_mark_resync_region_non_owner(un, startblk,
3366 		    endblk, source_node));
3367 	} else {
3368 		return (mirror_mark_resync_region_owner(un, startblk, endblk,
3369 		    source_node));
3370 	}
3371 }
3372 
3373 int
3374 mirror_resize_resync_regions(mm_unit_t *un, diskaddr_t new_tb)
3375 {
3376 	short		*owp;
3377 	optim_resync_t	*orp;
3378 	uint_t		rr_mult = 1;
3379 	uint_t		old_nregions, new_nregions;
3380 	int		old_bm_size, new_bm_size;
3381 	size_t		size;
3382 	mddb_recid_t	recid, old_recid;
3383 	uchar_t		*old_dirty_bm;
3384 	int		i, j;
3385 	mddb_type_t	typ1;
3386 	set_t		setno = MD_UN2SET(un);
3387 	uchar_t		*old_pns;
3388 
3389 	old_nregions = un->un_rrd_num;
3390 	new_nregions = (uint_t)((new_tb/un->un_rrd_blksize) + 1);
3391 
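	/*
	 * If the grown device would need more than MD_MAX_NUM_RR resync
	 * regions, repeatedly halve the region count and double rr_mult;
	 * each new region then covers rr_mult old regions, and the region
	 * block size is scaled up by the same factor further down.
	 */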
3392 	while (new_nregions > MD_MAX_NUM_RR) {
3393 		new_nregions >>= 1;
3394 		rr_mult <<= 1;
3395 	}
3396 
3397 	new_bm_size = howmany(new_nregions, NBBY);
3398 	old_bm_size = howmany(old_nregions, NBBY);
3399 
3400 	size = new_bm_size + sizeof (*orp) - sizeof (orp->or_rr);
3401 
3402 	typ1 = (mddb_type_t)md_getshared_key(setno,
3403 	    mirror_md_ops.md_driver.md_drivername);
3404 	recid = mddb_createrec(size, typ1, RESYNC_REC,
3405 	    MD_CRO_OPTIMIZE|MD_CRO_32BIT, setno);
3406 	if (recid < 0)
3407 		return (-1);
3408 
3409 	orp = (struct optim_resync *)mddb_getrecaddr(recid);
3410 	ASSERT(orp != NULL);
3411 
3412 	orp->or_magic = OR_MAGIC;		/* Magic # */
3413 	orp->or_blksize = un->un_rrd_blksize;	/* Same block size */
3414 	orp->or_num = new_nregions;		/* New number of regions */
3415 
3416 	old_dirty_bm = un->un_dirty_bm;
3417 	un->un_dirty_bm = orp->or_rr;
3418 
3419 	kmem_free((caddr_t)un->un_goingdirty_bm, old_bm_size);
3420 	un->un_goingdirty_bm = (uchar_t *)kmem_zalloc(new_bm_size, KM_SLEEP);
3421 
3422 	kmem_free((caddr_t)un->un_goingclean_bm, old_bm_size);
3423 	un->un_goingclean_bm = (uchar_t *)kmem_zalloc(new_bm_size, KM_SLEEP);
3424 
3425 	kmem_free((caddr_t)un->un_resync_bm, old_bm_size);
3426 	un->un_resync_bm = (uchar_t *)kmem_zalloc(new_bm_size, KM_SLEEP);
3427 
3428 	owp = un->un_outstanding_writes;
3429 	un->un_outstanding_writes = (short *)kmem_zalloc(
3430 	    new_nregions * sizeof (short), KM_SLEEP);
3431 
3432 	old_pns = un->un_pernode_dirty_sum;
3433 	if (old_pns)
3434 		un->un_pernode_dirty_sum = (uchar_t *)kmem_zalloc(new_nregions,
3435 		    KM_SLEEP);
3436 
3437 	/*
3438 	 * Now translate the old records into the new
3439 	 * records
3440 	 */
3441 	for (i = 0; i < old_nregions; i++) {
3442 		/*
3443 		 * only bring forward the
3444 		 * outstanding write counters and the dirty bits and also
3445 		 * the pernode_summary counts
3446 		 */
3447 		if (!isset(old_dirty_bm, i))
3448 			continue;
3449 
3450 		setbit(un->un_dirty_bm, (i / rr_mult));
3451 		un->un_outstanding_writes[(i / rr_mult)] += owp[i];
3452 		if (old_pns)
3453 			un->un_pernode_dirty_sum[(i / rr_mult)] += old_pns[i];
3454 	}
3455 	kmem_free((caddr_t)owp, old_nregions * sizeof (short));
3456 	if (old_pns)
3457 		kmem_free((caddr_t)old_pns, old_nregions);
3458 
3459 	/*
3460 	 * Copy all non-zero un_pernode_dirty_bm[] arrays to new versions
3461 	 */
3462 	for (j = 0; j < MD_MNMAXSIDES; j++) {
3463 		rw_enter(&un->un_pernode_dirty_mx[j], RW_WRITER);
3464 		old_dirty_bm = un->un_pernode_dirty_bm[j];
3465 		if (old_dirty_bm) {
3466 			un->un_pernode_dirty_bm[j] = (uchar_t *)kmem_zalloc(
3467 			    new_bm_size, KM_SLEEP);
3468 			for (i = 0; i < old_nregions; i++) {
3469 				if (!isset(old_dirty_bm, i))
3470 					continue;
3471 
3472 				setbit(un->un_pernode_dirty_bm[j],
3473 				    (i / rr_mult));
3474 			}
3475 			kmem_free((caddr_t)old_dirty_bm, old_bm_size);
3476 		}
3477 		rw_exit(&un->un_pernode_dirty_mx[j]);
3478 	}
3479 
3480 	/* Save the old record id */
3481 	old_recid = un->un_rr_dirty_recid;
3482 
3483 	/* Update the mirror unit struct */
3484 	un->un_rr_dirty_recid = recid;
3485 	un->un_rrd_num = new_nregions;
3486 	un->un_rrd_blksize = un->un_rrd_blksize * rr_mult;
3487 
3488 	orp->or_blksize = un->un_rrd_blksize;
3489 
3490 	/*
3491 	 * NOTE: The reason there are distinct calls to mddb_commitrec_wrapper
3492 	 * instead of using mddb_commitrecs_wrapper, is that you cannot
3493 	 * atomically commit optimized records.
3494 	 */
3495 	mddb_commitrec_wrapper(recid);
3496 	mddb_commitrec_wrapper(un->c.un_record_id);
3497 	mddb_deleterec_wrapper(old_recid);
3498 	return (0);
3499 }
3500 
3501 /* lockp can be NULL for !MN disksets */
3502 int
3503 mirror_add_resync_regions(mm_unit_t *un, diskaddr_t new_tb)
3504 {
3505 	uchar_t		*old;
3506 	short		*owp;
3507 	optim_resync_t	*orp;
3508 	uint_t		old_nregions, new_nregions;
3509 	int		old_bm_size, new_bm_size;
3510 	size_t		size;
3511 	mddb_recid_t	recid, old_recid;
3512 	mddb_type_t	typ1;
3513 	set_t		setno = MD_UN2SET(un);
3514 	int		i;
3515 
3516 	old_nregions = un->un_rrd_num;
3517 	new_nregions = (uint_t)((new_tb/un->un_rrd_blksize) + 1);
3518 
3519 	new_bm_size = howmany(new_nregions, NBBY);
3520 	old_bm_size = howmany(old_nregions, NBBY);
3521 
3522 	size = new_bm_size + sizeof (*orp) - sizeof (orp->or_rr);
3523 
3524 	typ1 = (mddb_type_t)md_getshared_key(setno,
3525 	    mirror_md_ops.md_driver.md_drivername);
3526 
3527 	recid = mddb_createrec(size, typ1, RESYNC_REC,
3528 	    MD_CRO_OPTIMIZE|MD_CRO_32BIT, setno);
3529 	if (recid < 0)
3530 		return (-1);
3531 
3532 	orp = (struct optim_resync *)mddb_getrecaddr(recid);
3533 	ASSERT(orp != NULL);
3534 
3535 	orp->or_magic = OR_MAGIC;		/* Magic # */
3536 	orp->or_blksize = un->un_rrd_blksize;	/* Same block size */
3537 	orp->or_num = new_nregions;		/* New number of regions */
3538 
3539 	/* Copy the old bm over the new bm */
3540 	bcopy((caddr_t)un->un_dirty_bm, (caddr_t)orp->or_rr, old_bm_size);
3541 
3542 	/*
3543 	 * Create new bigger incore arrays, copy, and free old ones:
3544 	 *		un_goingdirty_bm
3545 	 *		un_goingclean_bm
3546 	 *		un_resync_bm
3547 	 *		un_outstanding_writes
3548 	 *		un_pernode_dirty_sum
3549 	 *		un_pernode_dirty_bm[]
3550 	 */
3551 	old = un->un_goingdirty_bm;
3552 	un->un_goingdirty_bm = (uchar_t *)kmem_zalloc(new_bm_size, KM_SLEEP);
3553 	bcopy((caddr_t)old, (caddr_t)un->un_goingdirty_bm, old_bm_size);
3554 	kmem_free((caddr_t)old, old_bm_size);
3555 
3556 	old = un->un_goingclean_bm;
3557 	un->un_goingclean_bm = (uchar_t *)kmem_zalloc(new_bm_size, KM_SLEEP);
3558 	bcopy((caddr_t)old, (caddr_t)un->un_goingclean_bm, old_bm_size);
3559 	kmem_free((caddr_t)old, old_bm_size);
3560 
3561 	old = un->un_resync_bm;
3562 	un->un_resync_bm = (uchar_t *)kmem_zalloc(new_bm_size, KM_SLEEP);
3563 	bcopy((caddr_t)old, (caddr_t)un->un_resync_bm, old_bm_size);
3564 	kmem_free((caddr_t)old, old_bm_size);
3565 
3566 	owp = un->un_outstanding_writes;
3567 	un->un_outstanding_writes = (short *)kmem_zalloc(
3568 	    (uint_t)new_nregions * sizeof (short), KM_SLEEP);
3569 	bcopy((caddr_t)owp, (caddr_t)un->un_outstanding_writes,
3570 	    old_nregions * sizeof (short));
3571 	kmem_free((caddr_t)owp, (old_nregions * sizeof (short)));
3572 
3573 	old = un->un_pernode_dirty_sum;
3574 	if (old) {
3575 		un->un_pernode_dirty_sum = (uchar_t *)kmem_zalloc(
3576 		    new_nregions, KM_SLEEP);
3577 		bcopy((caddr_t)old, (caddr_t)un->un_pernode_dirty_sum,
3578 		    old_nregions);
3579 		kmem_free((caddr_t)old, old_nregions);
3580 	}
3581 
3582 	for (i = 0; i < MD_MNMAXSIDES; i++) {
3583 		rw_enter(&un->un_pernode_dirty_mx[i], RW_WRITER);
3584 		old = un->un_pernode_dirty_bm[i];
3585 		if (old) {
3586 			un->un_pernode_dirty_bm[i] = (uchar_t *)kmem_zalloc(
3587 			    new_bm_size, KM_SLEEP);
3588 			bcopy((caddr_t)old, (caddr_t)un->un_pernode_dirty_bm[i],
3589 			    old_bm_size);
3590 			kmem_free((caddr_t)old, old_bm_size);
3591 		}
3592 		rw_exit(&un->un_pernode_dirty_mx[i]);
3593 	}
3594 
3595 	/* Save the old record id */
3596 	old_recid = un->un_rr_dirty_recid;
3597 
3598 	/* Update the mirror unit struct */
3599 	un->un_rr_dirty_recid = recid;
3600 	un->un_rrd_num = new_nregions;
3601 	un->un_dirty_bm = orp->or_rr;
3602 
3603 	/*
3604 	 * NOTE: The reason there are distinct calls to mddb_commitrec_wrapper
3605 	 * instead of using mddb_commitrecs_wrapper, is that you cannot
3606 	 * atomically commit optimized records.
3607 	 */
3608 	mddb_commitrec_wrapper(recid);
3609 	mddb_commitrec_wrapper(un->c.un_record_id);
3610 	mddb_deleterec_wrapper(old_recid);
3611 	return (0);
3612 }
3613 
3614 /*
3615  * mirror_copy_rr:
3616  * --------------
3617  * Combine the dirty record bitmap with the in-core resync bitmap. This allows
3618  * us to carry a resync over an ownership change.
3619  */
3620 void
3621 mirror_copy_rr(int sz, uchar_t *src, uchar_t *dest)
3622 {
3623 	int	i;
3624 
3625 	for (i = 0; i < sz; i++)
3626 		*dest++ |= *src++;
3627 }
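/*
 * For example, when this node takes over mirror ownership the per-node
 * dirty map is merged into the unit-wide dirty map (see
 * mirror_mark_resync_region_non_owner() above):
 *
 *	mirror_copy_rr(howmany(un->un_rrd_num, NBBY),
 *	    un->un_pernode_dirty_bm[node_idx], un->un_dirty_bm);
 */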
3628 
3629 /*
3630  * mirror_set_dirty_rr:
3631  * -------------------
3632  * Set the pernode_dirty_bm[node] entries and un_dirty_bm[] if appropriate.
3633  * For the owning node (DRL/mirror owner) update the on-disk RR if needed.
3634  * Called on every clean->dirty transition for the originating writer node.
3635  * Note: only the non-owning nodes will initiate this message and it is only
3636  * the owning node that has to process it.
3637  */
3638 int
3639 mirror_set_dirty_rr(md_mn_rr_dirty_params_t *iocp)
3640 {
3641 
3642 	minor_t			mnum = iocp->rr_mnum;
3643 	mm_unit_t		*un;
3644 	int			start = (int)iocp->rr_start;
3645 	int			end = (int)iocp->rr_end;
3646 	set_t			setno = MD_MIN2SET(mnum);
3647 	md_mn_nodeid_t		orignode = iocp->rr_nodeid;	/* 1-based */
3648 	diskaddr_t		startblk, endblk;
3649 
3650 	mdclrerror(&iocp->mde);
3651 
3652 	if ((setno >= md_nsets) ||
3653 	    (MD_MIN2UNIT(mnum) >= md_nunits)) {
3654 		return (mdmderror(&iocp->mde, MDE_INVAL_UNIT, mnum));
3655 	}
3656 
3657 	/* Must have _NO_ ioctl lock set if we update the RR on-disk */
3658 	un = mirror_getun(mnum, &iocp->mde, NO_LOCK, NULL);
3659 
3660 	if (un == NULL) {
3661 		return (mdmderror(&iocp->mde, MDE_UNIT_NOT_SETUP, mnum));
3662 	}
3663 	if (un->c.un_type != MD_METAMIRROR) {
3664 		return (mdmderror(&iocp->mde, MDE_NOT_MM, mnum));
3665 	}
3666 	if (orignode < 1 || orignode >= MD_MNMAXSIDES) {
3667 		return (mdmderror(&iocp->mde, MDE_INVAL_UNIT, mnum));
3668 	}
3669 	if (un->un_nsm < 2) {
3670 		return (0);
3671 	}
3672 
3673 	/*
3674 	 * Only process this message if we're the owner of the mirror.
3675 	 */
3676 	if (!MD_MN_MIRROR_OWNER(un)) {
3677 		return (0);
3678 	}
3679 
3680 	RR_TO_BLK(startblk, start, un);
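	/*
	 * The message carries resync region indices; convert them back to
	 * block addresses and take the normal owner marking path.
	 */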
3681 	RR_TO_BLK(endblk, end, un);
3682 	return (mirror_mark_resync_region_owner(un, startblk, endblk,
3683 	    orignode));
3684 }
3685 
3686 /*
3687  * mirror_clean_rr_bits:
3688  * --------------------
3689  * Clear the pernode_dirty_bm[node] entries which are passed in the bitmap
3690  * Once _all_ references are removed (pernode_dirty_count[x] == 0) this region
3691  * is 'cleanable' and will get flushed out by clearing un_dirty_bm[] on all
3692  * nodes. Callable from ioctl / interrupt / whatever context.
3693  * un_resync_mx is held on entry.
3694  */
3695 static void
3696 mirror_clean_rr_bits(
3697 	md_mn_rr_clean_params_t *iocp)
3698 {
3699 	minor_t			mnum = iocp->rr_mnum;
3700 	mm_unit_t		*un;
3701 	uint_t			cleared_bits;
3702 	md_mn_nodeid_t		node = iocp->rr_nodeid - 1;
3703 	md_mn_nodeid_t		orignode = iocp->rr_nodeid;
3704 	int			i, start, end;
3705 
3706 	un = mirror_getun(mnum, &iocp->mde, NO_LOCK, NULL);
3707 
3708 	cleared_bits = 0;
3709 	start = MDMN_RR_CLEAN_PARAMS_START_BIT(iocp);
3710 	end = start + MDMN_RR_CLEAN_PARAMS_DATA_BYTES(iocp) * NBBY;
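	/*
	 * The clean request carries a compact bitmap in which bit
	 * (i - start) covers resync region i.  For each region flagged by
	 * the issuing node, drop that node's reference; once no node holds
	 * a reference (un_pernode_dirty_sum[i] reaches zero) the region
	 * itself is cleared.
	 */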
3711 	rw_enter(&un->un_pernode_dirty_mx[node], RW_READER);
3712 	for (i = start; i < end; i++) {
3713 		if (isset(MDMN_RR_CLEAN_PARAMS_DATA(iocp), i - start)) {
3714 			if (IS_PERNODE_DIRTY(orignode, i, un)) {
3715 				un->un_pernode_dirty_sum[i]--;
3716 				CLR_PERNODE_DIRTY(orignode, i, un);
3717 			}
3718 			if (un->un_pernode_dirty_sum[i] == 0) {
3719 				cleared_bits++;
3720 				CLR_REGION_DIRTY(i, un);
3721 				CLR_GOING_CLEAN(i, un);
3722 			}
3723 		}
3724 	}
3725 	rw_exit(&un->un_pernode_dirty_mx[node]);
3726 	if (cleared_bits) {
3727 		/*
3728 		 * We can only be called iff we are the mirror owner, however
3729 		 * as this is a (potentially) decoupled routine the ownership
3730 		 * may have moved from us by the time we get to execute the
3731 		 * bit clearing. Hence we still need to check for being the
3732 		 * owner before flushing the DRL to the replica.
3733 		 */
3734 		if (MD_MN_MIRROR_OWNER(un)) {
3735 			mutex_exit(&un->un_resync_mx);
3736 			mddb_commitrec_wrapper(un->un_rr_dirty_recid);
3737 			mutex_enter(&un->un_resync_mx);
3738 		}
3739 	}
3740 }
3741 
3742 /*
3743  * mirror_drl_task:
3744  * ---------------
3745  * Service routine for clearing the DRL bits on a deferred MD_MN_RR_CLEAN call
3746  * We need to obtain exclusive access to the un_resync_cv and then clear the
3747  * necessary bits.
3748  * On completion, we must also free the passed in argument as it is allocated
3749  * at the end of the ioctl handler and won't be freed on completion.
3750  */
3751 static void
3752 mirror_drl_task(void *arg)
3753 {
3754 	md_mn_rr_clean_params_t	*iocp = (md_mn_rr_clean_params_t *)arg;
3755 	minor_t			mnum = iocp->rr_mnum;
3756 	mm_unit_t		*un;
3757 
3758 	un = mirror_getun(mnum, &iocp->mde, NO_LOCK, NULL);
3759 
3760 	mutex_enter(&un->un_rrp_inflight_mx);
3761 	mutex_enter(&un->un_resync_mx);
3762 	un->un_waiting_to_clear++;
3763 	while (un->un_resync_flg & MM_RF_STALL_CLEAN)
3764 		cv_wait(&un->un_resync_cv, &un->un_resync_mx);
3765 	un->un_waiting_to_clear--;
3766 
3767 	un->un_resync_flg |= MM_RF_GATECLOSED;
3768 	mirror_clean_rr_bits(iocp);
3769 	un->un_resync_flg &= ~MM_RF_GATECLOSED;
3770 	if (un->un_waiting_to_mark != 0 || un->un_waiting_to_clear != 0) {
3771 		cv_broadcast(&un->un_resync_cv);
3772 	}
3773 	mutex_exit(&un->un_resync_mx);
3774 	mutex_exit(&un->un_rrp_inflight_mx);
3775 
3776 	kmem_free((caddr_t)iocp, MDMN_RR_CLEAN_PARAMS_SIZE(iocp));
3777 }
3778 
3779 /*
3780  * mirror_set_clean_rr:
3781  * -------------------
3782  * Clear the pernode_dirty_bm[node] entries which are passed in the bitmap
3783  * Once _all_ references are removed (pernode_dirty_count[x] == 0) this region
3784  * is 'cleanable' and will get flushed out by clearing un_dirty_bm[] on all
3785  * nodes.
3786  *
3787  * Only the mirror-owner need process this message as it is the only RR updater.
3788  * Non-owner nodes issue this request, but as we have no point-to-point message
3789  * support we will receive the message on all nodes.
3790  */
3791 int
3792 mirror_set_clean_rr(md_mn_rr_clean_params_t *iocp)
3793 {
3794 
3795 	minor_t			mnum = iocp->rr_mnum;
3796 	mm_unit_t		*un;
3797 	set_t			setno = MD_MIN2SET(mnum);
3798 	md_mn_nodeid_t		node = iocp->rr_nodeid - 1;
3799 	int			can_clear = 0;
3800 	md_mn_rr_clean_params_t	*newiocp;
3801 	int			rval = 0;
3802 
3803 	mdclrerror(&iocp->mde);
3804 
3805 	if ((setno >= md_nsets) ||
3806 	    (MD_MIN2UNIT(mnum) >= md_nunits)) {
3807 		return (mdmderror(&iocp->mde, MDE_INVAL_UNIT, mnum));
3808 	}
3809 
3810 	/* Must have _NO_ ioctl lock set if we update the RR on-disk */
3811 	un = mirror_getun(mnum, &iocp->mde, NO_LOCK, NULL);
3812 
3813 	if (un == NULL) {
3814 		return (mdmderror(&iocp->mde, MDE_UNIT_NOT_SETUP, mnum));
3815 	}
3816 	if (un->c.un_type != MD_METAMIRROR) {
3817 		return (mdmderror(&iocp->mde, MDE_NOT_MM, mnum));
3818 	}
3819 	if (un->un_nsm < 2) {
3820 		return (0);
3821 	}
3822 
3823 	/*
3824 	 * Check to see if we're the mirror owner. If not, there's nothing
3825 	 * for us to do.
3826 	 */
3827 	if (!MD_MN_MIRROR_OWNER(un)) {
3828 		return (0);
3829 	}
3830 
3831 	/*
3832 	 * Process the to-be-cleaned bitmap. We need to update the pernode_dirty
3833 	 * bits and pernode_dirty_sum[n], and if, and only if, the sum goes to 0
3834 	 * we can then mark the un_dirty_bm entry as GOINGCLEAN. Alternatively
3835 	 * we can just defer this cleaning until the next process_resync_regions
3836 	 * timeout.
3837 	 */
3838 	rw_enter(&un->un_pernode_dirty_mx[node], RW_WRITER);
3839 	if (un->un_pernode_dirty_bm[node] == NULL) {
3840 		un->un_pernode_dirty_bm[node] = (uchar_t *)kmem_zalloc(
3841 		    howmany(un->un_rrd_num, NBBY), KM_SLEEP);
3842 	}
3843 	rw_exit(&un->un_pernode_dirty_mx[node]);
3844 
3845 	/*
3846 	 * See if we can simply clear the un_dirty_bm[] entries. If we're not
3847 	 * the issuing node _and_ we aren't in the process of marking/clearing
3848 	 * the RR bitmaps, we can simply update the bits as needed.
3849 	 * If we're the owning node and _not_ the issuing node, we should also
3850 	 * sync the RR if we clear any bits in it.
3851 	 */
3852 	mutex_enter(&un->un_resync_mx);
3853 	can_clear = (un->un_resync_flg & MM_RF_STALL_CLEAN) ? 0 : 1;
3854 	if (can_clear) {
3855 		un->un_resync_flg |= MM_RF_GATECLOSED;
3856 		mirror_clean_rr_bits(iocp);
3857 		un->un_resync_flg &= ~MM_RF_GATECLOSED;
3858 		if (un->un_waiting_to_mark != 0 ||
3859 		    un->un_waiting_to_clear != 0) {
3860 			cv_broadcast(&un->un_resync_cv);
3861 		}
3862 	}
3863 	mutex_exit(&un->un_resync_mx);
3864 
3865 	/*
3866 	 * If we couldn't clear the bits, due to DRL update from m_m_r_r / p_r_r
3867 	 * we must schedule a blocking call to update the DRL on this node.
3868 	 * As we're invoked from an ioctl we are going to have the original data
3869 	 * disappear (kmem_free) once we return. So, copy the data into a new
3870 	 * structure and let the taskq routine release it on completion.
3871 	 */
3872 	if (!can_clear) {
3873 		size_t	sz = MDMN_RR_CLEAN_PARAMS_SIZE(iocp);
3874 
3875 		newiocp = (md_mn_rr_clean_params_t *)kmem_alloc(sz, KM_SLEEP);
3876 
3877 		bcopy(iocp, newiocp, sz);
3878 
3879 		if (ddi_taskq_dispatch(un->un_drl_task, mirror_drl_task,
3880 		    newiocp, DDI_NOSLEEP) != DDI_SUCCESS) {
3881 			kmem_free(newiocp, sz);
3882 			rval = ENOMEM;	/* probably starvation */
3883 		}
3884 	}
3885 
3886 	return (rval);
3887 }
3888