xref: /titanic_41/usr/src/uts/common/io/lvm/mirror/mirror.c (revision ab15e531679da9ab75e52c1c16011e74e3664a23)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
24  * Copyright (c) 2011 Bayard G. Bell. All rights reserved.
25  */
26 
27 #include <sys/param.h>
28 #include <sys/systm.h>
29 #include <sys/conf.h>
30 #include <sys/file.h>
31 #include <sys/user.h>
32 #include <sys/uio.h>
33 #include <sys/t_lock.h>
34 #include <sys/buf.h>
35 #include <sys/dkio.h>
36 #include <sys/vtoc.h>
37 #include <sys/kmem.h>
38 #include <vm/page.h>
39 #include <sys/cmn_err.h>
40 #include <sys/sysmacros.h>
41 #include <sys/types.h>
42 #include <sys/mkdev.h>
43 #include <sys/stat.h>
44 #include <sys/open.h>
45 #include <sys/modctl.h>
46 #include <sys/ddi.h>
47 #include <sys/sunddi.h>
48 #include <sys/debug.h>
49 #include <sys/dklabel.h>
50 #include <vm/hat.h>
51 #include <sys/lvm/mdvar.h>
52 #include <sys/lvm/md_mirror.h>
53 #include <sys/lvm/md_convert.h>
54 #include <sys/lvm/md_mddb.h>
55 #include <sys/esunddi.h>
56 
57 #include <sys/sysevent/eventdefs.h>
58 #include <sys/sysevent/svm.h>
59 #include <sys/lvm/mdmn_commd.h>
60 #include <sys/avl.h>
61 
62 md_ops_t		mirror_md_ops;
63 #ifndef	lint
64 md_ops_t		*md_interface_ops = &mirror_md_ops;
65 #endif
66 
67 extern mdq_anchor_t	md_done_daemon;
68 extern mdq_anchor_t	md_mstr_daemon;
69 extern mdq_anchor_t	md_mirror_daemon;
70 extern mdq_anchor_t	md_mirror_io_daemon;
71 extern mdq_anchor_t	md_mirror_rs_daemon;
72 extern mdq_anchor_t	md_mhs_daemon;
73 
74 extern unit_t		md_nunits;
75 extern set_t		md_nsets;
76 extern md_set_t		md_set[];
77 
78 extern int		md_status;
79 extern clock_t		md_hz;
80 
81 extern md_krwlock_t	md_unit_array_rw;
82 extern kmutex_t		md_mx;
83 extern kcondvar_t	md_cv;
84 extern int		md_mtioctl_cnt;
85 
86 daemon_request_t	mirror_timeout;
87 static daemon_request_t	hotspare_request;
88 static daemon_request_t	mn_hs_request[MD_MAXSETS];	/* Multinode hs req */
89 
90 int	md_mirror_mcs_buf_off;
91 
92 /* Flags for mdmn_ksend_message to allow debugging */
93 int	md_mirror_msg_flags;
94 
95 #ifdef DEBUG
96 /* Flag to switch on debug messages */
97 int	mirror_debug_flag = 0;
98 #endif
99 
100 /*
101  * Struct used to hold a count of DMR reads and the timestamp of the last DMR read.
102  * It is used to verify, using a debugger, that the DMR read ioctl has been
103  * executed.
104  */
105 dmr_stats_t	mirror_dmr_stats = {0, 0};
106 
107 /*
108  * Mutex protecting list of non-failfast drivers.
109  */
110 static kmutex_t	non_ff_drv_mutex;
111 extern char	**non_ff_drivers;
112 
113 extern major_t	md_major;
114 
115 /*
116  * Write-On-Write memory pool.
117  */
118 static void		copy_write_cont(wowhdr_t *wowhdr);
119 static kmem_cache_t	*mirror_wowblk_cache = NULL;
120 static int		md_wowbuf_size = 16384;
121 static size_t		md_wowblk_size;
122 
123 /*
124  * This is a flag that allows:
125  *	- disabling the write-on-write mechanism.
126  *	- logging occurrences of write-on-write
127  *	- switching wow handling procedure processing
128  * md_mirror_wow_cnt counts occurrences of WOW.
129  */
130 static uint_t	md_mirror_wow_flg = 0;
131 static int	md_mirror_wow_cnt = 0;
132 
133 /*
134  * Tunable to enable/disable dirty region
135  * processing when closing down a mirror.
136  */
137 static int	new_resync = 1;
138 kmem_cache_t	*mirror_parent_cache = NULL;
139 kmem_cache_t	*mirror_child_cache = NULL;
140 
141 extern int	md_ff_disable;		/* disable failfast */
142 
143 static int	mirror_map_write(mm_unit_t *, md_mcs_t *, md_mps_t *, int);
144 static void	mirror_read_strategy(buf_t *, int, void *);
145 static void	mirror_write_strategy(buf_t *, int, void *);
146 static void	become_owner(daemon_queue_t *);
147 static int	mirror_done(struct buf *cb);
148 static int	mirror_done_common(struct buf *cb);
149 static void	clear_retry_error(struct buf *cb);
150 
151 /*
152  * patchables
153  */
154 int	md_min_rr_size	= 200;	/* 200 blocks, or 100k */
155 int	md_def_num_rr	= 1000;	/* Default number of dirty regions */
156 
157 /*
158  * patchable to change delay before rescheduling mirror ownership request.
159  * Value is clock ticks, default 0.5 seconds
160  */
161 clock_t	md_mirror_owner_to = 500000;
162 
163 /*ARGSUSED1*/
164 static int
165 mirror_parent_constructor(void *p, void *d1, int d2)
166 {
167 	mutex_init(&((md_mps_t *)p)->ps_mx, NULL, MUTEX_DEFAULT, NULL);
168 	return (0);
169 }
170 
171 static void
172 mirror_parent_init(md_mps_t *ps)
173 {
174 	bzero(ps, offsetof(md_mps_t, ps_mx));
175 	bzero(&ps->ps_overlap_node, sizeof (avl_node_t));
176 }
177 
178 /*ARGSUSED1*/
179 static void
180 mirror_parent_destructor(void *p, void *d)
181 {
182 	mutex_destroy(&((md_mps_t *)p)->ps_mx);
183 }
184 
185 /*ARGSUSED1*/
186 static int
187 mirror_child_constructor(void *p, void *d1, int d2)
188 {
189 	bioinit(&((md_mcs_t *)p)->cs_buf);
190 	return (0);
191 }
192 
193 void
194 mirror_child_init(md_mcs_t *cs)
195 {
196 	cs->cs_ps = NULL;
197 	cs->cs_mdunit = 0;
198 	md_bioreset(&cs->cs_buf);
199 }
200 
201 /*ARGSUSED1*/
202 static void
203 mirror_child_destructor(void *p, void *d)
204 {
205 	biofini(&((md_mcs_t *)p)->cs_buf);
206 }
207 
208 static void
209 mirror_wowblk_init(wowhdr_t *p)
210 {
211 	bzero(p, md_wowblk_size);
212 }
213 
214 static void
215 send_poke_hotspares_msg(daemon_request_t *drq)
216 {
217 	int			rval;
218 	int			nretries = 0;
219 	md_mn_msg_pokehsp_t	pokehsp;
220 	md_mn_kresult_t		*kresult;
221 	set_t			setno = (set_t)drq->dq.qlen;
222 
223 	pokehsp.pokehsp_setno = setno;
224 
225 	kresult = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
226 
227 retry_sphmsg:
228 	rval = mdmn_ksend_message(setno, MD_MN_MSG_POKE_HOTSPARES,
229 	    MD_MSGF_NO_LOG | MD_MSGF_NO_BCAST, 0, (char *)&pokehsp,
230 	    sizeof (pokehsp), kresult);
231 
232 	if (!MDMN_KSEND_MSG_OK(rval, kresult)) {
233 		mdmn_ksend_show_error(rval, kresult, "POKE_HOTSPARES");
234 		/* If we're shutting down already, pause things here. */
235 		if (kresult->kmmr_comm_state == MDMNE_RPC_FAIL) {
236 			while (!md_mn_is_commd_present()) {
237 				delay(md_hz);
238 			}
239 			/*
240 			 * commd has become reachable again, so retry once.
241 			 * If this fails we'll panic as the system is in an
242 			 * unexpected state.
243 			 */
244 			if (nretries++ == 0)
245 				goto retry_sphmsg;
246 		}
247 		cmn_err(CE_PANIC,
248 		    "ksend_message failure: POKE_HOTSPARES");
249 	}
250 	kmem_free(kresult, sizeof (md_mn_kresult_t));
251 
252 	/* Allow further requests to use this set's queue structure */
253 	mutex_enter(&drq->dr_mx);
254 	drq->dr_pending = 0;
255 	mutex_exit(&drq->dr_mx);
256 }
257 
258 /*
259  * Send a poke_hotspares message to the master node. To avoid swamping the
260  * commd handler with requests we only send a message if there is not one
261  * already outstanding. We punt the request to a separate thread context as
262  * we cannot afford to block waiting on the request to be serviced. This is
263  * essential when a reconfig cycle is in progress as any open() of a multinode
264  * metadevice may result in a livelock.
265  */
266 static void
267 send_poke_hotspares(set_t setno)
268 {
269 	daemon_request_t	*drq = &mn_hs_request[setno];
270 
271 	mutex_enter(&drq->dr_mx);
272 	if (drq->dr_pending == 0) {
273 		drq->dr_pending = 1;
274 		drq->dq.qlen = (int)setno;
275 		daemon_request(&md_mhs_daemon,
276 		    send_poke_hotspares_msg, (daemon_queue_t *)drq, REQ_OLD);
277 	}
278 	mutex_exit(&drq->dr_mx);
279 }
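/*
 * Illustrative sketch (not part of the driver source): the pending-flag plus
 * daemon-queue pattern used above.  A caller that detects an errored
 * component in a multi-node set simply does:
 *
 *	send_poke_hotspares(setno);
 *
 * send_poke_hotspares() sets drq->dr_pending and hands the request to the
 * md_mhs_daemon queue; send_poke_hotspares_msg() then runs in daemon context,
 * sends MD_MN_MSG_POKE_HOTSPARES to the master via mdmn_ksend_message(), and
 * finally clears dr_pending so the next request for that set can be queued.
 */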
280 
281 void
282 mirror_set_sm_state(
283 	mm_submirror_t		*sm,
284 	mm_submirror_ic_t	*smic,
285 	sm_state_t		newstate,
286 	int			force)
287 {
288 	int			compcnt;
289 	int			i;
290 	int			errcnt;
291 	sm_state_t		origstate;
292 	md_m_shared_t		*shared;
293 
294 	if (force) {
295 		sm->sm_state = newstate;
296 		uniqtime32(&sm->sm_timestamp);
297 		return;
298 	}
299 
300 	origstate = newstate;
301 
302 	compcnt = (*(smic->sm_get_component_count))(sm->sm_dev, sm);
303 	for (i = 0, errcnt = 0; i < compcnt; i++) {
304 		shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
305 		    (sm->sm_dev, sm, i);
306 		if (shared->ms_state & (CS_ERRED | CS_LAST_ERRED))
307 			newstate |= SMS_COMP_ERRED;
308 		if (shared->ms_state & (CS_RESYNC))
309 			newstate |= SMS_COMP_RESYNC;
310 		if (shared->ms_state & CS_ERRED)
311 			errcnt++;
312 	}
313 
314 	if ((newstate & (SMS_COMP_ERRED | SMS_COMP_RESYNC)) != 0)
315 		newstate &= ~origstate;
316 
317 	if (errcnt == compcnt)
318 		newstate |= SMS_ALL_ERRED;
319 	else
320 		newstate &= ~SMS_ALL_ERRED;
321 
322 	sm->sm_state = newstate;
323 	uniqtime32(&sm->sm_timestamp);
324 }
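/*
 * Worked example (illustrative only): for a three-component submirror with
 * one component in CS_ERRED and one in CS_RESYNC, the loop above sets
 * SMS_COMP_ERRED and SMS_COMP_RESYNC in newstate and leaves errcnt == 1.
 * Since errcnt != compcnt, SMS_ALL_ERRED is cleared rather than set; only if
 * all three components were CS_ERRED would the submirror become
 * SMS_ALL_ERRED.
 */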
325 
326 static int
327 mirror_geterror(mm_unit_t *un, int *smi, int *cip, int clr_error,
328 							int frm_probe)
329 {
330 	mm_submirror_t		*sm;
331 	mm_submirror_ic_t	*smic;
332 	md_m_shared_t		*shared;
333 	int			ci;
334 	int			i;
335 	int			compcnt;
336 	int			open_comp; /* flag for open component */
337 
338 	for (i = *smi; i < NMIRROR; i++) {
339 		sm = &un->un_sm[i];
340 		smic = &un->un_smic[i];
341 
342 		if (!SMS_IS(sm, SMS_INUSE))
343 			continue;
344 
345 		compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, un);
346 		for (ci = *cip; ci < compcnt; ci++) {
347 			shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
348 			    (sm->sm_dev, sm, ci);
349 			/*
350 			 * If called from any routine but probe, we check for
351 			 * the MDM_S_ISOPEN flag. Since probe does a pseudo
352 			 * open, it sets the MDM_S_PROBEOPEN flag and we test
353 			 * for that flag instead. The two tests are mutually
353 			 * exclusive.
354 			 */
355 			open_comp = (frm_probe) ?
356 			    (shared->ms_flags & MDM_S_PROBEOPEN):
357 			    (shared->ms_flags & MDM_S_ISOPEN);
358 			if (((shared->ms_flags & MDM_S_IOERR || !open_comp) &&
359 			    ((shared->ms_state == CS_OKAY) ||
360 			    (shared->ms_state == CS_RESYNC))) ||
361 			    (!open_comp &&
362 			    (shared->ms_state == CS_LAST_ERRED))) {
363 				if (clr_error) {
364 					shared->ms_flags &= ~MDM_S_IOERR;
365 				}
366 				*cip = ci;
367 				*smi = i;
368 				return (1);
369 			}
370 
371 			if (clr_error && (shared->ms_flags & MDM_S_IOERR)) {
372 				shared->ms_flags &= ~MDM_S_IOERR;
373 			}
374 		}
375 
376 		*cip = 0;
377 	}
378 	return (0);
379 }
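/*
 * Illustrative usage sketch: mirror_geterror() is an iterator over errored
 * components; *smi and *cip carry the scan position between calls.  The
 * typical loop (as used in mirror_open_all_devs() below) is:
 *
 *	smi = 0;
 *	ci = 0;
 *	while (mirror_geterror(un, &smi, &ci, 1, 0) != 0) {
 *		... record or repair component (smi, ci) ...
 *		ci++;
 *	}
 */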
380 
381 /*ARGSUSED*/
382 static void
383 mirror_run_queue(void *d)
384 {
385 	if (!(md_status & MD_GBL_DAEMONS_LIVE))
386 		md_daemon(1, &md_done_daemon);
387 }
388 /*
389  * check_comp_4_hotspares
390  *
391  * This function attempts to allocate a hotspare for this component if the
392  * component is in error. In a MN set, the function can be called in 2 modes.
393  * It can be called either when a component error has been detected or when a
394  * new hotspare has been allocated. In this case, MD_HOTSPARE_XMIT is set
395  * in flags and the request is sent to all nodes.
396  * The handler on each of the nodes then calls this function with
397  * MD_HOTSPARE_XMIT unset and the hotspare allocation is then performed.
398  *
399  * For non-MN sets the function simply attempts to allocate a hotspare.
400  *
401  * On entry, the following locks are held
402  *	mirror_md_ops.md_link_rw (if flags has MD_HOTSPARE_LINKHELD set)
403  *	md_unit_writerlock
404  *
405  * Returns	0 if ok
406  *		1 if the unit containing the component has been cleared while
407  *		  the mdmn_ksend_message() was being executed
408  */
409 extern int
410 check_comp_4_hotspares(
411 	mm_unit_t	*un,
412 	int		smi,
413 	int		ci,
414 	uint_t		flags,
415 	mddb_recid_t	hs_id,	/* Only used by MN disksets */
416 	IOLOCK		*lockp	/* can be NULL */
417 )
418 {
419 	mm_submirror_t		*sm;
420 	mm_submirror_ic_t	*smic;
421 	md_m_shared_t		*shared;
422 	mddb_recid_t		recids[6];
423 	minor_t			mnum;
424 	intptr_t		(*hs_dev)();
425 	void			(*hs_done)();
426 	void			*hs_data;
427 	md_error_t		mde = mdnullerror;
428 	set_t			setno;
429 	md_mn_msg_allochsp_t	allochspmsg;
430 	md_mn_kresult_t		*kresult;
431 	mm_unit_t		*new_un;
432 	int			rval;
433 	int			nretries = 0;
434 
435 	mnum = MD_SID(un);
436 	setno = MD_UN2SET(un);
437 	sm = &un->un_sm[smi];
438 	smic = &un->un_smic[smi];
439 	shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
440 	    (sm->sm_dev, sm, ci);
441 
442 	if (shared->ms_state != CS_ERRED)
443 		return (0);
444 
445 	/* Don't start a new component resync if a resync is already running. */
446 	if (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE)
447 		return (0);
448 
449 	if (MD_MNSET_SETNO(setno) && (flags & MD_HOTSPARE_XMIT)) {
450 		uint_t		msgflags;
451 		md_mn_msgtype_t	msgtype;
452 
453 		/* Send allocate hotspare message to all nodes */
454 
455 		allochspmsg.msg_allochsp_mnum = un->c.un_self_id;
456 		allochspmsg.msg_allochsp_sm = smi;
457 		allochspmsg.msg_allochsp_comp = ci;
458 		allochspmsg.msg_allochsp_hs_id = shared->ms_hs_id;
459 
460 		/*
461 		 * Before calling mdmn_ksend_message(), release locks
462 		 * Can never be in the context of an ioctl.
463 		 */
464 		md_unit_writerexit(MDI_UNIT(mnum));
465 		if (flags & MD_HOTSPARE_LINKHELD)
466 			rw_exit(&mirror_md_ops.md_link_rw.lock);
467 #ifdef DEBUG
468 		if (mirror_debug_flag)
469 			printf("send alloc hotspare, flags="
470 			    "0x%x %x, %x, %x, %x\n", flags,
471 			    allochspmsg.msg_allochsp_mnum,
472 			    allochspmsg.msg_allochsp_sm,
473 			    allochspmsg.msg_allochsp_comp,
474 			    allochspmsg.msg_allochsp_hs_id);
475 #endif
476 		if (flags & MD_HOTSPARE_WMUPDATE) {
477 			msgtype  = MD_MN_MSG_ALLOCATE_HOTSPARE2;
478 			/*
479 			 * When coming from an update of watermarks, there
480 			 * must already be a message logged that triggered
481 			 * this action. So, no need to log this message, too.
482 			 */
483 			msgflags = MD_MSGF_NO_LOG;
484 		} else {
485 			msgtype  = MD_MN_MSG_ALLOCATE_HOTSPARE;
486 			msgflags = MD_MSGF_DEFAULT_FLAGS;
487 		}
488 
489 		kresult = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
490 
491 cc4hs_msg:
492 		rval = mdmn_ksend_message(setno, msgtype, msgflags, 0,
493 		    (char *)&allochspmsg, sizeof (allochspmsg),
494 		    kresult);
495 
496 		if (!MDMN_KSEND_MSG_OK(rval, kresult)) {
497 #ifdef DEBUG
498 			if (mirror_debug_flag)
499 				mdmn_ksend_show_error(rval, kresult,
500 				    "ALLOCATE HOTSPARE");
501 #endif
502 			/*
503 			 * If message is sent ok but exitval indicates an error
504 			 * it must be because the mirror has been cleared. In
505 			 * this case re-obtain lock and return an error
506 			 */
507 			if ((rval == 0) && (kresult->kmmr_exitval != 0)) {
508 				if (flags & MD_HOTSPARE_LINKHELD) {
509 					rw_enter(&mirror_md_ops.md_link_rw.lock,
510 					    RW_READER);
511 				}
512 				kmem_free(kresult, sizeof (md_mn_kresult_t));
513 				return (1);
514 			}
515 			/* If we're shutting down already, pause things here. */
516 			if (kresult->kmmr_comm_state == MDMNE_RPC_FAIL) {
517 				while (!md_mn_is_commd_present()) {
518 					delay(md_hz);
519 				}
520 				/*
521 				 * commd has become reachable again, so retry
522 				 * once. If this fails we'll panic as the
523 				 * system is in an unexpected state.
524 				 */
525 				if (nretries++ == 0)
526 					goto cc4hs_msg;
527 			}
528 			cmn_err(CE_PANIC,
529 			    "ksend_message failure: ALLOCATE_HOTSPARE");
530 		}
531 		kmem_free(kresult, sizeof (md_mn_kresult_t));
532 
533 		/*
534 		 * re-obtain the locks
535 		 */
536 		if (flags & MD_HOTSPARE_LINKHELD)
537 			rw_enter(&mirror_md_ops.md_link_rw.lock, RW_READER);
538 		new_un = md_unit_writerlock(MDI_UNIT(mnum));
539 
540 		/*
541 		 * As we had to release the locks in order to send the
542 		 * message to all nodes, we need to check to see if the
543 		 * unit has changed. If it has we release the writerlock
544 		 * and return fail.
545 		 */
546 		if ((new_un != un) || (un->c.un_type != MD_METAMIRROR)) {
547 			md_unit_writerexit(MDI_UNIT(mnum));
548 			return (1);
549 		}
550 	} else {
551 		if (MD_MNSET_SETNO(setno)) {
552 			/*
553 			 * If 2 or more nodes simultaneously see a
554 			 * component failure, these nodes will each
555 			 * send an ALLOCATE_HOTSPARE[2] message.
556 			 * The first message will allocate the hotspare
557 			 * and the subsequent messages should do nothing.
558 			 *
559 			 * If a slave node doesn't have a hotspare allocated
560 			 * at the time the message is initiated, then the
561 			 * passed in hs_id will be 0.  If the node
562 			 * executing this routine has a component shared
563 			 * ms_hs_id of non-zero, but the message shows a
564 			 * hs_id of 0, then just return since a hotspare
565 			 * has already been allocated for this failing
566 			 * component.  When the slave node returns from
567 			 * the ksend_message the hotspare will have
568 			 * already been allocated.
569 			 *
570 			 * If the slave node does send an hs_id of non-zero,
571 			 * and the slave node's hs_id matches this node's
572 			 * ms_hs_id, then the hotspare has error'd and
573 			 * should be replaced.
574 			 *
575 			 * If the slave node sends an hs_id of non-zero and
576 			 * this node has a different shared ms_hs_id, then
577 			 * just return since this hotspare has already
578 			 * been hotspared.
579 			 */
580 			if (shared->ms_hs_id != 0) {
581 				if (hs_id == 0) {
582 #ifdef DEBUG
583 					if (mirror_debug_flag) {
584 						printf("check_comp_4_hotspares"
585 						    "(NOXMIT), short circuit "
586 						    "hs_id=0x%x, "
587 						    "ms_hs_id=0x%x\n",
588 						    hs_id, shared->ms_hs_id);
589 					}
590 #endif
591 					return (0);
592 				}
593 				if (hs_id != shared->ms_hs_id) {
594 #ifdef DEBUG
595 					if (mirror_debug_flag) {
596 						printf("check_comp_4_hotspares"
597 						    "(NOXMIT), short circuit2 "
598 						    "hs_id=0x%x, "
599 						    "ms_hs_id=0x%x\n",
600 						    hs_id, shared->ms_hs_id);
601 					}
602 #endif
603 					return (0);
604 				}
605 			}
606 		}
607 
608 		sm = &un->un_sm[smi];
609 		hs_dev = md_get_named_service(sm->sm_dev, 0,
610 		    "hotspare device", 0);
611 		if ((*hs_dev)(sm->sm_dev, 0, ci, recids, 6, &hs_done,
612 		    &hs_data) != 0)
613 			return (0);
614 
615 		/*
616 		 * set_sm_comp_state() commits the modified records.
617 		 * As we don't transmit the changes, no need to drop the lock.
618 		 */
619 		set_sm_comp_state(un, smi, ci, CS_RESYNC, recids,
620 		    MD_STATE_NO_XMIT, (IOLOCK *)NULL);
621 
622 		(*hs_done)(sm->sm_dev, hs_data);
623 
624 		mirror_check_failfast(mnum);
625 
626 		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_HOTSPARED, SVM_TAG_METADEVICE,
627 		    setno, MD_SID(un));
628 
629 		/*
630 		 * For a multi-node set we need to reset the un_rs_type,
631 		 * un_rs_resync_done and un_rs_resync_2_do fields as the
632 		 * hot-spare resync must copy all applicable data.
633 		 */
634 		if (MD_MNSET_SETNO(setno)) {
635 			un->un_rs_type = MD_RS_NONE;
636 			un->un_rs_resync_done = 0;
637 			un->un_rs_resync_2_do = 0;
638 		}
639 
640 		/*
641 		 * Must drop writer lock since mirror_resync_unit will
642 		 * open devices and must be able to grab readerlock.
643 		 * Don't need to drop IOLOCK since any descendent routines
644 		 * calling ksend_messages will drop the IOLOCK as needed.
645 		 *
646 		 */
647 		if (lockp) {
648 			md_ioctl_writerexit(lockp);
649 		} else {
650 			md_unit_writerexit(MDI_UNIT(mnum));
651 		}
652 
653 		/* start resync */
654 		(void) mirror_resync_unit(mnum, NULL, &mde, lockp);
655 
656 		if (lockp) {
657 			new_un = md_ioctl_writerlock(lockp, MDI_UNIT(mnum));
658 		} else {
659 			new_un = md_unit_writerlock(MDI_UNIT(mnum));
660 		}
661 	}
662 	return (0);
663 }
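/*
 * Illustrative call sketch: in a multi-node set the detecting node calls this
 * routine with MD_HOTSPARE_XMIT set (see check_unit_4_hotspares() below),
 * which broadcasts the ALLOCATE_HOTSPARE[2] message; each receiving node then
 * re-enters the routine with MD_HOTSPARE_XMIT clear to do the local
 * allocation:
 *
 *	if (check_comp_4_hotspares(un, i, ci,
 *	    (MD_HOTSPARE_XMIT | flags),
 *	    shared->ms_hs_id, (IOLOCK *)NULL) != 0)
 *		return (1);
 *
 * For a non-MN set only the local allocation path is taken.
 */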
664 
665 /*
666  * check_unit_4_hotspares
667  *
668  * For a given mirror, allocate hotspares, if available for any components
669  * that are in error
670  *
671  * Returns	0 if ok
672  *		1 if check_comp_4_hotspares returns non-zero. This will only
673  *		  happen for a MN unit where the unit has been cleared while
674  *		  the allocate hotspare message is sent to all nodes.
675  */
676 static int
677 check_unit_4_hotspares(mm_unit_t *un, int flags)
678 {
679 	mm_submirror_t		*sm;
680 	mm_submirror_ic_t	*smic;
681 	int			ci;
682 	int			i;
683 	int			compcnt;
684 
685 	if (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE)
686 		return (0);
687 
688 	for (i = 0; i < NMIRROR; i++) {
689 		sm = &un->un_sm[i];
690 		smic = &un->un_smic[i];
691 		if (!SMS_IS(sm, SMS_INUSE))
692 			continue;
693 		compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, sm);
694 		for (ci = 0; ci < compcnt; ci++) {
695 			md_m_shared_t		*shared;
696 
697 			shared = (md_m_shared_t *)
698 			    (*(smic->sm_shared_by_indx))(sm->sm_dev, sm, ci);
699 			/*
700 			 * Never called from ioctl context, so pass in
701 			 * (IOLOCK *)NULL.  Pass through flags from calling
702 			 * routine, also setting XMIT flag.
703 			 */
704 			if (check_comp_4_hotspares(un, i, ci,
705 			    (MD_HOTSPARE_XMIT | flags),
706 			    shared->ms_hs_id, (IOLOCK *)NULL) != 0)
707 				return (1);
708 		}
709 	}
710 	return (0);
711 }
712 
713 static void
714 check_4_hotspares(daemon_request_t *drq)
715 {
716 	mdi_unit_t	*ui;
717 	mm_unit_t	*un;
718 	md_link_t	*next;
719 	int		x;
720 
721 	mutex_enter(&drq->dr_mx);	/* clear up front so can poke */
722 	drq->dr_pending = 0;		/* again in low level routine if */
723 	mutex_exit(&drq->dr_mx);	/* something found to do	*/
724 
725 	/*
726 	 * Used to have a problem here. The disksets weren't marked as being
727 	 * MNHOLD. This opened a window where we could be searching for
728 	 * hotspares and have the disk set unloaded (released) from under
729 	 * us causing a panic in stripe_component_count().
730 	 * The way to prevent that is to mark the set MNHOLD which prevents
731 	 * any diskset from being released while we are scanning the mirrors,
732 	 * submirrors and components.
733 	 */
734 
735 	for (x = 0; x < md_nsets; x++)
736 		md_holdset_enter(x);
737 
738 	rw_enter(&mirror_md_ops.md_link_rw.lock, RW_READER);
739 	for (next = mirror_md_ops.md_head; next != NULL; next = next->ln_next) {
740 		ui = MDI_UNIT(next->ln_id);
741 
742 		un = (mm_unit_t *)md_unit_readerlock(ui);
743 
744 		/*
745 		 * Only check the unit if we are the master for this set
746 		 * For an MN set, poke_hotspares() is only effective on the
747 		 * master
748 		 */
749 		if (MD_MNSET_SETNO(MD_UN2SET(un)) &&
750 		    md_set[MD_UN2SET(un)].s_am_i_master == 0) {
751 			md_unit_readerexit(ui);
752 			continue;
753 		}
754 		if (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) {
755 			md_unit_readerexit(ui);
756 			continue;
757 		}
758 		md_unit_readerexit(ui);
759 
760 		un = (mm_unit_t *)md_unit_writerlock(ui);
761 		/*
762 		 * check_unit_4_hotspares will exit 1 if the unit has been
763 		 * removed during the process of allocating the hotspare.
764 		 * This can only happen for a MN metadevice. If unit no longer
765 		 * exists, no need to release writerlock
766 		 */
767 		if (check_unit_4_hotspares(un, MD_HOTSPARE_LINKHELD) == 0)
768 			md_unit_writerexit(ui);
769 		else {
770 			/*
771 			 * If check_unit_4_hotspares failed, queue another
772 			 * request and break out of this one
773 			 */
774 			(void) poke_hotspares();
775 			break;
776 		}
777 	}
778 	rw_exit(&mirror_md_ops.md_link_rw.lock);
779 
780 	for (x = 0; x < md_nsets; x++)
781 		md_holdset_exit(x);
782 }
783 
784 /*
785  * poke_hotspares
786  *
787  * If there is not a poke_hotspares request already pending, queue a request
788  * to call check_4_hotspares(). This will scan all mirrors and attempt to
789  * allocate hotspares for all components in error.
790  */
791 int
792 poke_hotspares()
793 {
794 	mutex_enter(&hotspare_request.dr_mx);
795 	if (hotspare_request.dr_pending == 0) {
796 		hotspare_request.dr_pending = 1;
797 		daemon_request(&md_mhs_daemon,
798 		    check_4_hotspares, (daemon_queue_t *)&hotspare_request,
799 		    REQ_OLD);
800 	}
801 	mutex_exit(&hotspare_request.dr_mx);
802 	return (0);
803 }
804 
805 static void
806 free_all_ecomps(err_comp_t *ecomp)
807 {
808 	err_comp_t	*d;
809 
810 	while (ecomp != NULL) {
811 		d = ecomp;
812 		ecomp = ecomp->ec_next;
813 		kmem_free(d, sizeof (err_comp_t));
814 	}
815 }
816 
817 /*
818  * NAME: mirror_openfail_console_info
819  *
820  * DESCRIPTION: Prints an informative message to the console when the mirror
821  *		cannot be opened.
822  *
823  * PARAMETERS: mm_unit_t	un - pointer to mirror unit structure
824  *	       int		smi - submirror index
825  *	       int		ci - component index
826  */
827 
828 void
829 mirror_openfail_console_info(mm_unit_t *un, int smi, int ci)
830 {
831 	void (*get_dev)();
832 	ms_cd_info_t cd;
833 	md_dev64_t tmpdev;
834 
835 	tmpdev = un->un_sm[smi].sm_dev;
836 	get_dev = (void (*)())md_get_named_service(tmpdev, 0, "get device", 0);
837 	if (get_dev != NULL) {
838 		(void) (*get_dev)(tmpdev, smi, ci, &cd);
839 		cmn_err(CE_WARN, "md %s: open error on %s",
840 		    md_shortname(MD_SID(un)), md_devname(MD_UN2SET(un),
841 		    cd.cd_dev, NULL, 0));
842 	} else {
843 		cmn_err(CE_WARN, "md %s: open error",
844 		    md_shortname(MD_SID(un)));
845 	}
846 }
847 
848 static int
849 mirror_close_all_devs(mm_unit_t *un, int md_cflags)
850 {
851 	int i;
852 	md_dev64_t dev;
853 
854 	for (i = 0; i < NMIRROR; i++) {
855 		if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
856 			continue;
857 		dev = un->un_sm[i].sm_dev;
858 		md_layered_close(dev, md_cflags);
859 	}
860 	return (0);
861 }
862 
863 /*
864  * Keep track of drivers that don't support failfast.  We use this so that
865  * we only log one diagnostic message for each of these drivers, no matter
866  * how many times we run the mirror_check_failfast function.
867  * Return 1 if this is a new driver that does not support failfast,
868  * return 0 if we have already seen this non-failfast driver.
869  */
870 static int
871 new_non_ff_driver(const char *s)
872 {
873 	mutex_enter(&non_ff_drv_mutex);
874 	if (non_ff_drivers == NULL) {
875 		non_ff_drivers = (char **)kmem_alloc(2 * sizeof (char *),
876 		    KM_NOSLEEP);
877 		if (non_ff_drivers == NULL) {
878 			mutex_exit(&non_ff_drv_mutex);
879 			return (1);
880 		}
881 
882 		non_ff_drivers[0] = (char *)kmem_alloc(strlen(s) + 1,
883 		    KM_NOSLEEP);
884 		if (non_ff_drivers[0] == NULL) {
885 			kmem_free(non_ff_drivers, 2 * sizeof (char *));
886 			non_ff_drivers = NULL;
887 			mutex_exit(&non_ff_drv_mutex);
888 			return (1);
889 		}
890 
891 		(void) strcpy(non_ff_drivers[0], s);
892 		non_ff_drivers[1] = NULL;
893 
894 	} else {
895 		int i;
896 		char **tnames;
897 		char **tmp;
898 
899 		for (i = 0; non_ff_drivers[i] != NULL; i++) {
900 			if (strcmp(s, non_ff_drivers[i]) == 0) {
901 				mutex_exit(&non_ff_drv_mutex);
902 				return (0);
903 			}
904 		}
905 
906 		/* allow for new element and null */
907 		i += 2;
908 		tnames = (char **)kmem_alloc(i * sizeof (char *), KM_NOSLEEP);
909 		if (tnames == NULL) {
910 			mutex_exit(&non_ff_drv_mutex);
911 			return (1);
912 		}
913 
914 		for (i = 0; non_ff_drivers[i] != NULL; i++)
915 			tnames[i] = non_ff_drivers[i];
916 
917 		tnames[i] = (char *)kmem_alloc(strlen(s) + 1, KM_NOSLEEP);
918 		if (tnames[i] == NULL) {
919 			/* adjust i so that it is the right count to free */
920 			kmem_free(tnames, (i + 2) * sizeof (char *));
921 			mutex_exit(&non_ff_drv_mutex);
922 			return (1);
923 		}
924 
925 		(void) strcpy(tnames[i++], s);
926 		tnames[i] = NULL;
927 
928 		tmp = non_ff_drivers;
929 		non_ff_drivers = tnames;
930 		/* i now represents the count we previously alloced */
931 		kmem_free(tmp, i * sizeof (char *));
932 	}
933 	mutex_exit(&non_ff_drv_mutex);
934 
935 	return (1);
936 }
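/*
 * Illustrative example (driver names are hypothetical): the first call, say
 * new_non_ff_driver("xdsk"), creates the NULL-terminated array
 * { "xdsk", NULL } and returns 1 so the caller logs one message.  A later
 * call with the same name finds it in the array and returns 0, while a call
 * with a new name, say "ydsk", grows the array to { "xdsk", "ydsk", NULL }
 * and returns 1.
 */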
937 
938 /*
939  * Check for the "ddi-failfast-supported" devtree property on each submirror
940  * component to indicate if we should do I/O to that submirror with the
941  * B_FAILFAST flag set or not.  This check is made at various state transitions
942  * in the mirror code (e.g. open, enable, hotspare, etc.).  Sometimes we
943  * only need to check one drive (e.g. hotspare) but since the check is
944  * fast and infrequent and sometimes needs to be done on all components we
945  * just check all components on each call.
946  */
947 void
948 mirror_check_failfast(minor_t mnum)
949 {
950 	int		i;
951 	mm_unit_t	*un;
952 
953 	if (md_ff_disable)
954 		return;
955 
956 	un = MD_UNIT(mnum);
957 
958 	for (i = 0; i < NMIRROR; i++) {
959 		int			ci;
960 		int			cnt;
961 		int			ff = 1;
962 		mm_submirror_t		*sm;
963 		mm_submirror_ic_t	*smic;
964 		void			(*get_dev)();
965 
966 		if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
967 			continue;
968 
969 		sm = &un->un_sm[i];
970 		smic = &un->un_smic[i];
971 
972 		get_dev = (void (*)())md_get_named_service(sm->sm_dev, 0,
973 		    "get device", 0);
974 
975 		cnt = (*(smic->sm_get_component_count))(sm->sm_dev, sm);
976 		for (ci = 0; ci < cnt; ci++) {
977 			int		found = 0;
978 			dev_t		ci_dev;
979 			major_t		major;
980 			dev_info_t	*devi;
981 			ms_cd_info_t	cd;
982 
983 			/*
984 			 * this already returns the hs
985 			 * dev if the device is spared
986 			 */
987 			(void) (*get_dev)(sm->sm_dev, sm, ci, &cd);
988 
989 			ci_dev = md_dev64_to_dev(cd.cd_dev);
990 			major = getmajor(ci_dev);
991 
992 			if (major == md_major) {
993 				/*
994 				 * this component must be a soft
995 				 * partition; get the real dev
996 				 */
997 				minor_t	dev_mnum;
998 				mdi_unit_t	*ui;
999 				mp_unit_t	*un;
1000 				set_t	setno;
1001 				side_t	side;
1002 				md_dev64_t	tmpdev;
1003 
1004 				ui = MDI_UNIT(getminor(ci_dev));
1005 
1006 				/* grab necessary lock */
1007 				un = (mp_unit_t *)md_unit_readerlock(ui);
1008 
1009 				dev_mnum = MD_SID(un);
1010 				setno = MD_MIN2SET(dev_mnum);
1011 				side = mddb_getsidenum(setno);
1012 
1013 				tmpdev = un->un_dev;
1014 
1015 				/* Get dev by device id */
1016 				if (md_devid_found(setno, side,
1017 				    un->un_key) == 1) {
1018 					tmpdev = md_resolve_bydevid(dev_mnum,
1019 					    tmpdev, un->un_key);
1020 				}
1021 
1022 				md_unit_readerexit(ui);
1023 
1024 				ci_dev = md_dev64_to_dev(tmpdev);
1025 				major = getmajor(ci_dev);
1026 			}
1027 
1028 			if (ci_dev != NODEV32 &&
1029 			    (devi = e_ddi_hold_devi_by_dev(ci_dev, 0))
1030 			    != NULL) {
1031 				ddi_prop_op_t	prop_op = PROP_LEN_AND_VAL_BUF;
1032 				int		propvalue = 0;
1033 				int		proplength = sizeof (int);
1034 				int		error;
1035 				struct cb_ops	*cb;
1036 
1037 				if ((cb = devopsp[major]->devo_cb_ops) !=
1038 				    NULL) {
1039 					error = (*cb->cb_prop_op)
1040 					    (DDI_DEV_T_ANY, devi, prop_op,
1041 					    DDI_PROP_NOTPROM|DDI_PROP_DONTPASS,
1042 					    "ddi-failfast-supported",
1043 					    (caddr_t)&propvalue, &proplength);
1044 
1045 					if (error == DDI_PROP_SUCCESS)
1046 						found = 1;
1047 				}
1048 
1049 				if (!found && new_non_ff_driver(
1050 				    ddi_driver_name(devi))) {
1051 					cmn_err(CE_NOTE, "!md: B_FAILFAST I/O "
1052 					    "disabled on %s",
1053 					    ddi_driver_name(devi));
1054 				}
1055 
1056 				ddi_release_devi(devi);
1057 			}
1058 
1059 			/*
1060 			 * All components must support
1061 			 * failfast in the submirror.
1062 			 */
1063 			if (!found) {
1064 				ff = 0;
1065 				break;
1066 			}
1067 		}
1068 
1069 		if (ff) {
1070 			sm->sm_flags |= MD_SM_FAILFAST;
1071 		} else {
1072 			sm->sm_flags &= ~MD_SM_FAILFAST;
1073 		}
1074 	}
1075 }
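/*
 * Illustrative note: the MD_SM_FAILFAST flag set here is consumed by the read
 * path, e.g. select_read_unit() and fast_select_read_unit() below, which add
 * B_FAILFAST to the child buf so the underlying driver can fail the I/O
 * quickly rather than retrying at length:
 *
 *	if (un->un_sm[i].sm_flags & MD_SM_FAILFAST && cs != NULL)
 *		cs->cs_buf.b_flags |= B_FAILFAST;
 */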
1076 
1077 /*
1078  * Return true if the submirror is unavailable.
1079  * If any of the submirror components are opened then the submirror cannot
1080  * be unavailable (MD_INACCESSIBLE).
1081  * If any of the components are already in the errored state, then the submirror
1082  * cannot be unavailable (MD_INACCESSIBLE).
1083  */
1084 static bool_t
1085 submirror_unavailable(mm_unit_t *un, int smi, int from_probe)
1086 {
1087 	mm_submirror_t		*sm;
1088 	mm_submirror_ic_t	*smic;
1089 	md_m_shared_t		*shared;
1090 	int			ci;
1091 	int			compcnt;
1092 
1093 	sm = &un->un_sm[smi];
1094 	smic = &un->un_smic[smi];
1095 
1096 	compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, un);
1097 	for (ci = 0; ci < compcnt; ci++) {
1098 		shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
1099 		    (sm->sm_dev, sm, ci);
1100 		if (from_probe) {
1101 			if (shared->ms_flags & MDM_S_PROBEOPEN)
1102 				return (B_FALSE);
1103 		} else {
1104 			if (shared->ms_flags & MDM_S_ISOPEN)
1105 				return (B_FALSE);
1106 		}
1107 		if (shared->ms_state == CS_ERRED ||
1108 		    shared->ms_state == CS_LAST_ERRED)
1109 			return (B_FALSE);
1110 	}
1111 
1112 	return (B_TRUE);
1113 }
1114 
1115 static int
1116 mirror_open_all_devs(minor_t mnum, int md_oflags, IOLOCK *lockp)
1117 {
1118 	int		i;
1119 	mm_unit_t	*un;
1120 	mdi_unit_t	*ui;
1121 	int		err;
1122 	int		smi;
1123 	int		ci;
1124 	err_comp_t	*c;
1125 	err_comp_t	*ecomps = NULL;
1126 	int		smmask = 0;
1127 	set_t		setno;
1128 	int		sm_cnt;
1129 	int		sm_unavail_cnt;
1130 
1131 	mirror_check_failfast(mnum);
1132 
1133 	un = MD_UNIT(mnum);
1134 	ui = MDI_UNIT(mnum);
1135 	setno = MD_UN2SET(un);
1136 
1137 	for (i = 0; i < NMIRROR; i++) {
1138 		md_dev64_t tmpdev = un->un_sm[i].sm_dev;
1139 
1140 		if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
1141 			continue;
1142 		if (md_layered_open(mnum, &tmpdev, md_oflags))
1143 			smmask |= SMI2BIT(i);
1144 		un->un_sm[i].sm_dev = tmpdev;
1145 	}
1146 
1147 	/*
1148 	 * If smmask is clear, all submirrors are accessible. Clear the
1149 	 * MD_INACCESSIBLE bit in this case.  This bit is also cleared for the
1150 	 * mirror device.   If smmask is set, we have to determine which of the
1151 	 * submirrors are in error. If no submirror is accessible we mark the
1152 	 * whole mirror as MD_INACCESSIBLE.
1153 	 */
1154 	if (smmask == 0) {
1155 		if (lockp) {
1156 			md_ioctl_readerexit(lockp);
1157 			(void) md_ioctl_writerlock(lockp, ui);
1158 		} else {
1159 			md_unit_readerexit(ui);
1160 			(void) md_unit_writerlock(ui);
1161 		}
1162 		ui->ui_tstate &= ~MD_INACCESSIBLE;
1163 		if (lockp) {
1164 			md_ioctl_writerexit(lockp);
1165 			(void) md_ioctl_readerlock(lockp, ui);
1166 		} else {
1167 			md_unit_writerexit(ui);
1168 			(void) md_unit_readerlock(ui);
1169 		}
1170 
1171 		for (i = 0; i < NMIRROR; i++) {
1172 			md_dev64_t	tmpdev;
1173 			mdi_unit_t	*sm_ui;
1174 
1175 			if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
1176 				continue;
1177 
1178 			tmpdev = un->un_sm[i].sm_dev;
1179 			sm_ui = MDI_UNIT(getminor(md_dev64_to_dev(tmpdev)));
1180 			(void) md_unit_writerlock(sm_ui);
1181 			sm_ui->ui_tstate &= ~MD_INACCESSIBLE;
1182 			md_unit_writerexit(sm_ui);
1183 		}
1184 
1185 		return (0);
1186 	}
1187 
1188 	for (i = 0; i < NMIRROR; i++) {
1189 		md_dev64_t tmpdev;
1190 
1191 		if (!(smmask & SMI2BIT(i)))
1192 			continue;
1193 
1194 		tmpdev = un->un_sm[i].sm_dev;
1195 		err = md_layered_open(mnum, &tmpdev, MD_OFLG_CONT_ERRS);
1196 		un->un_sm[i].sm_dev = tmpdev;
1197 		ASSERT(err == 0);
1198 	}
1199 
1200 	if (lockp) {
1201 		md_ioctl_readerexit(lockp);
1202 		un = (mm_unit_t *)md_ioctl_writerlock(lockp, ui);
1203 	} else {
1204 		md_unit_readerexit(ui);
1205 		un = (mm_unit_t *)md_unit_writerlock(ui);
1206 	}
1207 
1208 	/*
1209 	 * We want to make sure the unavailable flag is not masking a real
1210 	 * error on the submirror.
1211 	 * For each submirror,
1212 	 *    if all of the submirror components couldn't be opened and there
1213 	 *    are no errors on the submirror, then set the unavailable flag
1214 	 *    otherwise, clear unavailable.
1215 	 */
1216 	sm_cnt = 0;
1217 	sm_unavail_cnt = 0;
1218 	for (i = 0; i < NMIRROR; i++) {
1219 		md_dev64_t	tmpdev;
1220 		mdi_unit_t	*sm_ui;
1221 
1222 		if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
1223 			continue;
1224 
1225 		sm_cnt++;
1226 		tmpdev = un->un_sm[i].sm_dev;
1227 		sm_ui = MDI_UNIT(getminor(md_dev64_to_dev(tmpdev)));
1228 
1229 		(void) md_unit_writerlock(sm_ui);
1230 		if (submirror_unavailable(un, i, 0)) {
1231 			sm_ui->ui_tstate |= MD_INACCESSIBLE;
1232 			sm_unavail_cnt++;
1233 		} else {
1234 			sm_ui->ui_tstate &= ~MD_INACCESSIBLE;
1235 		}
1236 		md_unit_writerexit(sm_ui);
1237 	}
1238 
1239 	/*
1240 	 * If all of the submirrors are unavailable, the mirror is also
1241 	 * unavailable.
1242 	 */
1243 	if (sm_cnt == sm_unavail_cnt) {
1244 		ui->ui_tstate |= MD_INACCESSIBLE;
1245 	} else {
1246 		ui->ui_tstate &= ~MD_INACCESSIBLE;
1247 	}
1248 
1249 	smi = 0;
1250 	ci = 0;
1251 	while (mirror_geterror(un, &smi, &ci, 1, 0) != 0) {
1252 		if (mirror_other_sources(un, smi, ci, 1) == 1) {
1253 
1254 			free_all_ecomps(ecomps);
1255 			(void) mirror_close_all_devs(un, md_oflags);
1256 			SE_NOTIFY(EC_SVM_STATE, ESC_SVM_OPEN_FAIL,
1257 			    SVM_TAG_METADEVICE, setno, MD_SID(un));
1258 			mirror_openfail_console_info(un, smi, ci);
1259 			if (lockp) {
1260 				md_ioctl_writerexit(lockp);
1261 				(void) md_ioctl_readerlock(lockp, ui);
1262 			} else {
1263 				md_unit_writerexit(ui);
1264 				(void) md_unit_readerlock(ui);
1265 			}
1266 			return (ENXIO);
1267 		}
1268 
1269 		/* track all component states that need changing */
1270 		c = (err_comp_t *)kmem_alloc(sizeof (err_comp_t), KM_SLEEP);
1271 		c->ec_next = ecomps;
1272 		c->ec_smi = smi;
1273 		c->ec_ci = ci;
1274 		ecomps = c;
1275 		ci++;
1276 	}
1277 
1278 	/* Make all state changes and commit them */
1279 	for (c = ecomps; c != NULL; c = c->ec_next) {
1280 		/*
1281 		 * If lockp is set, then entering kernel through ioctl.
1282 		 * For a MN set, the only ioctl path is via a commd message
1283 		 * (ALLOCATE_HOTSPARE or *RESYNC* messages) that is already
1284 		 * being sent to each node.
1285 		 * In this case, set NO_XMIT so that set_sm_comp_state
1286 		 * won't attempt to send a message on a message.
1287 		 * won't attempt to send a message while one is already being handled.
1288 		 * In !MN sets, the xmit flag is ignored, so it doesn't matter
1289 		 * which flag is passed.
1290 		 */
1291 		if (lockp) {
1292 			set_sm_comp_state(un, c->ec_smi, c->ec_ci, CS_ERRED, 0,
1293 			    MD_STATE_NO_XMIT, lockp);
1294 		} else {
1295 			set_sm_comp_state(un, c->ec_smi, c->ec_ci, CS_ERRED, 0,
1296 			    (MD_STATE_XMIT | MD_STATE_OCHELD), lockp);
1297 		}
1298 		/*
1299 		 * For a MN set, the NOTIFY is done when the state change is
1300 		 * processed on each node
1301 		 */
1302 		if (!MD_MNSET_SETNO(setno)) {
1303 			SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED,
1304 			    SVM_TAG_METADEVICE, setno, MD_SID(un));
1305 		}
1306 	}
1307 
1308 	if (lockp) {
1309 		md_ioctl_writerexit(lockp);
1310 		(void) md_ioctl_readerlock(lockp, ui);
1311 	} else {
1312 		md_unit_writerexit(ui);
1313 		(void) md_unit_readerlock(ui);
1314 	}
1315 
1316 	free_all_ecomps(ecomps);
1317 
1318 	/* allocate hotspares for all errored components */
1319 	if (MD_MNSET_SETNO(setno)) {
1320 		/*
1321 		 * If we're called from an ioctl (lockp set) then we cannot
1322 		 * directly call send_poke_hotspares as this will block until
1323 		 * the message gets despatched to all nodes. If the cluster is
1324 		 * going through a reconfig cycle then the message will block
1325 		 * until the cycle is complete, and as we originate from a
1326 		 * service call from commd we will livelock.
1327 		 */
1328 		if (lockp == NULL) {
1329 			md_unit_readerexit(ui);
1330 			send_poke_hotspares(setno);
1331 			(void) md_unit_readerlock(ui);
1332 		}
1333 	} else {
1334 		(void) poke_hotspares();
1335 	}
1336 	return (0);
1337 }
1338 
1339 void
1340 mirror_overlap_tree_remove(md_mps_t *ps)
1341 {
1342 	mm_unit_t	*un;
1343 
1344 	if (panicstr)
1345 		return;
1346 
1347 	VERIFY(ps->ps_flags & MD_MPS_ON_OVERLAP);
1348 	un = ps->ps_un;
1349 
1350 	mutex_enter(&un->un_overlap_tree_mx);
1351 	avl_remove(&un->un_overlap_root, ps);
1352 	ps->ps_flags &= ~MD_MPS_ON_OVERLAP;
1353 	if (un->un_overlap_tree_flag != 0) {
1354 		un->un_overlap_tree_flag = 0;
1355 		cv_broadcast(&un->un_overlap_tree_cv);
1356 	}
1357 	mutex_exit(&un->un_overlap_tree_mx);
1358 }
1359 
1360 
1361 /*
1362  * wait_for_overlaps:
1363  * -----------------
1364  * Check that the given i/o request does not cause an overlap with already pending
1365  * i/o. If it does, block until the overlapped i/o completes.
1366  *
1367  * The flag argument has MD_OVERLAP_ALLOW_REPEAT set if it is ok for the parent
1368  * structure to be already in the overlap tree and MD_OVERLAP_NO_REPEAT if
1369  * it must not already be in the tree.
1370  */
1371 static void
1372 wait_for_overlaps(md_mps_t *ps, int flags)
1373 {
1374 	mm_unit_t	*un;
1375 	avl_index_t	where;
1376 	md_mps_t	*ps1;
1377 
1378 	if (panicstr)
1379 		return;
1380 
1381 	un = ps->ps_un;
1382 	mutex_enter(&un->un_overlap_tree_mx);
1383 	if ((flags & MD_OVERLAP_ALLOW_REPEAT) &&
1384 	    (ps->ps_flags & MD_MPS_ON_OVERLAP)) {
1385 		mutex_exit(&un->un_overlap_tree_mx);
1386 		return;
1387 	}
1388 
1389 	VERIFY(!(ps->ps_flags & MD_MPS_ON_OVERLAP));
1390 
1391 	do {
1392 		ps1 = avl_find(&un->un_overlap_root, ps, &where);
1393 		if (ps1 == NULL) {
1394 			/*
1395 			 * The candidate range does not overlap with any
1396 			 * range in the tree.  Insert it and be done.
1397 			 */
1398 			avl_insert(&un->un_overlap_root, ps, where);
1399 			ps->ps_flags |= MD_MPS_ON_OVERLAP;
1400 		} else {
1401 			/*
1402 			 * The candidate range would overlap.  Set the flag
1403 			 * indicating we need to be woken up, and sleep
1404 			 * until another thread removes a range.  If upon
1405 			 * waking up we find this mps was put on the tree
1406 			 * by another thread, the loop terminates.
1407 			 */
1408 			un->un_overlap_tree_flag = 1;
1409 			cv_wait(&un->un_overlap_tree_cv,
1410 			    &un->un_overlap_tree_mx);
1411 		}
1412 	} while (!(ps->ps_flags & MD_MPS_ON_OVERLAP));
1413 	mutex_exit(&un->un_overlap_tree_mx);
1414 }
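/*
 * Illustrative pairing sketch (assumed typical usage, not copied verbatim
 * from this file): a writer builds its parent save structure with
 * ps_firstblk and ps_lastblk set, blocks until no in-flight request
 * overlaps, issues the I/O, and removes itself from the tree on completion:
 *
 *	wait_for_overlaps(ps, MD_OVERLAP_NO_REPEAT);
 *	... issue the mirrored write for [ps_firstblk, ps_lastblk] ...
 *	mirror_overlap_tree_remove(ps);
 */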
1415 
1416 /*
1417  * This function is called from mirror_done to check whether any pages have
1418  * been modified while a mirrored write was in progress.  Returns 0 if
1419  * all pages associated with bp are clean, 1 otherwise.
1420  */
1421 static int
1422 any_pages_dirty(struct buf *bp)
1423 {
1424 	int	rval;
1425 
1426 	rval = biomodified(bp);
1427 	if (rval == -1)
1428 		rval = 0;
1429 
1430 	return (rval);
1431 }
1432 
1433 #define	MAX_EXTRAS 10
1434 
1435 void
1436 mirror_commit(
1437 	mm_unit_t	*un,
1438 	int		smmask,
1439 	mddb_recid_t	*extras
1440 )
1441 {
1442 	mm_submirror_t		*sm;
1443 	md_unit_t		*su;
1444 	int			i;
1445 
1446 	/* 2=mirror,null id */
1447 	mddb_recid_t		recids[NMIRROR+2+MAX_EXTRAS];
1448 
1449 	int			ri = 0;
1450 
1451 	if (md_get_setstatus(MD_UN2SET(un)) & MD_SET_STALE)
1452 		return;
1453 
1454 	/* Add two, this includes the mirror unit and the null recid */
1455 	if (extras != NULL) {
1456 		int	nrecids = 0;
1457 		while (extras[nrecids] != 0) {
1458 			nrecids++;
1459 		}
1460 		ASSERT(nrecids <= MAX_EXTRAS);
1461 	}
1462 
1463 	if (un != NULL)
1464 		recids[ri++] = un->c.un_record_id;
1465 	for (i = 0;  i < NMIRROR; i++) {
1466 		if (!(smmask & SMI2BIT(i)))
1467 			continue;
1468 		sm = &un->un_sm[i];
1469 		if (!SMS_IS(sm, SMS_INUSE))
1470 			continue;
1471 		if (md_getmajor(sm->sm_dev) != md_major)
1472 			continue;
1473 		su =  MD_UNIT(md_getminor(sm->sm_dev));
1474 		recids[ri++] = su->c.un_record_id;
1475 	}
1476 
1477 	if (extras != NULL)
1478 		while (*extras != 0) {
1479 			recids[ri++] = *extras;
1480 			extras++;
1481 		}
1482 
1483 	if (ri == 0)
1484 		return;
1485 	recids[ri] = 0;
1486 
1487 	/*
1488 	 * Ok to hold ioctl lock across record commit to mddb as
1489 	 * long as the record(s) being committed aren't resync records.
1490 	 */
1491 	mddb_commitrecs_wrapper(recids);
1492 }
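/*
 * Illustrative layout of the recids[] array built above, for a two-way
 * mirror whose submirrors are both metadevices, committed with smmask
 * covering both submirrors and one extra record (record ids shown as
 * placeholders):
 *
 *	recids[] = { <mirror un_record_id>,
 *		     <submirror 0 un_record_id>,
 *		     <submirror 1 un_record_id>,
 *		     <extra recid>,
 *		     0 }		zero terminator
 *
 * mddb_commitrecs_wrapper() then commits the whole zero-terminated list.
 */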
1493 
1494 
1495 /*
1496  * This routine sets a bit in the writable_bm bitmap for each submirror
1497  * in a metamirror that is writable. The index of the first writable
1498  * submirror is assigned to sm_index.  The number of writable submirrors
1499  * is returned in nunits.
1500  *
1501  * This routine returns the submirror's unit number.
1502  */
1503 
1504 static void
1505 select_write_units(struct mm_unit *un, md_mps_t *ps)
1506 {
1507 
1508 	int		i;
1509 	unsigned	writable_bm = 0;
1510 	unsigned	nunits = 0;
1511 
1512 	for (i = 0; i < NMIRROR; i++) {
1513 		if (SUBMIRROR_IS_WRITEABLE(un, i)) {
1514 			/* set bit of all writable units */
1515 			writable_bm |= SMI2BIT(i);
1516 			nunits++;
1517 		}
1518 	}
1519 	ps->ps_writable_sm = writable_bm;
1520 	ps->ps_active_cnt = nunits;
1521 	ps->ps_current_sm = 0;
1522 }
1523 
1524 static
1525 unsigned
1526 select_write_after_read_units(struct mm_unit *un, md_mps_t *ps)
1527 {
1528 
1529 	int		i;
1530 	unsigned	writable_bm = 0;
1531 	unsigned	nunits = 0;
1532 
1533 	for (i = 0; i < NMIRROR; i++) {
1534 		if (SUBMIRROR_IS_WRITEABLE(un, i) &&
1535 		    un->un_sm[i].sm_flags & MD_SM_RESYNC_TARGET) {
1536 			writable_bm |= SMI2BIT(i);
1537 			nunits++;
1538 		}
1539 	}
1540 	if ((writable_bm & ps->ps_allfrom_sm) != 0) {
1541 		writable_bm &= ~ps->ps_allfrom_sm;
1542 		nunits--;
1543 	}
1544 	ps->ps_writable_sm = writable_bm;
1545 	ps->ps_active_cnt = nunits;
1546 	ps->ps_current_sm = 0;
1547 	return (nunits);
1548 }
1549 
1550 static md_dev64_t
1551 select_read_unit(
1552 	mm_unit_t	*un,
1553 	diskaddr_t	blkno,
1554 	u_longlong_t	reqcount,
1555 	u_longlong_t	*cando,
1556 	int		must_be_opened,
1557 	md_m_shared_t	**shared,
1558 	md_mcs_t	*cs)
1559 {
1560 	int			i;
1561 	md_m_shared_t		*s;
1562 	uint_t			lasterrcnt = 0;
1563 	md_dev64_t		dev = 0;
1564 	u_longlong_t		cnt;
1565 	u_longlong_t		mincnt;
1566 	mm_submirror_t		*sm;
1567 	mm_submirror_ic_t	*smic;
1568 	mdi_unit_t		*ui;
1569 
1570 	mincnt = reqcount;
1571 	for (i = 0; i < NMIRROR; i++) {
1572 		if (!SUBMIRROR_IS_READABLE(un, i))
1573 			continue;
1574 		sm = &un->un_sm[i];
1575 		smic = &un->un_smic[i];
1576 		cnt = reqcount;
1577 
1578 		/*
1579 		 * If the current submirror is marked as inaccessible, do not
1580 		 * try to access it.
1581 		 */
1582 		ui = MDI_UNIT(getminor(expldev(sm->sm_dev)));
1583 		(void) md_unit_readerlock(ui);
1584 		if (ui->ui_tstate & MD_INACCESSIBLE) {
1585 			md_unit_readerexit(ui);
1586 			continue;
1587 		}
1588 		md_unit_readerexit(ui);
1589 
1590 		s = (md_m_shared_t *)(*(smic->sm_shared_by_blk))
1591 		    (sm->sm_dev, sm, blkno, &cnt);
1592 
1593 		if (must_be_opened && !(s->ms_flags & MDM_S_ISOPEN))
1594 			continue;
1595 		if (s->ms_state == CS_OKAY) {
1596 			*cando = cnt;
1597 			if (shared != NULL)
1598 				*shared = s;
1599 
1600 			if (un->un_sm[i].sm_flags & MD_SM_FAILFAST &&
1601 			    cs != NULL) {
1602 				cs->cs_buf.b_flags |= B_FAILFAST;
1603 			}
1604 
1605 			return (un->un_sm[i].sm_dev);
1606 		}
1607 		if (s->ms_state != CS_LAST_ERRED)
1608 			continue;
1609 
1610 		/* don't use B_FAILFAST since we're Last Erred */
1611 
1612 		if (mincnt > cnt)
1613 			mincnt = cnt;
1614 		if (s->ms_lasterrcnt > lasterrcnt) {
1615 			lasterrcnt = s->ms_lasterrcnt;
1616 			if (shared != NULL)
1617 				*shared = s;
1618 			dev = un->un_sm[i].sm_dev;
1619 		}
1620 	}
1621 	*cando = mincnt;
1622 	return (dev);
1623 }
1624 
1625 /*
1626  * Given a 32-bit bitmap, this routine will return the bit number
1627  * of the nth bit set.	The nth bit set is passed via the index integer.
1628  *
1629  * This routine is used to run through the writable submirror bitmap
1630  * and start all of the writes.  The value returned is the
1631  * index of the appropriate submirror structure in the un_sm
1632  * array for metamirrors.
1633  */
1634 static int
1635 md_find_nth_unit(uint_t mask, int index)
1636 {
1637 	int	bit, nfound;
1638 
1639 	for (bit = -1, nfound = -1; nfound != index; bit++) {
1640 		ASSERT(mask != 0);
1641 		nfound += (mask & 1);
1642 		mask >>= 1;
1643 	}
1644 	return (bit);
1645 }
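/*
 * Worked example: for mask = 0xb (binary 1011, bits 0, 1 and 3 set) and
 * index = 2, the loop counts set bits until it has seen the third one
 * (nfound == 2) and returns bit number 3.
 */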
1646 
1647 static int
1648 fast_select_read_unit(md_mps_t *ps, md_mcs_t *cs)
1649 {
1650 	mm_unit_t	*un;
1651 	buf_t		*bp;
1652 	int		i;
1653 	unsigned	nunits = 0;
1654 	int		iunit;
1655 	uint_t		running_bm = 0;
1656 	uint_t		sm_index;
1657 
1658 	bp = &cs->cs_buf;
1659 	un = ps->ps_un;
1660 
1661 	for (i = 0; i < NMIRROR; i++) {
1662 		if (!SMS_BY_INDEX_IS(un, i, SMS_RUNNING))
1663 			continue;
1664 		running_bm |= SMI2BIT(i);
1665 		nunits++;
1666 	}
1667 	if (nunits == 0)
1668 		return (1);
1669 
1670 	/*
1671 	 * For directed mirror read (DMR) we only use the specified side and
1672 	 * do not compute the source of the read.
1673 	 * If we're running with MD_MPS_DIRTY_RD set we always return the
1674 	 * first mirror side (this prevents unnecessary ownership switching).
1675 	 * Otherwise we return the submirror according to the mirror read option.
1676 	 */
1677 	if (ps->ps_flags & MD_MPS_DMR) {
1678 		sm_index = un->un_dmr_last_read;
1679 	} else if (ps->ps_flags & MD_MPS_DIRTY_RD) {
1680 		sm_index = md_find_nth_unit(running_bm, 0);
1681 	} else {
1682 		/* Normal (non-DMR) operation */
1683 		switch (un->un_read_option) {
1684 		case RD_GEOMETRY:
1685 			iunit = (int)(bp->b_lblkno /
1686 			    howmany(un->c.un_total_blocks, nunits));
1687 			sm_index = md_find_nth_unit(running_bm, iunit);
1688 			break;
1689 		case RD_FIRST:
1690 			sm_index = md_find_nth_unit(running_bm, 0);
1691 			break;
1692 		case RD_LOAD_BAL:
1693 			/* this is intentional to fall into the default */
1694 		default:
1695 			un->un_last_read = (un->un_last_read + 1) % nunits;
1696 			sm_index = md_find_nth_unit(running_bm,
1697 			    un->un_last_read);
1698 			break;
1699 		}
1700 	}
1701 	bp->b_edev = md_dev64_to_dev(un->un_sm[sm_index].sm_dev);
1702 	ps->ps_allfrom_sm = SMI2BIT(sm_index);
1703 
1704 	if (un->un_sm[sm_index].sm_flags & MD_SM_FAILFAST) {
1705 		bp->b_flags |= B_FAILFAST;
1706 	}
1707 
1708 	return (0);
1709 }
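/*
 * Worked example for the RD_GEOMETRY case (hypothetical numbers): with
 * nunits == 2 running submirrors and un->c.un_total_blocks == 1000,
 * howmany(1000, 2) is 500, so a read at b_lblkno 0..499 maps to iunit 0
 * (the first running submirror) and b_lblkno 500..999 maps to iunit 1.
 */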
1710 
1711 static
1712 int
1713 mirror_are_submirrors_available(mm_unit_t *un)
1714 {
1715 	int i;
1716 	for (i = 0; i < NMIRROR; i++) {
1717 		md_dev64_t tmpdev = un->un_sm[i].sm_dev;
1718 
1719 		if ((!SMS_BY_INDEX_IS(un, i, SMS_INUSE)) ||
1720 		    md_getmajor(tmpdev) != md_major)
1721 			continue;
1722 
1723 		if ((MD_MIN2SET(md_getminor(tmpdev)) >= md_nsets) ||
1724 		    (MD_MIN2UNIT(md_getminor(tmpdev)) >= md_nunits))
1725 			return (0);
1726 
1727 		if (MDI_UNIT(md_getminor(tmpdev)) == NULL)
1728 			return (0);
1729 	}
1730 	return (1);
1731 }
1732 
1733 void
1734 build_submirror(mm_unit_t *un, int i, int snarfing)
1735 {
1736 	struct mm_submirror	*sm;
1737 	struct mm_submirror_ic	*smic;
1738 	md_unit_t		*su;
1739 	set_t			setno;
1740 
1741 	sm = &un->un_sm[i];
1742 	smic = &un->un_smic[i];
1743 
1744 	sm->sm_flags = 0; /* sometime we may need to do more here */
1745 
1746 	setno = MD_UN2SET(un);
1747 
1748 	if (!SMS_IS(sm, SMS_INUSE))
1749 		return;
1750 	if (snarfing) {
1751 		sm->sm_dev = md_getdevnum(setno, mddb_getsidenum(setno),
1752 		    sm->sm_key, MD_NOTRUST_DEVT);
1753 	} else {
1754 		if (md_getmajor(sm->sm_dev) == md_major) {
1755 			su = MD_UNIT(md_getminor(sm->sm_dev));
1756 			un->c.un_flag |= (su->c.un_flag & MD_LABELED);
1757 			/* submirror can no longer be soft partitioned */
1758 			MD_CAPAB(su) &= (~MD_CAN_SP);
1759 		}
1760 	}
1761 	smic->sm_shared_by_blk = md_get_named_service(sm->sm_dev,
1762 	    0, "shared by blk", 0);
1763 	smic->sm_shared_by_indx = md_get_named_service(sm->sm_dev,
1764 	    0, "shared by indx", 0);
1765 	smic->sm_get_component_count = (int (*)())md_get_named_service(
1766 	    sm->sm_dev, 0, "get component count", 0);
1767 	smic->sm_get_bcss = (int (*)())md_get_named_service(sm->sm_dev, 0,
1768 	    "get block count skip size", 0);
1769 	sm->sm_state &= ~SMS_IGNORE;
1770 	if (SMS_IS(sm, SMS_OFFLINE))
1771 		MD_STATUS(un) |= MD_UN_OFFLINE_SM;
1772 	md_set_parent(sm->sm_dev, MD_SID(un));
1773 }
1774 
1775 static void
1776 mirror_cleanup(mm_unit_t *un)
1777 {
1778 	mddb_recid_t	recid;
1779 	int		smi;
1780 	sv_dev_t	sv[NMIRROR];
1781 	int		nsv = 0;
1782 
1783 	/*
1784 	 * If a MN diskset and this node is not the master, do
1785 	 * not delete any records on snarf of the mirror records.
1786 	 */
1787 	if (MD_MNSET_SETNO(MD_UN2SET(un)) &&
1788 	    md_set[MD_UN2SET(un)].s_am_i_master == 0) {
1789 		return;
1790 	}
1791 
1792 	for (smi = 0; smi < NMIRROR; smi++) {
1793 		if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE))
1794 			continue;
1795 		sv[nsv].setno = MD_UN2SET(un);
1796 		sv[nsv++].key = un->un_sm[smi].sm_key;
1797 	}
1798 
1799 	recid = un->un_rr_dirty_recid;
1800 	mddb_deleterec_wrapper(un->c.un_record_id);
1801 	if (recid > 0)
1802 		mddb_deleterec_wrapper(recid);
1803 
1804 	md_rem_names(sv, nsv);
1805 }
1806 
1807 /*
1808  * Comparison function for the avl tree which tracks
1809  * outstanding writes on submirrors.
1810  *
1811  * Returns:
1812  *	-1: ps1 < ps2
1813  *	 0: ps1 and ps2 overlap
1814  *	 1: ps1 > ps2
1815  */
1816 static int
1817 mirror_overlap_compare(const void *p1, const void *p2)
1818 {
1819 	const md_mps_t *ps1 = (md_mps_t *)p1;
1820 	const md_mps_t *ps2 = (md_mps_t *)p2;
1821 
1822 	if (ps1->ps_firstblk < ps2->ps_firstblk) {
1823 		if (ps1->ps_lastblk >= ps2->ps_firstblk)
1824 			return (0);
1825 		return (-1);
1826 	}
1827 
1828 	if (ps1->ps_firstblk > ps2->ps_firstblk) {
1829 		if (ps1->ps_firstblk <= ps2->ps_lastblk)
1830 			return (0);
1831 		return (1);
1832 	}
1833 
1834 	return (0);
1835 }
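/*
 * Worked example: a pending request covering blocks [100, 199] and a
 * candidate covering [150, 249] compare as 0 (overlap), so the candidate
 * must wait in wait_for_overlaps(); a candidate covering [200, 249] does
 * not overlap and simply sorts after the pending request.
 */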
1836 
1837 /*
1838  * Collapse any sparse submirror entries snarfed from the on-disk replica.
1839  * Only the in-core entries are updated. The replica will be updated on-disk
1840  * when the in-core replica is committed on shutdown of the SVM subsystem.
1841  */
1842 static void
1843 collapse_submirrors(mm_unit_t *un)
1844 {
1845 	int			smi, nremovals, smiremove;
1846 	mm_submirror_t		*sm, *new_sm, *old_sm;
1847 	mm_submirror_ic_t	*smic;
1848 	int			nsmidx = un->un_nsm - 1;
1849 
1850 rescan:
1851 	nremovals = 0;
1852 	smiremove = -1;
1853 
1854 	for (smi = 0; smi <= nsmidx; smi++) {
1855 		sm = &un->un_sm[smi];
1856 
1857 		/*
1858 		 * Check to see if this submirror is marked as in-use.
1859 		 * If it isn't then it is a potential sparse entry and
1860 		 * may need to be cleared from the configuration.
1861 		 * The records should _already_ have been cleared by the
1862 		 * original mirror_detach() code, but we need to shuffle
1863 		 * any NULL entries in un_sm[] to the end of the array.
1864 		 * Any NULL un_smic[] entries need to be reset to the underlying
1865 		 * submirror/slice accessor functions.
1866 		 */
1867 		if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE)) {
1868 			nremovals++;
1869 			smiremove = smi;
1870 			break;
1871 		}
1872 	}
1873 
1874 	if (nremovals == 0) {
1875 		/*
1876 		 * Ensure that we have a matching contiguous set of un_smic[]
1877 		 * entries for the corresponding un_sm[] entries
1878 		 */
1879 		for (smi = 0; smi <= nsmidx; smi++) {
1880 			smic = &un->un_smic[smi];
1881 			sm = &un->un_sm[smi];
1882 
1883 			smic->sm_shared_by_blk =
1884 			    md_get_named_service(sm->sm_dev, 0,
1885 			    "shared by blk", 0);
1886 			smic->sm_shared_by_indx =
1887 			    md_get_named_service(sm->sm_dev, 0,
1888 			    "shared by indx", 0);
1889 			smic->sm_get_component_count =
1890 			    (int (*)())md_get_named_service(sm->sm_dev, 0,
1891 			    "get component count", 0);
1892 			smic->sm_get_bcss =
1893 			    (int (*)())md_get_named_service(sm->sm_dev, 0,
1894 			    "get block count skip size", 0);
1895 		}
1896 		return;
1897 	}
1898 
1899 	/*
1900 	 * Reshuffle the submirror devices so that we do not have a dead record
1901 	 * in the middle of the array. Once we've done this we need to rescan
1902 	 * the mirror to check for any other holes.
1903 	 */
1904 	for (smi = 0; smi < NMIRROR; smi++) {
1905 		if (smi < smiremove)
1906 			continue;
1907 		if (smi > smiremove) {
1908 			old_sm = &un->un_sm[smi];
1909 			new_sm = &un->un_sm[smi - 1];
1910 			bcopy(old_sm, new_sm, sizeof (mm_submirror_t));
1911 			bzero(old_sm, sizeof (mm_submirror_t));
1912 		}
1913 	}
1914 
1915 	/*
1916 	 * Now we need to rescan the array to find the next potential dead
1917 	 * entry.
1918 	 */
1919 	goto rescan;
1920 }
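/*
 * Worked example (illustrative): with un_nsm == 2 and the two in-use
 * submirrors snarfed into un_sm[0] and un_sm[2], the first scan finds the
 * hole at index 1, shifts entries 2 and 3 down by one and rescans; the
 * second scan sees a contiguous un_sm[0..1] and re-fetches the un_smic[]
 * accessor functions so that both arrays line up again.
 */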
1921 
1922 /* Return a -1 if optimized record unavailable and set should be released */
1923 int
1924 mirror_build_incore(mm_unit_t *un, int snarfing)
1925 {
1926 	int		i;
1927 
1928 	if (MD_STATUS(un) & MD_UN_BEING_RESET) {
1929 		mddb_setrecprivate(un->c.un_record_id, MD_PRV_PENDCLEAN);
1930 		return (1);
1931 	}
1932 
1933 	if (mirror_are_submirrors_available(un) == 0)
1934 		return (1);
1935 
1936 	if (MD_UNIT(MD_SID(un)) != NULL)
1937 		return (0);
1938 
1939 	MD_STATUS(un) = 0;
1940 
1941 	/* pre-4.1 didn't define CAN_META_CHILD capability */
1942 	MD_CAPAB(un) = MD_CAN_META_CHILD | MD_CAN_PARENT | MD_CAN_SP;
1943 
1944 	un->un_overlap_tree_flag = 0;
1945 	avl_create(&un->un_overlap_root, mirror_overlap_compare,
1946 	    sizeof (md_mps_t), offsetof(md_mps_t, ps_overlap_node));
1947 
1948 	/*
1949 	 * We need to collapse any sparse submirror entries into a non-sparse
1950 	 * array. This is to cover the case where we have an old replica image
1951 	 * which has not been updated (i.e. snarfed) since being modified.
1952 	 * The new code expects all submirror access to be sequential (i.e.
1953 	 * both the un_sm[] and un_smic[] entries correspond to non-empty
1954 	 * submirrors).
1955 	 */
1956 
1957 	collapse_submirrors(un);
1958 
1959 	for (i = 0; i < NMIRROR; i++)
1960 		build_submirror(un, i, snarfing);
1961 
1962 	if (unit_setup_resync(un, snarfing) != 0) {
1963 		if (snarfing) {
1964 			mddb_setrecprivate(un->c.un_record_id, MD_PRV_GOTIT);
1965 			/*
1966 			 * If a MN set and set is not stale, then return -1
1967 			 * which will force the caller to unload the set.
1968 			 * The MN diskset nodes will return failure if
1969 			 * unit_setup_resync fails so that nodes won't
1970 			 * get out of sync.
1971 			 *
1972 			 * If set is STALE, the master node can't allocate
1973 			 * a resync record (if needed), but node needs to
1974 			 * join the set so that user can delete broken mddbs.
1975 			 * So, if set is STALE, just continue on.
1976 			 */
1977 			if (MD_MNSET_SETNO(MD_UN2SET(un)) &&
1978 			    !(md_get_setstatus(MD_UN2SET(un)) & MD_SET_STALE)) {
1979 				return (-1);
1980 			}
1981 		} else
1982 			return (1);
1983 	}
1984 
1985 	mutex_init(&un->un_overlap_tree_mx, NULL, MUTEX_DEFAULT, NULL);
1986 	cv_init(&un->un_overlap_tree_cv, NULL, CV_DEFAULT, NULL);
1987 
1988 	un->un_suspend_wr_flag = 0;
1989 	mutex_init(&un->un_suspend_wr_mx, NULL, MUTEX_DEFAULT, NULL);
1990 	cv_init(&un->un_suspend_wr_cv, NULL, CV_DEFAULT, NULL);
1991 
1992 	/*
1993 	 * Allocate mutexes for mirror-owner and resync-owner changes.
1994 	 * All references to the owner message state field must be guarded
1995 	 * by this mutex.
1996 	 */
1997 	mutex_init(&un->un_owner_mx, NULL, MUTEX_DEFAULT, NULL);
1998 
1999 	/*
2000 	 * Allocate mutex and condvar for resync thread manipulation. These
2001 	 * will be used by mirror_resync_unit/mirror_ioctl_resync
2002 	 */
2003 	mutex_init(&un->un_rs_thread_mx, NULL, MUTEX_DEFAULT, NULL);
2004 	cv_init(&un->un_rs_thread_cv, NULL, CV_DEFAULT, NULL);
2005 
2006 	/*
2007 	 * Allocate mutex and condvar for resync progress thread manipulation.
2008 	 * This allows resyncs to be continued across an intervening reboot.
2009 	 */
2010 	mutex_init(&un->un_rs_progress_mx, NULL, MUTEX_DEFAULT, NULL);
2011 	cv_init(&un->un_rs_progress_cv, NULL, CV_DEFAULT, NULL);
2012 
2013 	/*
2014 	 * Allocate mutex and condvar for Directed Mirror Reads (DMR). This
2015 	 * provides synchronization between a user-ioctl and the resulting
2016 	 * strategy() call that performs the read().
2017 	 */
2018 	mutex_init(&un->un_dmr_mx, NULL, MUTEX_DEFAULT, NULL);
2019 	cv_init(&un->un_dmr_cv, NULL, CV_DEFAULT, NULL);
2020 
2021 	/*
2022 	 * Allocate rwlocks for un_pernode_dirty_bm accessing.
2023 	 */
2024 	for (i = 0; i < MD_MNMAXSIDES; i++) {
2025 		rw_init(&un->un_pernode_dirty_mx[i], NULL, RW_DEFAULT, NULL);
2026 	}
2027 
2028 	/* place various information in the in-core data structures */
2029 	md_nblocks_set(MD_SID(un), un->c.un_total_blocks);
2030 	MD_UNIT(MD_SID(un)) = un;
2031 
2032 	return (0);
2033 }
2034 
2035 
2036 void
2037 reset_mirror(struct mm_unit *un, minor_t mnum, int removing)
2038 {
2039 	mddb_recid_t	recid, vtoc_id;
2040 	size_t		bitcnt;
2041 	size_t		shortcnt;
2042 	int		smi;
2043 	sv_dev_t	sv[NMIRROR];
2044 	int		nsv = 0;
2045 	uint_t		bits = 0;
2046 	minor_t		selfid;
2047 	md_unit_t	*su;
2048 	int		i;
2049 
2050 	md_destroy_unit_incore(mnum, &mirror_md_ops);
2051 
2052 	shortcnt = un->un_rrd_num * sizeof (short);
2053 	bitcnt = howmany(un->un_rrd_num, NBBY);
2054 
2055 	if (un->un_outstanding_writes)
2056 		kmem_free((caddr_t)un->un_outstanding_writes, shortcnt);
2057 	if (un->un_goingclean_bm)
2058 		kmem_free((caddr_t)un->un_goingclean_bm, bitcnt);
2059 	if (un->un_goingdirty_bm)
2060 		kmem_free((caddr_t)un->un_goingdirty_bm, bitcnt);
2061 	if (un->un_resync_bm)
2062 		kmem_free((caddr_t)un->un_resync_bm, bitcnt);
2063 	if (un->un_pernode_dirty_sum)
2064 		kmem_free((caddr_t)un->un_pernode_dirty_sum, un->un_rrd_num);
2065 
2066 	/*
2067 	 * Destroy the taskq for deferred processing of DRL clean requests.
2068 	 * This taskq will only be present for Multi Owner mirrors.
2069 	 */
2070 	if (un->un_drl_task != NULL)
2071 		ddi_taskq_destroy(un->un_drl_task);
2072 
2073 	md_nblocks_set(mnum, -1ULL);
2074 	MD_UNIT(mnum) = NULL;
2075 
2076 	/*
2077 	 * Attempt release of its minor node
2078 	 */
2079 	md_remove_minor_node(mnum);
2080 
2081 	if (!removing)
2082 		return;
2083 
2084 	for (smi = 0; smi < NMIRROR; smi++) {
2085 		if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE))
2086 			continue;
2087 		/* reallow soft partitioning of submirror and reset parent */
2088 		su = MD_UNIT(md_getminor(un->un_sm[smi].sm_dev));
2089 		MD_CAPAB(su) |= MD_CAN_SP;
2090 		md_reset_parent(un->un_sm[smi].sm_dev);
2091 		reset_comp_states(&un->un_sm[smi], &un->un_smic[smi]);
2092 
2093 		sv[nsv].setno = MD_MIN2SET(mnum);
2094 		sv[nsv++].key = un->un_sm[smi].sm_key;
2095 		bits |= SMI2BIT(smi);
2096 	}
2097 
2098 	MD_STATUS(un) |= MD_UN_BEING_RESET;
2099 	recid = un->un_rr_dirty_recid;
2100 	vtoc_id = un->c.un_vtoc_id;
2101 	selfid = MD_SID(un);
2102 
2103 	mirror_commit(un, bits, 0);
2104 
2105 	avl_destroy(&un->un_overlap_root);
2106 
2107 	/* Destroy all mutexes and condvars before returning. */
2108 	mutex_destroy(&un->un_suspend_wr_mx);
2109 	cv_destroy(&un->un_suspend_wr_cv);
2110 	mutex_destroy(&un->un_overlap_tree_mx);
2111 	cv_destroy(&un->un_overlap_tree_cv);
2112 	mutex_destroy(&un->un_owner_mx);
2113 	mutex_destroy(&un->un_rs_thread_mx);
2114 	cv_destroy(&un->un_rs_thread_cv);
2115 	mutex_destroy(&un->un_rs_progress_mx);
2116 	cv_destroy(&un->un_rs_progress_cv);
2117 	mutex_destroy(&un->un_dmr_mx);
2118 	cv_destroy(&un->un_dmr_cv);
2119 
2120 	for (i = 0; i < MD_MNMAXSIDES; i++) {
2121 		rw_destroy(&un->un_pernode_dirty_mx[i]);
2122 		if (un->un_pernode_dirty_bm[i])
2123 			kmem_free((caddr_t)un->un_pernode_dirty_bm[i], bitcnt);
2124 	}
2125 
2126 	/*
2127 	 * Remove self from the namespace
2128 	 */
2129 	if (un->c.un_revision & MD_FN_META_DEV) {
2130 		(void) md_rem_selfname(un->c.un_self_id);
2131 	}
2132 
2133 	/* This frees the unit structure. */
2134 	mddb_deleterec_wrapper(un->c.un_record_id);
2135 
2136 	if (recid != 0)
2137 		mddb_deleterec_wrapper(recid);
2138 
2139 	/* Remove the vtoc, if present */
2140 	if (vtoc_id)
2141 		mddb_deleterec_wrapper(vtoc_id);
2142 
2143 	md_rem_names(sv, nsv);
2144 
2145 	SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_DELETE, SVM_TAG_METADEVICE,
2146 	    MD_MIN2SET(selfid), selfid);
2147 }
2148 
2149 int
2150 mirror_internal_open(
2151 	minor_t		mnum,
2152 	int		flag,
2153 	int		otyp,
2154 	int		md_oflags,
2155 	IOLOCK		*lockp		/* can be NULL */
2156 )
2157 {
2158 	mdi_unit_t	*ui = MDI_UNIT(mnum);
2159 	int		err = 0;
2160 
2161 tryagain:
2162 	/* single thread */
2163 	if (lockp) {
2164 		/*
2165 		 * If ioctl lock is held, use openclose_enter
2166 		 * routine that will set the ioctl flag when
2167 		 * grabbing the readerlock.
2168 		 */
2169 		(void) md_ioctl_openclose_enter(lockp, ui);
2170 	} else {
2171 		(void) md_unit_openclose_enter(ui);
2172 	}
2173 
2174 	/*
2175 	 * The mirror_open_all_devs routine may end up sending a STATE_UPDATE
2176 	 * message in a MN diskset and this requires that the openclose
2177 	 * lock is dropped in order to send this message.  So, another
2178 	 * flag (MD_UL_OPENINPROGRESS) is used to keep another thread from
2179 	 * attempting an open while this thread has an open in progress.
2180 	 * Call the *_lh version of the lock exit routines since the ui_mx
2181 	 * mutex must be held from checking for OPENINPROGRESS until
2182 	 * after the cv_wait call.
2183 	 */
2184 	mutex_enter(&ui->ui_mx);
2185 	if (ui->ui_lock & MD_UL_OPENINPROGRESS) {
2186 		if (lockp) {
2187 			(void) md_ioctl_openclose_exit_lh(lockp);
2188 		} else {
2189 			md_unit_openclose_exit_lh(ui);
2190 		}
2191 		cv_wait(&ui->ui_cv, &ui->ui_mx);
2192 		mutex_exit(&ui->ui_mx);
2193 		goto tryagain;
2194 	}
2195 
2196 	ui->ui_lock |= MD_UL_OPENINPROGRESS;
2197 	mutex_exit(&ui->ui_mx);
2198 
2199 	/* open devices, if necessary */
2200 	if (! md_unit_isopen(ui) || (ui->ui_tstate & MD_INACCESSIBLE)) {
2201 		if ((err = mirror_open_all_devs(mnum, md_oflags, lockp)) != 0)
2202 			goto out;
2203 	}
2204 
2205 	/* count open */
2206 	if ((err = md_unit_incopen(mnum, flag, otyp)) != 0)
2207 		goto out;
2208 
2209 	/* unlock, return success */
2210 out:
2211 	mutex_enter(&ui->ui_mx);
2212 	ui->ui_lock &= ~MD_UL_OPENINPROGRESS;
2213 	mutex_exit(&ui->ui_mx);
2214 
2215 	if (lockp) {
2216 		/*
2217 		 * If ioctl lock is held, use openclose_exit
2218 		 * routine that will clear the lockp reader flag.
2219 		 */
2220 		(void) md_ioctl_openclose_exit(lockp);
2221 	} else {
2222 		md_unit_openclose_exit(ui);
2223 	}
2224 	return (err);
2225 }
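/*
 * The MD_UL_OPENINPROGRESS handling above is the usual flag-plus-condvar
 * gate, complicated by the need to drop the openclose lock before waiting.
 * Reduced to its essentials (sketch only; "busy", "mx" and "cv" stand in
 * for ui_lock, ui_mx and ui_cv):
 *
 *	mutex_enter(&mx);
 *	while (busy)
 *		cv_wait(&cv, &mx);
 *	busy = 1;
 *	mutex_exit(&mx);
 *	(perform the open)
 *	mutex_enter(&mx);
 *	busy = 0;
 *	cv_broadcast(&cv);
 *	mutex_exit(&mx);
 *
 * The goto-based retry replaces the simple while() because the unit
 * openclose lock must be released and reacquired around each wait.
 */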
2226 
2227 int
2228 mirror_internal_close(
2229 	minor_t		mnum,
2230 	int		otyp,
2231 	int		md_cflags,
2232 	IOLOCK		*lockp		/* can be NULL */
2233 )
2234 {
2235 	mdi_unit_t	*ui = MDI_UNIT(mnum);
2236 	mm_unit_t	*un;
2237 	int		err = 0;
2238 
2239 	/* single thread */
2240 	if (lockp) {
2241 		/*
2242 		 * If ioctl lock is held, use openclose_enter
2243 		 * routine that will set the ioctl flag when
2244 		 * grabbing the readerlock.
2245 		 */
2246 		un = (mm_unit_t *)md_ioctl_openclose_enter(lockp, ui);
2247 	} else {
2248 		un = (mm_unit_t *)md_unit_openclose_enter(ui);
2249 	}
2250 
2251 	/* count closed */
2252 	if ((err = md_unit_decopen(mnum, otyp)) != 0)
2253 		goto out;
2254 
2255 	/* close devices, if necessary */
2256 	if (! md_unit_isopen(ui) || (md_cflags & MD_OFLG_PROBEDEV)) {
2257 		/*
2258 		 * Clean up dirty bitmap for this unit. Do this
2259 		 * before closing the underlying devices to avoid
2260 		 * race conditions with reset_mirror() as a
2261 		 * result of a 'metaset -r' command running in
2262 		 * parallel. This might cause deallocation of
2263 		 * dirty region bitmaps; with underlying metadevices
2264 		 * in place this can't happen.
2265 		 * Don't do this if a MN set and ABR not set
2266 		 */
2267 		if (new_resync && !(MD_STATUS(un) & MD_UN_KEEP_DIRTY)) {
2268 			if (!MD_MNSET_SETNO(MD_UN2SET(un)) ||
2269 			    !(ui->ui_tstate & MD_ABR_CAP))
2270 				mirror_process_unit_resync(un);
2271 		}
2272 		(void) mirror_close_all_devs(un, md_cflags);
2273 
2274 		/*
2275 		 * For a MN set with transient capabilities (e.g. ABR/DMR) set,
2276 		 * clear these capabilities on the last open in the cluster.
2277 		 * To do this we send a message to all nodes to see if the
2278 		 * device is open.
2279 		 */
2280 		if (MD_MNSET_SETNO(MD_UN2SET(un)) &&
2281 		    (ui->ui_tstate & (MD_ABR_CAP|MD_DMR_CAP))) {
2282 			if (lockp) {
2283 				(void) md_ioctl_openclose_exit(lockp);
2284 			} else {
2285 				md_unit_openclose_exit(ui);
2286 			}
2287 
2288 			/*
2289 			 * if we are in the context of an ioctl, drop the
2290 			 * ioctl lock.
2291 			 * Otherwise, no other locks should be held.
2292 			 */
2293 			if (lockp) {
2294 				IOLOCK_RETURN_RELEASE(0, lockp);
2295 			}
2296 
2297 			mdmn_clear_all_capabilities(mnum);
2298 
2299 			/* if dropped the lock previously, regain it */
2300 			if (lockp) {
2301 				IOLOCK_RETURN_REACQUIRE(lockp);
2302 			}
2303 			return (0);
2304 		}
2305 		/* unlock and return success */
2306 	}
2307 out:
2308 	/* Call whether lockp is NULL or not. */
2309 	if (lockp) {
2310 		md_ioctl_openclose_exit(lockp);
2311 	} else {
2312 		md_unit_openclose_exit(ui);
2313 	}
2314 	return (err);
2315 }
2316 
2317 /*
2318  * When a component has completed resyncing and is now ok, check if the
2319  * corresponding component in the other submirrors is in the Last Erred
2320  * state.  If it is, we want to change that to the Erred state so we stop
2321  * using that component and start using this good component instead.
2322  *
2323  * This is called from set_sm_comp_state and recursively calls
2324  * set_sm_comp_state if it needs to change the Last Erred state.
2325  */
2326 static void
2327 reset_lasterred(mm_unit_t *un, int smi, mddb_recid_t *extras, uint_t flags,
2328 	IOLOCK *lockp)
2329 {
2330 	mm_submirror_t		*sm;
2331 	mm_submirror_ic_t	*smic;
2332 	int			ci;
2333 	int			i;
2334 	int			compcnt;
2335 	int			changed = 0;
2336 
2337 	for (i = 0; i < NMIRROR; i++) {
2338 		sm = &un->un_sm[i];
2339 		smic = &un->un_smic[i];
2340 
2341 		if (!SMS_IS(sm, SMS_INUSE))
2342 			continue;
2343 
2344 		/* ignore the submirror that we just made ok */
2345 		if (i == smi)
2346 			continue;
2347 
2348 		compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, un);
2349 		for (ci = 0; ci < compcnt; ci++) {
2350 			md_m_shared_t	*shared;
2351 
2352 			shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
2353 			    (sm->sm_dev, sm, ci);
2354 
2355 			if ((shared->ms_state & CS_LAST_ERRED) &&
2356 			    !mirror_other_sources(un, i, ci, 1)) {
2357 
2358 				set_sm_comp_state(un, i, ci, CS_ERRED, extras,
2359 				    flags, lockp);
2360 				changed = 1;
2361 			}
2362 		}
2363 	}
2364 
2365 	/* maybe there is a hotspare for this newly erred component */
2366 	if (changed) {
2367 		set_t	setno;
2368 
2369 		setno = MD_UN2SET(un);
2370 		if (MD_MNSET_SETNO(setno)) {
2371 			send_poke_hotspares(setno);
2372 		} else {
2373 			(void) poke_hotspares();
2374 		}
2375 	}
2376 }
2377 
2378 /*
2379  * set_sm_comp_state
2380  *
2381  * Set the state of a submirror component to the specified new state.
2382  * If the mirror is in a multi-node set, send messages to all nodes to
2383  * block all writes to the mirror and then update the state and release the
2384  * writes. These messages are only sent if MD_STATE_XMIT is set in flags.
2385  * MD_STATE_XMIT will be unset in 2 cases:
2386  * 1. When the state is changed to CS_RESYNC as this state change
2387  * will already have been updated on each node by the processing of the
2388  * distributed metasync command, hence no need to xmit.
2389  * 2. When the state is changed to CS_OKAY after a resync has completed. Again
2390  * the resync completion will already have been processed on each node by
2391  * the processing of the MD_MN_MSG_RESYNC_PHASE_DONE message for a component
2392  * resync, hence no need to xmit.
2393  *
2394  * In case we are called from the updates of a watermark,
2395  * (then MD_STATE_WMUPDATE will be set in the ps->flags) this is due to
2396  * a metainit or similar. In this case the message that we sent to propagate
2397  * the state change must not be a class1 message as that would deadlock with
2398  * the metainit command that is still being processed.
2399  * This we achieve by creating a class2 message MD_MN_MSG_STATE_UPDATE2
2400  * instead. This also makes the submessage generator create a class2
2401  * submessage rather than a class1 (which would also block).
2402  *
2403  * On entry, unit_writerlock is held
2404  * If MD_STATE_OCHELD is set in flags, then unit_openclose lock is
2405  * also held.
2406  */
2407 void
2408 set_sm_comp_state(
2409 	mm_unit_t	*un,
2410 	int		smi,
2411 	int		ci,
2412 	int		newstate,
2413 	mddb_recid_t	*extras,
2414 	uint_t		flags,
2415 	IOLOCK		*lockp
2416 )
2417 {
2418 	mm_submirror_t		*sm;
2419 	mm_submirror_ic_t	*smic;
2420 	md_m_shared_t		*shared;
2421 	int			origstate;
2422 	void			(*get_dev)();
2423 	ms_cd_info_t		cd;
2424 	char			devname[MD_MAX_CTDLEN];
2425 	int			err;
2426 	set_t			setno = MD_UN2SET(un);
2427 	md_mn_msg_stch_t	stchmsg;
2428 	mdi_unit_t		*ui = MDI_UNIT(MD_SID(un));
2429 	md_mn_kresult_t		*kresult;
2430 	int			rval;
2431 	uint_t			msgflags;
2432 	md_mn_msgtype_t		msgtype;
2433 	int			save_lock = 0;
2434 	mdi_unit_t		*ui_sm;
2435 	int			nretries = 0;
2436 
2437 	sm = &un->un_sm[smi];
2438 	smic = &un->un_smic[smi];
2439 
2440 	/* If we have a real error status then turn off MD_INACCESSIBLE. */
2441 	ui_sm = MDI_UNIT(getminor(md_dev64_to_dev(sm->sm_dev)));
2442 	if (newstate & (CS_ERRED | CS_RESYNC | CS_LAST_ERRED) &&
2443 	    ui_sm->ui_tstate & MD_INACCESSIBLE) {
2444 		ui_sm->ui_tstate &= ~MD_INACCESSIBLE;
2445 	}
2446 
2447 	shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
2448 	    (sm->sm_dev, sm, ci);
2449 	origstate = shared->ms_state;
2450 
2451 	/*
2452 	 * If the new state is an error and the old one wasn't, generate
2453 	 * a console message. We do this before we send the state to other
2454 	 * nodes in a MN set because the state change may change the component
2455 	 * name  if a hotspare is allocated.
2456 	 */
2457 	if ((! (origstate & (CS_ERRED|CS_LAST_ERRED))) &&
2458 	    (newstate & (CS_ERRED|CS_LAST_ERRED))) {
2459 
2460 		get_dev = (void (*)())md_get_named_service(sm->sm_dev, 0,
2461 		    "get device", 0);
2462 		(void) (*get_dev)(sm->sm_dev, sm, ci, &cd);
2463 
2464 		err = md_getdevname(setno, mddb_getsidenum(setno), 0,
2465 		    cd.cd_dev, devname, sizeof (devname));
2466 
2467 		if (err == ENOENT) {
2468 			(void) md_devname(setno, cd.cd_dev, devname,
2469 			    sizeof (devname));
2470 		}
2471 
2472 		cmn_err(CE_WARN, "md: %s: %s needs maintenance",
2473 		    md_shortname(md_getminor(sm->sm_dev)), devname);
2474 
2475 		if (newstate & CS_LAST_ERRED) {
2476 			cmn_err(CE_WARN, "md: %s: %s last erred",
2477 			    md_shortname(md_getminor(sm->sm_dev)),
2478 			    devname);
2479 
2480 		} else if (shared->ms_flags & MDM_S_ISOPEN) {
2481 			/*
2482 			 * Close the broken device and clear the open flag on
2483 			 * it.  Closing the device means the RCM framework will
2484 			 * be able to unconfigure the device if required.
2485 			 *
2486 			 * We have to check that the device is open, otherwise
2487 			 * the first open on it has resulted in the error that
2488 			 * is being processed and the actual cd.cd_dev will be
2489 			 * NODEV64.
2490 			 *
2491 			 * If this is a multi-node mirror, then the multinode
2492 			 * state checks following this code will cause the
2493 			 * slave nodes to close the mirror in the function
2494 			 * mirror_set_state().
2495 			 */
2496 			md_layered_close(cd.cd_dev, MD_OFLG_NULL);
2497 			shared->ms_flags &= ~MDM_S_ISOPEN;
2498 		}
2499 
2500 	} else if ((origstate & CS_LAST_ERRED) && (newstate & CS_ERRED) &&
2501 	    (shared->ms_flags & MDM_S_ISOPEN)) {
2502 		/*
2503 		 * Similar to logic above except no log messages since we
2504 		 * are just transitioning from Last Erred to Erred.
2505 		 */
2506 		get_dev = (void (*)())md_get_named_service(sm->sm_dev, 0,
2507 		    "get device", 0);
2508 		(void) (*get_dev)(sm->sm_dev, sm, ci, &cd);
2509 
2510 		md_layered_close(cd.cd_dev, MD_OFLG_NULL);
2511 		shared->ms_flags &= ~MDM_S_ISOPEN;
2512 	}
2513 
2514 	if ((MD_MNSET_SETNO(setno)) && (origstate != newstate) &&
2515 	    (flags & MD_STATE_XMIT) && !(ui->ui_tstate & MD_ERR_PENDING)) {
2516 		/*
2517 		 * For a multi-node mirror, send the state change to the
2518 		 * master, which broadcasts to all nodes, including this
2519 		 * one. Once the message is received, the state is set
2520 		 * in-core and the master commits the change to disk.
2521 		 * There is a case, comp_replace,  where this function
2522 		 * can be called from within an ioctl and therefore in this
2523 		 * case, as the ioctl will already be called on each node,
2524 		 * there is no need to xmit the state change to the master for
2525 		 * distribution to the other nodes. MD_STATE_XMIT flag is used
2526 		 * to indicate whether a xmit is required. The mirror's
2527 		 * transient state is set to MD_ERR_PENDING to avoid sending
2528 		 * multiple messages.
2529 		 */
2530 		if (newstate & (CS_ERRED|CS_LAST_ERRED))
2531 			ui->ui_tstate |= MD_ERR_PENDING;
2532 
2533 		/*
2534 		 * Send a state update message to all nodes. This message
2535 		 * will generate 2 submessages, the first one to suspend
2536 		 * all writes to the mirror and the second to update the
2537 		 * state and resume writes.
2538 		 */
2539 		stchmsg.msg_stch_mnum = un->c.un_self_id;
2540 		stchmsg.msg_stch_sm = smi;
2541 		stchmsg.msg_stch_comp = ci;
2542 		stchmsg.msg_stch_new_state = newstate;
2543 		stchmsg.msg_stch_hs_id = shared->ms_hs_id;
2544 #ifdef DEBUG
2545 		if (mirror_debug_flag)
2546 			printf("send set state, %x, %x, %x, %x, %x\n",
2547 			    stchmsg.msg_stch_mnum, stchmsg.msg_stch_sm,
2548 			    stchmsg.msg_stch_comp, stchmsg.msg_stch_new_state,
2549 			    stchmsg.msg_stch_hs_id);
2550 #endif
2551 		if (flags & MD_STATE_WMUPDATE) {
2552 			msgtype  = MD_MN_MSG_STATE_UPDATE2;
2553 			/*
2554 			 * When coming from an update of watermarks, there
2555 			 * must already be a message logged that triggered
2556 			 * this action. So, no need to log this message, too.
2557 			 */
2558 			msgflags = MD_MSGF_NO_LOG;
2559 		} else {
2560 			msgtype  = MD_MN_MSG_STATE_UPDATE;
2561 			msgflags = MD_MSGF_DEFAULT_FLAGS;
2562 		}
2563 
2564 		/*
2565 		 * If we are in the context of an ioctl, drop the ioctl lock.
2566 		 * lockp holds the list of locks held.
2567 		 *
2568 		 * Otherwise, increment the appropriate reacquire counters.
2569 		 * If openclose lock is *held, then must reacquire reader
2570 		 * lock before releasing the openclose lock.
2571 		 * Do not drop the ARRAY_WRITER lock as we may not be able
2572 		 * to reacquire it.
2573 		 */
2574 		if (lockp) {
2575 			if (lockp->l_flags & MD_ARRAY_WRITER) {
2576 				save_lock = MD_ARRAY_WRITER;
2577 				lockp->l_flags &= ~MD_ARRAY_WRITER;
2578 			} else if (lockp->l_flags & MD_ARRAY_READER) {
2579 				save_lock = MD_ARRAY_READER;
2580 				lockp->l_flags &= ~MD_ARRAY_READER;
2581 			}
2582 			IOLOCK_RETURN_RELEASE(0, lockp);
2583 		} else {
2584 			if (flags & MD_STATE_OCHELD) {
2585 				md_unit_writerexit(ui);
2586 				(void) md_unit_readerlock(ui);
2587 				md_unit_openclose_exit(ui);
2588 			} else {
2589 				md_unit_writerexit(ui);
2590 			}
2591 		}
2592 
2593 		kresult = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
2594 sscs_msg:
2595 		rval = mdmn_ksend_message(setno, msgtype, msgflags, 0,
2596 		    (char *)&stchmsg, sizeof (stchmsg), kresult);
2597 
2598 		if (!MDMN_KSEND_MSG_OK(rval, kresult)) {
2599 			mdmn_ksend_show_error(rval, kresult, "STATE UPDATE");
2600 			/* If we're shutting down already, pause things here. */
2601 			if (kresult->kmmr_comm_state == MDMNE_RPC_FAIL) {
2602 				while (!md_mn_is_commd_present()) {
2603 					delay(md_hz);
2604 				}
2605 				/*
2606 				 * commd is now available; retry the message
2607 				 * one time. If that fails we fall through and
2608 				 * panic as the system is in an unexpected state
2609 				 */
2610 				if (nretries++ == 0)
2611 					goto sscs_msg;
2612 			}
2613 			cmn_err(CE_PANIC,
2614 			    "ksend_message failure: STATE_UPDATE");
2615 		}
2616 		kmem_free(kresult, sizeof (md_mn_kresult_t));
2617 
2618 		/* if dropped the lock previously, regain it */
2619 		if (lockp) {
2620 			IOLOCK_RETURN_REACQUIRE(lockp);
2621 			lockp->l_flags |= save_lock;
2622 		} else {
2623 			/*
2624 			 * Reacquire dropped locks and update acquirecnts
2625 			 * appropriately.
2626 			 */
2627 			if (flags & MD_STATE_OCHELD) {
2628 				/*
2629 				 * openclose also grabs readerlock.
2630 				 */
2631 				(void) md_unit_openclose_enter(ui);
2632 				md_unit_readerexit(ui);
2633 				(void) md_unit_writerlock(ui);
2634 			} else {
2635 				(void) md_unit_writerlock(ui);
2636 			}
2637 		}
2638 
2639 		ui->ui_tstate &= ~MD_ERR_PENDING;
2640 	} else {
2641 		shared->ms_state = newstate;
2642 		uniqtime32(&shared->ms_timestamp);
2643 
2644 		if (newstate == CS_ERRED)
2645 			shared->ms_flags |= MDM_S_NOWRITE;
2646 		else
2647 			shared->ms_flags &= ~MDM_S_NOWRITE;
2648 
2649 		shared->ms_flags &= ~MDM_S_IOERR;
2650 		un->un_changecnt++;
2651 		shared->ms_lasterrcnt = un->un_changecnt;
2652 
2653 		mirror_set_sm_state(sm, smic, SMS_RUNNING, 0);
2654 		mirror_commit(un, SMI2BIT(smi), extras);
2655 	}
2656 
2657 	if ((origstate & CS_RESYNC) && (newstate & CS_OKAY)) {
2658 		/*
2659 		 * Resetting the Last Erred state will recursively call back
2660 		 * into this function (set_sm_comp_state) to update the state.
2661 		 */
2662 		reset_lasterred(un, smi, extras, flags, lockp);
2663 	}
2664 }
2665 
2666 static int
2667 find_another_logical(
2668 	mm_unit_t		*un,
2669 	mm_submirror_t		*esm,
2670 	diskaddr_t		blk,
2671 	u_longlong_t		cnt,
2672 	int			must_be_open,
2673 	int			state,
2674 	int			err_cnt)
2675 {
2676 	u_longlong_t	cando;
2677 	md_dev64_t	dev;
2678 	md_m_shared_t	*s;
2679 
2680 	esm->sm_state |= SMS_IGNORE;
2681 	while (cnt != 0) {
2682 		u_longlong_t	 mcnt;
2683 
2684 		mcnt = MIN(cnt, lbtodb(1024 * 1024 * 1024));	/* 1 Gig Blks */
2685 
2686 		dev = select_read_unit(un, blk, mcnt, &cando,
2687 		    must_be_open, &s, NULL);
2688 		if (dev == (md_dev64_t)0)
2689 			break;
2690 
2691 		if ((state == CS_LAST_ERRED) &&
2692 		    (s->ms_state == CS_LAST_ERRED) &&
2693 		    (err_cnt > s->ms_lasterrcnt))
2694 			break;
2695 
2696 		cnt -= cando;
2697 		blk += cando;
2698 	}
2699 	esm->sm_state &= ~SMS_IGNORE;
2700 	return (cnt != 0);
2701 }
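/*
 * Note on the chunking above: lbtodb(1024 * 1024 * 1024) is 1GB expressed
 * in 512-byte disk blocks (2097152 blocks), so the loop asks
 * select_read_unit() for an alternate source in at most 1GB pieces; cando
 * reports how much of each piece the chosen source actually covers, and
 * the walk continues from there.
 */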
2702 
2703 int
2704 mirror_other_sources(mm_unit_t *un, int smi, int ci, int must_be_open)
2705 {
2706 	mm_submirror_t		*sm;
2707 	mm_submirror_ic_t	*smic;
2708 	size_t			count;
2709 	diskaddr_t		block;
2710 	u_longlong_t		skip;
2711 	u_longlong_t		size;
2712 	md_dev64_t		dev;
2713 	int			cnt;
2714 	md_m_shared_t		*s;
2715 	int			not_found;
2716 
2717 	sm = &un->un_sm[smi];
2718 	smic = &un->un_smic[smi];
2719 	dev = sm->sm_dev;
2720 
2721 	/*
2722 	 * Make sure every component of the submirror
2723 	 * has other sources.
2724 	 */
2725 	if (ci < 0) {
2726 		/* Find the highest lasterrcnt */
2727 		cnt = (*(smic->sm_get_component_count))(dev, sm);
2728 		for (ci = 0; ci < cnt; ci++) {
2729 			not_found = mirror_other_sources(un, smi, ci,
2730 			    must_be_open);
2731 			if (not_found)
2732 				return (1);
2733 		}
2734 		return (0);
2735 	}
2736 
2737 	/*
2738 	 * Make sure this component has other sources
2739 	 */
2740 	(void) (*(smic->sm_get_bcss))
2741 	    (dev, sm, ci, &block, &count, &skip, &size);
2742 
2743 	if (count == 0)
2744 		return (1);
2745 
2746 	s = (md_m_shared_t *)(*(smic->sm_shared_by_indx))(dev, sm, ci);
2747 
2748 	while (count--) {
2749 		if (block >= un->c.un_total_blocks)
2750 			return (0);
2751 
2752 		if ((block + size) > un->c.un_total_blocks)
2753 			size = un->c.un_total_blocks - block;
2754 
2755 		not_found = find_another_logical(un, sm, block, size,
2756 		    must_be_open, s->ms_state, s->ms_lasterrcnt);
2757 		if (not_found)
2758 			return (1);
2759 
2760 		block += size + skip;
2761 	}
2762 	return (0);
2763 }
2764 
2765 static void
2766 finish_error(md_mps_t *ps)
2767 {
2768 	struct buf	*pb;
2769 	mm_unit_t	*un;
2770 	mdi_unit_t	*ui;
2771 	uint_t		new_str_flags;
2772 
2773 	pb = ps->ps_bp;
2774 	un = ps->ps_un;
2775 	ui = ps->ps_ui;
2776 
2777 	/*
2778 	 * Must flag any error to the resync originator if we're performing
2779 	 * a Write-after-Read. This corresponds to an i/o error on a resync
2780 	 * target device and in this case we ought to abort the resync as there
2781 	 * is nothing that can be done to recover from this without operator
2782 	 * intervention. If we don't set the B_ERROR flag we will continue
2783 	 * reading from the mirror but won't write to the target (as it will
2784 	 * have been placed into an errored state).
2785 	 * To handle the case of multiple components within a submirror we only
2786 	 * set the B_ERROR bit if explicitly requested via MD_MPS_FLAG_ERROR.
2787 	 * The originator of the resync read will cause this bit to be set if
2788 	 * the underlying component count is one for a submirror resync. All
2789 	 * other resync types will have the flag set as there is no underlying
2790 	 * resync which can be performed on a contained metadevice for these
2791 	 * resync types (optimized or component).
2792 	 */
2793 
2794 	if (ps->ps_flags & MD_MPS_WRITE_AFTER_READ) {
2795 		if (ps->ps_flags & MD_MPS_FLAG_ERROR)
2796 			pb->b_flags |= B_ERROR;
2797 		md_kstat_done(ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
2798 		MPS_FREE(mirror_parent_cache, ps);
2799 		md_unit_readerexit(ui);
2800 		md_biodone(pb);
2801 		return;
2802 	}
2803 	/*
2804 	 * Set the MD_IO_COUNTED flag as we are retrying the same I/O
2805 	 * operation therefore this I/O request has already been counted,
2806 	 * the I/O count variable will be decremented by mirror_done()'s
2807 	 * call to md_biodone().
2808 	 */
2809 	if (ps->ps_changecnt != un->un_changecnt) {
2810 		new_str_flags = MD_STR_NOTTOP | MD_IO_COUNTED;
2811 		if (ps->ps_flags & MD_MPS_WOW)
2812 			new_str_flags |= MD_STR_WOW;
2813 		if (ps->ps_flags & MD_MPS_MAPPED)
2814 			new_str_flags |= MD_STR_MAPPED;
2815 		/*
2816 		 * If this I/O request was a read that was part of a resync,
2817 		 * set MD_STR_WAR for the retried read to ensure that the
2818 		 * resync write (i.e. write-after-read) will be performed
2819 		 */
2820 		if (ps->ps_flags & MD_MPS_RESYNC_READ)
2821 			new_str_flags |= MD_STR_WAR;
2822 		md_kstat_done(ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
2823 		MPS_FREE(mirror_parent_cache, ps);
2824 		md_unit_readerexit(ui);
2825 		(void) md_mirror_strategy(pb, new_str_flags, NULL);
2826 		return;
2827 	}
2828 
2829 	pb->b_flags |= B_ERROR;
2830 	md_kstat_done(ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
2831 	MPS_FREE(mirror_parent_cache, ps);
2832 	md_unit_readerexit(ui);
2833 	md_biodone(pb);
2834 }
2835 
2836 static void
2837 error_update_unit(md_mps_t *ps)
2838 {
2839 	mm_unit_t		*un;
2840 	mdi_unit_t		*ui;
2841 	int			smi;	/* sub mirror index */
2842 	int			ci;	/* errored component */
2843 	set_t			setno;
2844 	uint_t			flags;	/* for set_sm_comp_state() */
2845 	uint_t			hspflags; /* for check_comp_4_hotspares() */
2846 
2847 	ui = ps->ps_ui;
2848 	un = (mm_unit_t *)md_unit_writerlock(ui);
2849 	setno = MD_UN2SET(un);
2850 
2851 	/* All of these updates have to be propagated in the case of a MN set */
2852 	flags = MD_STATE_XMIT;
2853 	hspflags = MD_HOTSPARE_XMIT;
2854 
2855 	/* special treatment if we are called during updating watermarks */
2856 	if (ps->ps_flags & MD_MPS_WMUPDATE) {
2857 		flags |= MD_STATE_WMUPDATE;
2858 		hspflags |= MD_HOTSPARE_WMUPDATE;
2859 	}
2860 	smi = 0;
2861 	ci = 0;
2862 	while (mirror_geterror(un, &smi, &ci, 1, 0) != 0) {
2863 		if (mirror_other_sources(un, smi, ci, 0) == 1) {
2864 
2865 			/* Never called from ioctl context, so (IOLOCK *)NULL */
2866 			set_sm_comp_state(un, smi, ci, CS_LAST_ERRED, 0, flags,
2867 			    (IOLOCK *)NULL);
2868 			/*
2869 			 * For a MN set, the NOTIFY is done when the state
2870 			 * change is processed on each node
2871 			 */
2872 			if (!MD_MNSET_SETNO(MD_UN2SET(un))) {
2873 				SE_NOTIFY(EC_SVM_STATE, ESC_SVM_LASTERRED,
2874 				    SVM_TAG_METADEVICE, setno, MD_SID(un));
2875 			}
2876 			continue;
2877 		}
2878 		/* Never called from ioctl context, so (IOLOCK *)NULL */
2879 		set_sm_comp_state(un, smi, ci, CS_ERRED, 0, flags,
2880 		    (IOLOCK *)NULL);
2881 		/*
2882 		 * For a MN set, the NOTIFY is done when the state
2883 		 * change is processed on each node
2884 		 */
2885 		if (!MD_MNSET_SETNO(MD_UN2SET(un))) {
2886 			SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED,
2887 			    SVM_TAG_METADEVICE, setno, MD_SID(un));
2888 		}
2889 		smi = 0;
2890 		ci = 0;
2891 	}
2892 
2893 	md_unit_writerexit(ui);
2894 	if (MD_MNSET_SETNO(setno)) {
2895 		send_poke_hotspares(setno);
2896 	} else {
2897 		(void) poke_hotspares();
2898 	}
2899 	(void) md_unit_readerlock(ui);
2900 
2901 	finish_error(ps);
2902 }
2903 
2904 /*
2905  * When we have a B_FAILFAST IO error on a Last Erred component we need to
2906  * retry the IO without B_FAILFAST set so that we try to ensure that the
2907  * component "sees" each IO.
2908  */
2909 static void
2910 last_err_retry(md_mcs_t *cs)
2911 {
2912 	struct buf	*cb;
2913 	md_mps_t	*ps;
2914 	uint_t		flags;
2915 
2916 	cb = &cs->cs_buf;
2917 	cb->b_flags &= ~B_FAILFAST;
2918 
2919 	/* if we're panicking just let this I/O error out */
2920 	if (panicstr) {
2921 		(void) mirror_done(cb);
2922 		return;
2923 	}
2924 
2925 	/* reissue the I/O */
2926 
2927 	ps = cs->cs_ps;
2928 
2929 	bioerror(cb, 0);
2930 
2931 	mutex_enter(&ps->ps_mx);
2932 
2933 	flags = MD_STR_NOTTOP;
2934 	if (ps->ps_flags & MD_MPS_MAPPED)
2935 		flags |= MD_STR_MAPPED;
2936 	if (ps->ps_flags & MD_MPS_NOBLOCK)
2937 		flags |= MD_NOBLOCK;
2938 
2939 	mutex_exit(&ps->ps_mx);
2940 
2941 	clear_retry_error(cb);
2942 
2943 	cmn_err(CE_NOTE, "!md: %s: Last Erred, retry I/O without B_FAILFAST",
2944 	    md_shortname(getminor(cb->b_edev)));
2945 
2946 	md_call_strategy(cb, flags, NULL);
2947 }
2948 
2949 static void
2950 mirror_error(md_mps_t *ps)
2951 {
2952 	int		smi;	/* sub mirror index */
2953 	int		ci;	/* errored component */
2954 
2955 	if (panicstr) {
2956 		finish_error(ps);
2957 		return;
2958 	}
2959 
2960 	if (ps->ps_flags & MD_MPS_ON_OVERLAP)
2961 		mirror_overlap_tree_remove(ps);
2962 
2963 	smi = 0;
2964 	ci = 0;
2965 	if (mirror_geterror(ps->ps_un, &smi, &ci, 0, 0) != 0) {
2966 		md_unit_readerexit(ps->ps_ui);
2967 		daemon_request(&md_mstr_daemon, error_update_unit,
2968 		    (daemon_queue_t *)ps, REQ_OLD);
2969 		return;
2970 	}
2971 
2972 	finish_error(ps);
2973 }
2974 
2975 static int
2976 copy_write_done(struct buf *cb)
2977 {
2978 	md_mps_t	*ps;
2979 	buf_t		*pb;
2980 	char		*wowbuf;
2981 	wowhdr_t	*wowhdr;
2982 	ssize_t		wow_resid;
2983 
2984 	/* get wowbuf and save structure */
2985 	wowbuf = cb->b_un.b_addr;
2986 	wowhdr = WOWBUF_HDR(wowbuf);
2987 	ps = wowhdr->wow_ps;
2988 	pb = ps->ps_bp;
2989 
2990 	/* Save error information, then free cb */
2991 	if (cb->b_flags & B_ERROR)
2992 		pb->b_flags |= B_ERROR;
2993 
2994 	if (cb->b_flags & B_REMAPPED)
2995 		bp_mapout(cb);
2996 
2997 	freerbuf(cb);
2998 
2999 	/* update residual and continue if needed */
3000 	if ((pb->b_flags & B_ERROR) == 0) {
3001 		wow_resid = pb->b_bcount - wowhdr->wow_offset;
3002 		pb->b_resid = wow_resid;
3003 		if (wow_resid > 0)  {
3004 			daemon_request(&md_mstr_daemon, copy_write_cont,
3005 			    (daemon_queue_t *)wowhdr, REQ_OLD);
3006 			return (1);
3007 		}
3008 	}
3009 
3010 	/* Write is complete, release resources. */
3011 	kmem_cache_free(mirror_wowblk_cache, wowhdr);
3012 	ASSERT(!(ps->ps_flags & MD_MPS_ON_OVERLAP));
3013 	md_kstat_done(ps->ps_ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
3014 	MPS_FREE(mirror_parent_cache, ps);
3015 	md_biodone(pb);
3016 	return (0);
3017 }
3018 
3019 static void
3020 copy_write_cont(wowhdr_t *wowhdr)
3021 {
3022 	buf_t		*pb;
3023 	buf_t		*cb;
3024 	char		*wowbuf;
3025 	int		wow_offset;
3026 	size_t		wow_resid;
3027 	diskaddr_t	wow_blkno;
3028 
3029 	wowbuf = WOWHDR_BUF(wowhdr);
3030 	pb = wowhdr->wow_ps->ps_bp;
3031 
3032 	/* get data on current location */
3033 	wow_offset = wowhdr->wow_offset;
3034 	wow_resid = pb->b_bcount - wow_offset;
3035 	wow_blkno = pb->b_lblkno + lbtodb(wow_offset);
3036 
3037 	/* setup child buffer */
3038 	cb = getrbuf(KM_SLEEP);
3039 	cb->b_flags = B_WRITE;
3040 	cb->b_edev = pb->b_edev;
3041 	cb->b_un.b_addr = wowbuf;	/* change to point at WOWBUF */
3042 	cb->b_bufsize = md_wowbuf_size; /* change to wowbuf_size */
3043 	cb->b_iodone = copy_write_done;
3044 	cb->b_bcount = MIN(md_wowbuf_size, wow_resid);
3045 	cb->b_lblkno = wow_blkno;
3046 
3047 	/* move offset to next section */
3048 	wowhdr->wow_offset += cb->b_bcount;
3049 
3050 	/* copy and setup write for current section */
3051 	bcopy(&pb->b_un.b_addr[wow_offset], wowbuf, cb->b_bcount);
3052 
3053 	/* do it */
3054 	/*
3055 	 * Do not set the MD_IO_COUNTED flag as this is a new I/O request
3056 	 * that handles the WOW condition. The resultant increment on the
3057 	 * I/O count variable is cleared by copy_write_done()'s call to
3058 	 * md_biodone().
3059 	 */
3060 	(void) md_mirror_strategy(cb, MD_STR_NOTTOP | MD_STR_WOW
3061 	    | MD_STR_MAPPED, NULL);
3062 }
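/*
 * Worked example of the chunking above (sizes are hypothetical;
 * md_wowbuf_size is a tunable): with pb->b_bcount == 40K and
 * md_wowbuf_size == 16K the original write is re-issued as three
 * private-buffer writes:
 *
 *	pass	wow_offset	cb->b_bcount	cb->b_lblkno
 *	 1	     0		    16K		pb->b_lblkno
 *	 2	    16K		    16K		pb->b_lblkno + lbtodb(16K)
 *	 3	    32K		     8K		pb->b_lblkno + lbtodb(32K)
 *
 * After the third write completes, copy_write_done() finds wow_resid == 0
 * and finishes the parent buf.
 */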
3063 
3064 static void
3065 md_mirror_copy_write(md_mps_t *ps)
3066 {
3067 	wowhdr_t	*wowhdr;
3068 
3069 	wowhdr = kmem_cache_alloc(mirror_wowblk_cache, MD_ALLOCFLAGS);
3070 	mirror_wowblk_init(wowhdr);
3071 	wowhdr->wow_ps = ps;
3072 	wowhdr->wow_offset = 0;
3073 	copy_write_cont(wowhdr);
3074 }
3075 
3076 static void
3077 handle_wow(md_mps_t *ps)
3078 {
3079 	buf_t		*pb;
3080 
3081 	pb = ps->ps_bp;
3082 
3083 	bp_mapin(pb);
3084 
3085 	md_mirror_wow_cnt++;
3086 	if (!(pb->b_flags & B_PHYS) && (md_mirror_wow_flg & WOW_LOGIT)) {
3087 		cmn_err(CE_NOTE,
3088 		    "md: %s, blk %lld, cnt %ld: Write on write %d occurred",
3089 		    md_shortname(getminor(pb->b_edev)),
3090 		    (longlong_t)pb->b_lblkno, pb->b_bcount, md_mirror_wow_cnt);
3091 	}
3092 
3093 	/*
3094 	 * Set the MD_IO_COUNTED flag as we are retrying the same I/O
3095 	 * operation therefore this I/O request has already been counted,
3096 	 * the I/O count variable will be decremented by mirror_done()'s
3097 	 * call to md_biodone().
3098 	 */
3099 	if (md_mirror_wow_flg & WOW_NOCOPY)
3100 		(void) md_mirror_strategy(pb, MD_STR_NOTTOP | MD_STR_WOW |
3101 		    MD_STR_MAPPED | MD_IO_COUNTED, ps);
3102 	else
3103 		md_mirror_copy_write(ps);
3104 }
3105 
3106 /*
3107  * Return true if the specified submirror is either in the Last Erred
3108  * state or is transitioning into the Last Erred state.
3109  */
3110 static bool_t
3111 submirror_is_lasterred(mm_unit_t *un, int smi)
3112 {
3113 	mm_submirror_t		*sm;
3114 	mm_submirror_ic_t	*smic;
3115 	md_m_shared_t		*shared;
3116 	int			ci;
3117 	int			compcnt;
3118 
3119 	sm = &un->un_sm[smi];
3120 	smic = &un->un_smic[smi];
3121 
3122 	compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, un);
3123 	for (ci = 0; ci < compcnt; ci++) {
3124 		shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
3125 		    (sm->sm_dev, sm, ci);
3126 
3127 		if (shared->ms_state == CS_LAST_ERRED)
3128 			return (B_TRUE);
3129 
3130 		/*
3131 		 * It is not currently Last Erred, check if entering Last Erred.
3132 		 */
3133 		if ((shared->ms_flags & MDM_S_IOERR) &&
3134 		    ((shared->ms_state == CS_OKAY) ||
3135 		    (shared->ms_state == CS_RESYNC))) {
3136 			if (mirror_other_sources(un, smi, ci, 0) == 1)
3137 				return (B_TRUE);
3138 		}
3139 	}
3140 
3141 	return (B_FALSE);
3142 }
3143 
3144 
3145 static int
3146 mirror_done(struct buf *cb)
3147 {
3148 	md_mps_t	*ps;
3149 	md_mcs_t	*cs;
3150 
3151 	/*LINTED*/
3152 	cs = (md_mcs_t *)((caddr_t)cb - md_mirror_mcs_buf_off);
3153 	ps = cs->cs_ps;
3154 
3155 	mutex_enter(&ps->ps_mx);
3156 
3157 	/* check if we need to retry an errored failfast I/O */
3158 	if (cb->b_flags & B_ERROR) {
3159 		struct buf *pb = ps->ps_bp;
3160 
3161 		if (cb->b_flags & B_FAILFAST) {
3162 			int		i;
3163 			mm_unit_t	*un = ps->ps_un;
3164 
3165 			for (i = 0; i < NMIRROR; i++) {
3166 				if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
3167 					continue;
3168 
3169 				if (cb->b_edev ==
3170 				    md_dev64_to_dev(un->un_sm[i].sm_dev)) {
3171 
3172 					/*
3173 					 * This is the submirror that had the
3174 					 * error.  Check if it is Last Erred.
3175 					 */
3176 					if (submirror_is_lasterred(un, i)) {
3177 						daemon_queue_t *dqp;
3178 
3179 						mutex_exit(&ps->ps_mx);
3180 						dqp = (daemon_queue_t *)cs;
3181 						dqp->dq_prev = NULL;
3182 						dqp->dq_next = NULL;
3183 						daemon_request(&md_done_daemon,
3184 						    last_err_retry, dqp,
3185 						    REQ_OLD);
3186 						return (1);
3187 					}
3188 					break;
3189 				}
3190 			}
3191 		}
3192 
3193 		/* continue to process the buf without doing a retry */
3194 		ps->ps_flags |= MD_MPS_ERROR;
3195 		pb->b_error = cb->b_error;
3196 	}
3197 
3198 	return (mirror_done_common(cb));
3199 }
3200 
3201 /*
3202  * Split from the original mirror_done function so we can handle bufs after a
3203  * retry.
3204  * ps->ps_mx is already held in the caller of this function and the cb error
3205  * has already been checked and handled in the caller.
3206  */
3207 static int
3208 mirror_done_common(struct buf *cb)
3209 {
3210 	struct buf	*pb;
3211 	mm_unit_t	*un;
3212 	mdi_unit_t	*ui;
3213 	md_mps_t	*ps;
3214 	md_mcs_t	*cs;
3215 	size_t		end_rr, start_rr, current_rr;
3216 
3217 	/*LINTED*/
3218 	cs = (md_mcs_t *)((caddr_t)cb - md_mirror_mcs_buf_off);
3219 	ps = cs->cs_ps;
3220 	pb = ps->ps_bp;
3221 
3222 	if (cb->b_flags & B_REMAPPED)
3223 		bp_mapout(cb);
3224 
3225 	ps->ps_frags--;
3226 	if (ps->ps_frags != 0) {
3227 		mutex_exit(&ps->ps_mx);
3228 		kmem_cache_free(mirror_child_cache, cs);
3229 		return (1);
3230 	}
3231 	un = ps->ps_un;
3232 	ui = ps->ps_ui;
3233 
3234 	/*
3235 	 * Do not update outstanding_writes if we're running with ABR
3236 	 * set for this mirror or the write() was issued with MD_STR_ABR set.
3237 	 * Also a resync initiated write() has no outstanding_writes update
3238 	 * either.
3239 	 */
3240 	if (((cb->b_flags & B_READ) == 0) &&
3241 	    (un->un_nsm >= 2) &&
3242 	    (ps->ps_call == NULL) &&
3243 	    !((ui->ui_tstate & MD_ABR_CAP) || (ps->ps_flags & MD_MPS_ABR)) &&
3244 	    !(ps->ps_flags & MD_MPS_WRITE_AFTER_READ)) {
3245 		BLK_TO_RR(end_rr, ps->ps_lastblk, un);
3246 		BLK_TO_RR(start_rr, ps->ps_firstblk, un);
3247 		mutex_enter(&un->un_resync_mx);
3248 		for (current_rr = start_rr; current_rr <= end_rr; current_rr++)
3249 			un->un_outstanding_writes[current_rr]--;
3250 		mutex_exit(&un->un_resync_mx);
3251 	}
3252 	kmem_cache_free(mirror_child_cache, cs);
3253 	mutex_exit(&ps->ps_mx);
3254 
3255 	if (ps->ps_call != NULL) {
3256 		daemon_request(&md_done_daemon, ps->ps_call,
3257 		    (daemon_queue_t *)ps, REQ_OLD);
3258 		return (1);
3259 	}
3260 
3261 	if ((ps->ps_flags & MD_MPS_ERROR)) {
3262 		daemon_request(&md_done_daemon, mirror_error,
3263 		    (daemon_queue_t *)ps, REQ_OLD);
3264 		return (1);
3265 	}
3266 
3267 	if (ps->ps_flags & MD_MPS_ON_OVERLAP)
3268 		mirror_overlap_tree_remove(ps);
3269 
3270 	/*
3271 	 * Handle Write-on-Write problem.
3272 	 * Skip in the case of Raw and Direct I/O as they are
3273 	 * handled earlier.
3274 	 *
3275 	 */
3276 	if (!(md_mirror_wow_flg & WOW_DISABLE) &&
3277 	    !(pb->b_flags & B_READ) &&
3278 	    !(ps->ps_flags & MD_MPS_WOW) &&
3279 	    !(pb->b_flags & B_PHYS) &&
3280 	    any_pages_dirty(pb)) {
3281 		md_unit_readerexit(ps->ps_ui);
3282 		daemon_request(&md_mstr_daemon, handle_wow,
3283 		    (daemon_queue_t *)ps, REQ_OLD);
3284 		return (1);
3285 	}
3286 
3287 	md_kstat_done(ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
3288 	MPS_FREE(mirror_parent_cache, ps);
3289 	md_unit_readerexit(ui);
3290 	md_biodone(pb);
3291 	return (0);
3292 }
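/*
 * Example of the dirty-region accounting unwound above (region size is
 * hypothetical): if BLK_TO_RR() maps a block to region (block / 1024) then
 * a write spanning ps_firstblk 2000 .. ps_lastblk 5000 gives start_rr == 1
 * and end_rr == 4, so un_outstanding_writes[1] through [4] were incremented
 * when the write was issued and are decremented here as it completes.
 */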
3293 
3294 /*
3295  * Clear error state in submirror component if the retry worked after
3296  * a failfast error.
3297  */
3298 static void
3299 clear_retry_error(struct buf *cb)
3300 {
3301 	int			smi;
3302 	md_mcs_t		*cs;
3303 	mm_unit_t		*un;
3304 	mdi_unit_t		*ui_sm;
3305 	mm_submirror_t		*sm;
3306 	mm_submirror_ic_t	*smic;
3307 	u_longlong_t		cnt;
3308 	md_m_shared_t		*shared;
3309 
3310 	/*LINTED*/
3311 	cs = (md_mcs_t *)((caddr_t)cb - md_mirror_mcs_buf_off);
3312 	un = cs->cs_ps->ps_un;
3313 
3314 	for (smi = 0; smi < NMIRROR; smi++) {
3315 		if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE))
3316 			continue;
3317 
3318 		if (cb->b_edev == md_dev64_to_dev(un->un_sm[smi].sm_dev))
3319 			break;
3320 	}
3321 
3322 	if (smi >= NMIRROR)
3323 		return;
3324 
3325 	sm = &un->un_sm[smi];
3326 	smic = &un->un_smic[smi];
3327 	cnt = cb->b_bcount;
3328 
3329 	ui_sm = MDI_UNIT(getminor(cb->b_edev));
3330 	(void) md_unit_writerlock(ui_sm);
3331 
3332 	shared = (md_m_shared_t *)(*(smic->sm_shared_by_blk))(sm->sm_dev, sm,
3333 	    cb->b_blkno, &cnt);
3334 
3335 	if (shared->ms_flags & MDM_S_IOERR) {
3336 		shared->ms_flags &= ~MDM_S_IOERR;
3337 
3338 	} else {
3339 		/* the buf spans components and the first one is not erred */
3340 		int	cnt;
3341 		int	i;
3342 
3343 		cnt = (*(smic->sm_get_component_count))(sm->sm_dev, un);
3344 		for (i = 0; i < cnt; i++) {
3345 			shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
3346 			    (sm->sm_dev, sm, i);
3347 
3348 			if (shared->ms_flags & MDM_S_IOERR &&
3349 			    shared->ms_state == CS_OKAY) {
3350 
3351 				shared->ms_flags &= ~MDM_S_IOERR;
3352 				break;
3353 			}
3354 		}
3355 	}
3356 
3357 	md_unit_writerexit(ui_sm);
3358 }
3359 
3360 static size_t
3361 mirror_map_read(
3362 	md_mps_t *ps,
3363 	md_mcs_t *cs,
3364 	diskaddr_t blkno,
3365 	u_longlong_t	count
3366 )
3367 {
3368 	mm_unit_t	*un;
3369 	buf_t		*bp;
3370 	u_longlong_t	cando;
3371 
3372 	bp = &cs->cs_buf;
3373 	un = ps->ps_un;
3374 
3375 	bp->b_lblkno = blkno;
3376 	if (fast_select_read_unit(ps, cs) == 0) {
3377 		bp->b_bcount = ldbtob(count);
3378 		return (0);
3379 	}
3380 	bp->b_edev = md_dev64_to_dev(select_read_unit(un, blkno,
3381 	    count, &cando, 0, NULL, cs));
3382 	bp->b_bcount = ldbtob(cando);
3383 	if (count != cando)
3384 		return (cando);
3385 	return (0);
3386 }
3387 
3388 static void
3389 write_after_read(md_mps_t *ps)
3390 {
3391 	struct buf	*pb;
3392 	int		flags;
3393 
3394 	if (ps->ps_flags & MD_MPS_ERROR) {
3395 		mirror_error(ps);
3396 		return;
3397 	}
3398 
3399 	pb = ps->ps_bp;
3400 	md_kstat_done(ps->ps_ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
3401 	ps->ps_call = NULL;
3402 	ps->ps_flags |= MD_MPS_WRITE_AFTER_READ;
3403 	flags = MD_STR_NOTTOP | MD_STR_WAR;
3404 	if (ps->ps_flags & MD_MPS_MAPPED)
3405 		flags |= MD_STR_MAPPED;
3406 	if (ps->ps_flags & MD_MPS_NOBLOCK)
3407 		flags |= MD_NOBLOCK;
3408 	if (ps->ps_flags & MD_MPS_DIRTY_RD)
3409 		flags |= MD_STR_DIRTY_RD;
3410 	(void) mirror_write_strategy(pb, flags, ps);
3411 }
3412 
3413 static void
3414 continue_serial(md_mps_t *ps)
3415 {
3416 	md_mcs_t	*cs;
3417 	buf_t		*cb;
3418 	mm_unit_t	*un;
3419 	int		flags;
3420 
3421 	un = ps->ps_un;
3422 	cs = kmem_cache_alloc(mirror_child_cache, MD_ALLOCFLAGS);
3423 	mirror_child_init(cs);
3424 	cb = &cs->cs_buf;
3425 	ps->ps_call = NULL;
3426 	ps->ps_frags = 1;
3427 	(void) mirror_map_write(un, cs, ps, 0);
3428 	flags = MD_STR_NOTTOP;
3429 	if (ps->ps_flags & MD_MPS_MAPPED)
3430 		flags |= MD_STR_MAPPED;
3431 	md_call_strategy(cb, flags, NULL);
3432 }
3433 
3434 static int
3435 mirror_map_write(mm_unit_t *un, md_mcs_t *cs, md_mps_t *ps, int war)
3436 {
3437 	int i;
3438 	dev_t		dev;	/* needed for bioclone, so not md_dev64_t */
3439 	buf_t		*cb;
3440 	buf_t		*pb;
3441 	diskaddr_t	blkno;
3442 	size_t		bcount;
3443 	off_t		offset;
3444 
3445 	pb = ps->ps_bp;
3446 	cb = &cs->cs_buf;
3447 	cs->cs_ps = ps;
3448 
3449 	i = md_find_nth_unit(ps->ps_writable_sm, ps->ps_current_sm);
3450 
3451 	dev = md_dev64_to_dev(un->un_sm[i].sm_dev);
3452 
3453 	blkno = pb->b_lblkno;
3454 	bcount = pb->b_bcount;
3455 	offset = 0;
3456 	if (war && (blkno == 0) && (un->c.un_flag & MD_LABELED)) {
3457 		blkno = DK_LABEL_LOC + 1;
3458 		/*
3459 		 * This handles the case where we're requesting
3460 		 * a write to block 0 on a label partition
3461 		 * and the request size was smaller than the
3462 		 * size of the label.  If this is the case
3463 		 * then we'll return -1.  Failure to do so will
3464 		 * either cause the calling thread to hang due to
3465 		 * an ssd bug, or worse if the bcount were allowed
3466 		 * to go negative (ie large).
3467 		 */
3468 		if (bcount <= DEV_BSIZE*(DK_LABEL_LOC + 1))
3469 			return (-1);
3470 		bcount -= (DEV_BSIZE*(DK_LABEL_LOC + 1));
3471 		offset = (DEV_BSIZE*(DK_LABEL_LOC + 1));
3472 	}
3473 
3474 	cb = md_bioclone(pb, offset, bcount, dev, blkno, mirror_done,
3475 	    cb, KM_NOSLEEP);
3476 	if (war)
3477 		cb->b_flags = (cb->b_flags & ~B_READ) | B_WRITE;
3478 
3479 	/*
3480 	 * If the submirror is in the erred stated, check if any component is
3481 	 * in the Last Erred state.  If so, we don't want to use the B_FAILFAST
3482 	 * flag on the IO.
3483 	 *
3484 	 * Provide a fast path for the non-erred case (which should be the
3485 	 * normal case).
3486 	 */
3487 	if (un->un_sm[i].sm_flags & MD_SM_FAILFAST) {
3488 		if (un->un_sm[i].sm_state & SMS_COMP_ERRED) {
3489 			mm_submirror_t		*sm;
3490 			mm_submirror_ic_t	*smic;
3491 			int			ci;
3492 			int			compcnt;
3493 
3494 			sm = &un->un_sm[i];
3495 			smic = &un->un_smic[i];
3496 
3497 			compcnt = (*(smic->sm_get_component_count))
3498 			    (sm->sm_dev, un);
3499 			for (ci = 0; ci < compcnt; ci++) {
3500 				md_m_shared_t	*shared;
3501 
3502 				shared = (md_m_shared_t *)
3503 				    (*(smic->sm_shared_by_indx))(sm->sm_dev,
3504 				    sm, ci);
3505 
3506 				if (shared->ms_state == CS_LAST_ERRED)
3507 					break;
3508 			}
3509 			if (ci >= compcnt)
3510 				cb->b_flags |= B_FAILFAST;
3511 
3512 		} else {
3513 			cb->b_flags |= B_FAILFAST;
3514 		}
3515 	}
3516 
3517 	ps->ps_current_sm++;
3518 	if (ps->ps_current_sm != ps->ps_active_cnt) {
3519 		if (un->un_write_option == WR_SERIAL) {
3520 			ps->ps_call = continue_serial;
3521 			return (0);
3522 		}
3523 		return (1);
3524 	}
3525 	return (0);
3526 }
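/*
 * Example of the label-skip logic above (assuming DK_LABEL_LOC is 0, so the
 * protected area is the single 512-byte label block): a write-after-read to
 * block 0 of a labeled mirror with bcount 2048 is re-targeted to blkno 1
 * with bcount 1536 and offset 512; a request of 512 bytes or less would be
 * rejected with -1 since nothing would remain once the label is skipped.
 */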
3527 
3528 /*
3529  * directed_read_done:
3530  * ------------------
3531  * Completion routine called when a DMR request has been returned from the
3532  * underlying driver. Wake-up the original ioctl() and return the data to
3533  * the user.
3534  */
3535 static void
3536 directed_read_done(md_mps_t *ps)
3537 {
3538 	mm_unit_t	*un;
3539 	mdi_unit_t	*ui;
3540 
3541 	un = ps->ps_un;
3542 	ui = ps->ps_ui;
3543 
3544 	md_unit_readerexit(ui);
3545 	md_kstat_done(ui, ps->ps_bp, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
3546 	ps->ps_call = NULL;
3547 
3548 	mutex_enter(&un->un_dmr_mx);
3549 	cv_signal(&un->un_dmr_cv);
3550 	mutex_exit(&un->un_dmr_mx);
3551 
3552 	/* release the parent structure */
3553 	kmem_cache_free(mirror_parent_cache, ps);
3554 }
3555 
3556 /*
3557  * daemon_io:
3558  * ------------
3559  * Called to issue a mirror_write_strategy() or mirror_read_strategy
3560  * call from a blockable context. NOTE: no mutex can be held on entry to this
3561  * routine
3562  */
3563 static void
3564 daemon_io(daemon_queue_t *dq)
3565 {
3566 	md_mps_t	*ps = (md_mps_t *)dq;
3567 	int		flag = MD_STR_NOTTOP;
3568 	buf_t		*pb = ps->ps_bp;
3569 
3570 	if (ps->ps_flags & MD_MPS_MAPPED)
3571 		flag |= MD_STR_MAPPED;
3572 	if (ps->ps_flags & MD_MPS_WOW)
3573 		flag |= MD_STR_WOW;
3574 	if (ps->ps_flags & MD_MPS_WRITE_AFTER_READ)
3575 		flag |= MD_STR_WAR;
3576 	if (ps->ps_flags & MD_MPS_ABR)
3577 		flag |= MD_STR_ABR;
3578 	if (ps->ps_flags & MD_MPS_BLOCKABLE_IO)
3579 		flag |= MD_STR_BLOCK_OK;
3580 
3581 	/*
3582 	 * If this is a resync read, ie MD_STR_DIRTY_RD not set, set
3583 	 * MD_STR_WAR before calling mirror_read_strategy
3584 	 */
3585 	if (pb->b_flags & B_READ) {
3586 		if (!(ps->ps_flags & MD_MPS_DIRTY_RD))
3587 			flag |= MD_STR_WAR;
3588 		mirror_read_strategy(pb, flag, ps);
3589 	} else
3590 		mirror_write_strategy(pb, flag, ps);
3591 }
3592 
3593 /*
3594  * update_resync:
3595  * -------------
3596  * Called to update the in-core version of the resync record with the latest
3597  * version that was committed to disk when the previous mirror owner
3598  * relinquished ownership. This call is likely to block as we must hold-off
3599  * any current resync processing that may be occurring.
3600  * On completion of the resync record update we issue the mirror_write_strategy
3601  * call to complete the i/o that first started this sequence. To remove a race
3602  * condition between a new write() request which is submitted and the resync
3603  * record update we acquire the writerlock. This will hold off all i/o to the
3604  * mirror until the resync update has completed.
3605  * NOTE: no mutex can be held on entry to this routine
3606  */
3607 static void
3608 update_resync(daemon_queue_t *dq)
3609 {
3610 	md_mps_t	*ps = (md_mps_t *)dq;
3611 	buf_t		*pb = ps->ps_bp;
3612 	mdi_unit_t	*ui = ps->ps_ui;
3613 	mm_unit_t	*un = MD_UNIT(ui->ui_link.ln_id);
3614 	set_t		setno;
3615 	int		restart_resync;
3616 
3617 	mutex_enter(&un->un_rrp_inflight_mx);
3618 	(void) md_unit_writerlock(ui);
3619 	ps->ps_un = un;
3620 	setno = MD_MIN2SET(getminor(pb->b_edev));
3621 	if (mddb_reread_rr(setno, un->un_rr_dirty_recid) == 0) {
3622 		/*
3623 		 * Synchronize our in-core view of what regions need to be
3624 		 * resync'd with the on-disk version.
3625 		 */
3626 		mirror_copy_rr(howmany(un->un_rrd_num, NBBY), un->un_resync_bm,
3627 		    un->un_dirty_bm);
3628 
3629 		/* Region dirty map is now up to date */
3630 	}
3631 	restart_resync = (un->un_rs_thread_flags & MD_RI_BLOCK_OWNER) ? 1 : 0;
3632 	md_unit_writerexit(ui);
3633 	mutex_exit(&un->un_rrp_inflight_mx);
3634 
3635 	/* Restart the resync thread if it was previously blocked */
3636 	if (restart_resync) {
3637 		mutex_enter(&un->un_rs_thread_mx);
3638 		un->un_rs_thread_flags &= ~MD_RI_BLOCK_OWNER;
3639 		cv_signal(&un->un_rs_thread_cv);
3640 		mutex_exit(&un->un_rs_thread_mx);
3641 	}
3642 	/* Continue with original deferred i/o */
3643 	daemon_io(dq);
3644 }
3645 
3646 /*
3647  * owner_timeout:
3648  * -------------
3649  * Called if the original mdmn_ksend_message() failed and the request is to be
3650  * retried. Reattempt the original ownership change.
3651  *
3652  * NOTE: called at interrupt context (see timeout(9f)).
3653  */
3654 static void
3655 owner_timeout(void *arg)
3656 {
3657 	daemon_queue_t	*dq = (daemon_queue_t *)arg;
3658 
3659 	daemon_request(&md_mirror_daemon, become_owner, dq, REQ_OLD);
3660 }
3661 
3662 /*
3663  * become_owner:
3664  * ------------
3665  * Called to issue an RPC request to become the owner of the mirror
3666  * associated with this i/o request. We assume that the ownership request
3667  * is synchronous, so if it succeeds we will issue the request via
3668  * mirror_write_strategy().
3669  * If multiple i/o's are outstanding we will be called from the mirror_daemon
3670  * service thread.
3671  * NOTE: no mutex should be held on entry to this routine.
3672  */
3673 static void
3674 become_owner(daemon_queue_t *dq)
3675 {
3676 	md_mps_t	*ps = (md_mps_t *)dq;
3677 	mm_unit_t	*un = ps->ps_un;
3678 	buf_t		*pb = ps->ps_bp;
3679 	set_t		setno;
3680 	md_mn_kresult_t	*kres;
3681 	int		msg_flags = md_mirror_msg_flags;
3682 	md_mps_t	*ps1;
3683 
3684 	ASSERT(dq->dq_next == NULL && dq->dq_prev == NULL);
3685 
3686 	/*
3687 	 * If we're already the mirror owner we do not need to send a message
3688 	 * but can simply process the i/o request immediately.
3689 	 * If we've already sent the request to become owner we requeue the
3690 	 * request as we're waiting for the synchronous ownership message to
3691 	 * be processed.
3692 	 */
3693 	if (MD_MN_MIRROR_OWNER(un)) {
3694 		/*
3695 		 * As the strategy() call will potentially block we need to
3696 		 * punt this to a separate thread and complete this request
3697 		 * as quickly as possible. Note: if we're a read request
3698 		 * this must be a resync; we cannot afford to be queued
3699 		 * behind any intervening i/o requests. In this case we put the
3700 		 * request on the md_mirror_rs_daemon queue.
3701 		 */
3702 		if (pb->b_flags & B_READ) {
3703 			daemon_request(&md_mirror_rs_daemon, daemon_io, dq,
3704 			    REQ_OLD);
3705 		} else {
3706 			daemon_request(&md_mirror_io_daemon, daemon_io, dq,
3707 			    REQ_OLD);
3708 		}
3709 	} else {
3710 		mutex_enter(&un->un_owner_mx);
3711 		if ((un->un_owner_state & MM_MN_OWNER_SENT) == 0) {
3712 			md_mn_req_owner_t	*msg;
3713 			int			rval = 0;
3714 
3715 			/*
3716 			 * Check to see that we haven't exceeded the maximum
3717 			 * retry count. If we have, we fail the i/o as the
3718 			 * comms mechanism has become wedged beyond recovery.
3719 			 */
3720 			if (dq->qlen++ >= MD_OWNER_RETRIES) {
3721 				mutex_exit(&un->un_owner_mx);
3722 				cmn_err(CE_WARN,
3723 				    "md_mirror: Request exhausted ownership "
3724 				    "retry limit of %d attempts", dq->qlen);
3725 				pb->b_error = EIO;
3726 				pb->b_flags |= B_ERROR;
3727 				pb->b_resid = pb->b_bcount;
3728 				kmem_cache_free(mirror_parent_cache, ps);
3729 				md_biodone(pb);
3730 				return;
3731 			}
3732 
3733 			/*
3734 			 * Issue request to change ownership. The call is
3735 			 * synchronous so when it returns we can complete the
3736 			 * i/o (if successful), or enqueue it again so that
3737 			 * the operation will be retried.
3738 			 */
3739 			un->un_owner_state |= MM_MN_OWNER_SENT;
3740 			mutex_exit(&un->un_owner_mx);
3741 
3742 			msg = kmem_zalloc(sizeof (md_mn_req_owner_t), KM_SLEEP);
3743 			setno = MD_MIN2SET(getminor(pb->b_edev));
3744 			msg->mnum = MD_SID(un);
3745 			msg->owner = md_mn_mynode_id;
3746 			msg_flags |= MD_MSGF_NO_LOG;
3747 			/*
3748 			 * If this IO is triggered by updating a watermark,
3749 			 * it might be issued by the creation of a softpartition
3750 			 * while the commd subsystem is suspended.
3751 			 * We don't want this message to block.
3752 			 */
3753 			if (ps->ps_flags & MD_MPS_WMUPDATE) {
3754 				msg_flags |= MD_MSGF_OVERRIDE_SUSPEND;
3755 			}
3756 
3757 			kres = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
3758 			rval = mdmn_ksend_message(setno,
3759 			    MD_MN_MSG_REQUIRE_OWNER, msg_flags, 0,
3760 			    (char *)msg, sizeof (md_mn_req_owner_t), kres);
3761 
3762 			kmem_free(msg, sizeof (md_mn_req_owner_t));
3763 
3764 			if (MDMN_KSEND_MSG_OK(rval, kres)) {
3765 				dq->qlen = 0;
3766 				/*
3767 				 * Successfully changed owner, reread the
3768 				 * resync record so that we have a valid idea of
3769 				 * any previously committed incomplete write()s.
3770 				 * NOTE: As we need to acquire the resync mutex
3771 				 * this may block, so we defer it to a separate
3772 				 * thread handler. This makes us (effectively)
3773 				 * non-blocking once the ownership message
3774 				 * handling has completed.
3775 				 */
3776 				mutex_enter(&un->un_owner_mx);
3777 				if (un->un_owner_state & MM_MN_BECOME_OWNER) {
3778 					un->un_mirror_owner = md_mn_mynode_id;
3779 					/* Sets owner of un_rr_dirty record */
3780 					if (un->un_rr_dirty_recid)
3781 						(void) mddb_setowner(
3782 						    un->un_rr_dirty_recid,
3783 						    md_mn_mynode_id);
3784 					un->un_owner_state &=
3785 					    ~MM_MN_BECOME_OWNER;
3786 					/*
3787 					 * Release the block on the current
3788 					 * resync region if it is blocked
3789 					 */
3790 					ps1 = un->un_rs_prev_overlap;
3791 					if ((ps1 != NULL) &&
3792 					    (ps1->ps_flags & MD_MPS_ON_OVERLAP))
3793 						mirror_overlap_tree_remove(ps1);
3794 					mutex_exit(&un->un_owner_mx);
3795 
3796 					/*
3797 					 * If we're a read, this must be a
3798 					 * resync request, issue
3799 					 * the i/o request on the
3800 					 * md_mirror_rs_daemon queue. This is
3801 					 * to avoid a deadlock between the
3802 					 * resync_unit thread and
3803 					 * subsequent i/o requests that may
3804 					 * block on the resync region.
3805 					 */
3806 					if (pb->b_flags & B_READ) {
3807 						daemon_request(
3808 						    &md_mirror_rs_daemon,
3809 						    update_resync, dq, REQ_OLD);
3810 					} else {
3811 						daemon_request(
3812 						    &md_mirror_io_daemon,
3813 						    update_resync, dq, REQ_OLD);
3814 					}
3815 					kmem_free(kres,
3816 					    sizeof (md_mn_kresult_t));
3817 					return;
3818 				} else {
3819 					/*
3820 					 * Some other node has beaten us to
3821 					 * obtain ownership. We need to
3822 					 * reschedule our ownership request
3823 					 */
3824 					mutex_exit(&un->un_owner_mx);
3825 				}
3826 			} else {
3827 				mdmn_ksend_show_error(rval, kres,
3828 				    "MD_MN_MSG_REQUIRE_OWNER");
3829 				/*
3830 				 * Message transport failure is handled by the
3831 				 * comms layer. If the ownership change request
3832 				 * does not succeed we need to flag the error to
3833 				 * the initiator of the i/o. This is handled by
3834 				 * the retry logic above. As the request failed
3835 				 * we do not know _who_ the owner of the mirror
3836 				 * currently is. We reset our idea of the owner
3837 				 * to None so that any further write()s will
3838 				 * attempt to become the owner again. This stops
3839 				 * multiple nodes writing to the same mirror
3840 				 * simultaneously.
3841 				 */
3842 				mutex_enter(&un->un_owner_mx);
3843 				un->un_owner_state &=
3844 				    ~(MM_MN_OWNER_SENT|MM_MN_BECOME_OWNER);
3845 				un->un_mirror_owner = MD_MN_MIRROR_UNOWNED;
3846 				mutex_exit(&un->un_owner_mx);
3847 			}
3848 			kmem_free(kres, sizeof (md_mn_kresult_t));
3849 		} else
3850 			mutex_exit(&un->un_owner_mx);
3851 
3852 		/*
3853 		 * Re-enqueue this request on the deferred i/o list. Delay the
3854 		 * request for md_mirror_owner_to usecs to stop thrashing.
3855 		 */
3856 		(void) timeout(owner_timeout, dq,
3857 		    drv_usectohz(md_mirror_owner_to));
3858 	}
3859 }
3860 
3861 static void
3862 mirror_write_strategy(buf_t *pb, int flag, void *private)
3863 {
3864 	md_mps_t	*ps;
3865 	md_mcs_t	*cs;
3866 	int		more;
3867 	mm_unit_t	*un;
3868 	mdi_unit_t	*ui;
3869 	buf_t		*cb;		/* child buf pointer */
3870 	set_t		setno;
3871 	int		rs_on_overlap = 0;
3872 
3873 	ui = MDI_UNIT(getminor(pb->b_edev));
3874 	un = (mm_unit_t *)MD_UNIT(getminor(pb->b_edev));
3875 
3876 
3877 	md_kstat_waitq_enter(ui);
3878 
3879 	/*
3880 	 * If a state change is in progress for this mirror in a MN set,
3881 	 * suspend all non-resync writes until the state change is complete.
3882 	 * The objective of this suspend is to ensure that it is not
3883 	 * possible for one node to read data from a submirror that another node
3884 	 * has not written to because of the state change. Therefore we
3885 	 * suspend all writes until the state change has been made. As it is
3886 	 * not possible to read from the target of a resync, there is no need
3887 	 * to suspend resync writes.
3888 	 * Note that we only block here if the caller can handle a busy-wait.
3889 	 * The MD_STR_BLOCK_OK flag is set for daemon_io originated i/o only.
3890 	 */
3891 
3892 	if (!(flag & MD_STR_WAR)) {
3893 		if (flag & MD_STR_BLOCK_OK) {
3894 			mutex_enter(&un->un_suspend_wr_mx);
3895 			while (un->un_suspend_wr_flag) {
3896 				cv_wait(&un->un_suspend_wr_cv,
3897 				    &un->un_suspend_wr_mx);
3898 			}
3899 			mutex_exit(&un->un_suspend_wr_mx);
3900 		}
3901 		(void) md_unit_readerlock(ui);
3902 	}
3903 
3904 	if (!(flag & MD_STR_NOTTOP)) {
3905 		if (md_checkbuf(ui, (md_unit_t *)un, pb)) {
3906 			md_kstat_waitq_exit(ui);
3907 			return;
3908 		}
3909 	}
3910 
3911 	setno = MD_MIN2SET(getminor(pb->b_edev));
3912 
3913 	/* If an ABR write has been requested, set MD_STR_ABR flag */
3914 	if (MD_MNSET_SETNO(setno) && (pb->b_flags & B_ABRWRITE))
3915 		flag |= MD_STR_ABR;
3916 
3917 	if (private == NULL) {
3918 		ps = kmem_cache_alloc(mirror_parent_cache, MD_ALLOCFLAGS);
3919 		mirror_parent_init(ps);
3920 	} else {
3921 		ps = private;
3922 		private = NULL;
3923 	}
3924 	if (flag & MD_STR_MAPPED)
3925 		ps->ps_flags |= MD_MPS_MAPPED;
3926 
3927 	if (flag & MD_STR_WOW)
3928 		ps->ps_flags |= MD_MPS_WOW;
3929 
3930 	if (flag & MD_STR_ABR)
3931 		ps->ps_flags |= MD_MPS_ABR;
3932 
3933 	if (flag & MD_STR_WMUPDATE)
3934 		ps->ps_flags |= MD_MPS_WMUPDATE;
3935 
3936 	/*
3937 	 * Save essential information from the original buffhdr
3938 	 * in the md_save structure.
3939 	 */
3940 	ps->ps_un = un;
3941 	ps->ps_ui = ui;
3942 	ps->ps_bp = pb;
3943 	ps->ps_addr = pb->b_un.b_addr;
3944 	ps->ps_firstblk = pb->b_lblkno;
3945 	ps->ps_lastblk = pb->b_lblkno + lbtodb(pb->b_bcount) - 1;
3946 	ps->ps_changecnt = un->un_changecnt;
3947 
3948 	/*
3949 	 * Check for suspended writes here. This is where we can defer the
3950 	 * write request to the daemon_io queue which will then call us with
3951 	 * the MD_STR_BLOCK_OK flag set and we'll busy-wait (if necessary) at
3952 	 * the top of this routine.
3953 	 */
3954 	if (!(flag & MD_STR_WAR) && !(flag & MD_STR_BLOCK_OK)) {
3955 		mutex_enter(&un->un_suspend_wr_mx);
3956 		if (un->un_suspend_wr_flag) {
3957 			ps->ps_flags |= MD_MPS_BLOCKABLE_IO;
3958 			mutex_exit(&un->un_suspend_wr_mx);
3959 			md_unit_readerexit(ui);
3960 			daemon_request(&md_mirror_daemon, daemon_io,
3961 			    (daemon_queue_t *)ps, REQ_OLD);
3962 			return;
3963 		}
3964 		mutex_exit(&un->un_suspend_wr_mx);
3965 	}
3966 
3967 	/*
3968 	 * If not MN owner and this is an ABR write, make sure the current
3969 	 * resync region is in the overlaps tree
3970 	 */
3971 	mutex_enter(&un->un_owner_mx);
3972 	if (MD_MNSET_SETNO(setno) && (!(MD_MN_MIRROR_OWNER(un))) &&
3973 	    ((ui->ui_tstate & MD_ABR_CAP) || (flag & MD_STR_ABR))) {
3974 		md_mps_t	*ps1;
3975 		/* Block the current resync region, if not already blocked */
3976 		ps1 = un->un_rs_prev_overlap;
3977 
3978 		if ((ps1 != NULL) && ((ps1->ps_firstblk != 0) ||
3979 		    (ps1->ps_lastblk != 0))) {
3980 			/* Drop locks to avoid deadlock */
3981 			mutex_exit(&un->un_owner_mx);
3982 			md_unit_readerexit(ui);
3983 			wait_for_overlaps(ps1, MD_OVERLAP_ALLOW_REPEAT);
3984 			rs_on_overlap = 1;
3985 			(void) md_unit_readerlock(ui);
3986 			mutex_enter(&un->un_owner_mx);
3987 			/*
3988 			 * Check to see if we have obtained ownership
3989 			 * while waiting for overlaps. If we have, remove
3990 			 * the resync_region entry from the overlap tree
3991 			 */
3992 			if (MD_MN_MIRROR_OWNER(un) &&
3993 			    (ps1->ps_flags & MD_MPS_ON_OVERLAP)) {
3994 				mirror_overlap_tree_remove(ps1);
3995 				rs_on_overlap = 0;
3996 			}
3997 		}
3998 	}
3999 	mutex_exit(&un->un_owner_mx);
4000 
4001 
4002 	/*
4003 	 * The following keeps a write-after-read from writing to the
4004 	 * source in the case where it all came from one place.
4005 	 */
4006 	if (flag & MD_STR_WAR) {
4007 		int	abort_write = 0;
4008 		/*
4009 		 * We are performing a write-after-read. This is either as a
4010 		 * result of a resync read or as a result of a read in a
4011 		 * dirty resync region when the optimized resync is not
4012 		 * complete. If in a MN set and this is a resync-generated i/o,
4013 		 * terminate the write if the current block is not in the
4014 		 * current resync region, as another node must have already
4015 		 * completed this resync region.
4016 		 */
4017 		if ((MD_MNSET_SETNO(MD_UN2SET(un))) &&
4018 		    (!(flag & MD_STR_DIRTY_RD))) {
4019 			if (!IN_RESYNC_REGION(un, ps))
4020 				abort_write = 1;
4021 		}
4022 		if ((select_write_after_read_units(un, ps) == 0) ||
4023 		    (abort_write)) {
4024 #ifdef DEBUG
4025 			if (mirror_debug_flag)
4026 				printf("Abort resync write on %x, block %lld\n",
4027 				    MD_SID(un), ps->ps_firstblk);
4028 #endif
4029 			if (ps->ps_flags & MD_MPS_ON_OVERLAP)
4030 				mirror_overlap_tree_remove(ps);
4031 			kmem_cache_free(mirror_parent_cache, ps);
4032 			md_kstat_waitq_exit(ui);
4033 			md_unit_readerexit(ui);
4034 			md_biodone(pb);
4035 			return;
4036 		}
4037 	} else {
4038 		select_write_units(un, ps);
4039 
4040 		/* Drop readerlock to avoid deadlock */
4041 		md_unit_readerexit(ui);
4042 		wait_for_overlaps(ps, MD_OVERLAP_NO_REPEAT);
4043 		un = md_unit_readerlock(ui);
4044 		/*
4045 		 * For a MN set with an ABR write, if we are now the
4046 		 * owner and we have a resync region in the overlap
4047 		 * tree, remove the entry from overlaps and retry the write.
4048 		 */
4049 
4050 		if (MD_MNSET_SETNO(setno) &&
4051 		    ((ui->ui_tstate & MD_ABR_CAP) || (flag & MD_STR_ABR))) {
4052 			mutex_enter(&un->un_owner_mx);
4053 			if (((MD_MN_MIRROR_OWNER(un))) && rs_on_overlap) {
4054 				mirror_overlap_tree_remove(ps);
4055 				md_kstat_waitq_exit(ui);
4056 				mutex_exit(&un->un_owner_mx);
4057 				md_unit_readerexit(ui);
4058 				daemon_request(&md_mirror_daemon, daemon_io,
4059 				    (daemon_queue_t *)ps, REQ_OLD);
4060 				return;
4061 			}
4062 			mutex_exit(&un->un_owner_mx);
4063 		}
4064 	}
4065 
4066 	/*
4067 	 * For Multinode mirrors with no owner and a Resync Region (not ABR)
4068 	 * we need to become the mirror owner before continuing with the
4069 	 * write(). For ABR mirrors we check that we 'own' the resync if
4070 	 * we're in write-after-read mode. We do this _after_ ensuring that
4071 	 * there are no overlaps to ensure that once we know that we are
4072 	 * the owner, the readerlock will not be released until the write is
4073 	 * complete. As a change of ownership in a MN set requires the
4074 	 * writerlock, this ensures that ownership cannot be changed until
4075 	 * the write is complete.
4076 	 */
4077 	if (MD_MNSET_SETNO(setno) && (!((ui->ui_tstate & MD_ABR_CAP) ||
4078 	    (flag & MD_STR_ABR)) || (flag & MD_STR_WAR))) {
4079 		if (MD_MN_NO_MIRROR_OWNER(un))  {
4080 			if (ps->ps_flags & MD_MPS_ON_OVERLAP)
4081 				mirror_overlap_tree_remove(ps);
4082 			md_kstat_waitq_exit(ui);
4083 			ASSERT(!(flag & MD_STR_WAR));
4084 			md_unit_readerexit(ui);
4085 			daemon_request(&md_mirror_daemon, become_owner,
4086 			    (daemon_queue_t *)ps, REQ_OLD);
4087 			return;
4088 		}
4089 	}
4090 
4091 	/*
4092 	 * Mark resync region if mirror has a Resync Region _and_ we are not
4093 	 * a resync initiated write(). Don't mark region if we're flagged as
4094 	 * an ABR write.
4095 	 */
4096 	if (!((ui->ui_tstate & MD_ABR_CAP) || (flag & MD_STR_ABR)) &&
4097 	    !(flag & MD_STR_WAR)) {
4098 		if (mirror_mark_resync_region(un, ps->ps_firstblk,
4099 		    ps->ps_lastblk, md_mn_mynode_id)) {
4100 			pb->b_flags |= B_ERROR;
4101 			pb->b_resid = pb->b_bcount;
4102 			if (ps->ps_flags & MD_MPS_ON_OVERLAP)
4103 				mirror_overlap_tree_remove(ps);
4104 			kmem_cache_free(mirror_parent_cache, ps);
4105 			md_kstat_waitq_exit(ui);
4106 			md_unit_readerexit(ui);
4107 			md_biodone(pb);
4108 			return;
4109 		}
4110 	}
4111 
4112 	ps->ps_childbflags = pb->b_flags | B_WRITE;
4113 	ps->ps_childbflags &= ~B_READ;
4114 	if (flag & MD_STR_MAPPED)
4115 		ps->ps_childbflags &= ~B_PAGEIO;
4116 
4117 	if (!(flag & MD_STR_NOTTOP) && panicstr)
4118 		/* Disable WOW and don't free ps */
4119 		ps->ps_flags |= (MD_MPS_WOW|MD_MPS_DONTFREE);
4120 
4121 	md_kstat_waitq_to_runq(ui);
4122 
4123 	/*
4124 	 * Treat Raw and Direct I/O as Write-on-Write always
4125 	 */
4126 
4127 	if (!(md_mirror_wow_flg & WOW_DISABLE) &&
4128 	    (md_mirror_wow_flg & WOW_PHYS_ENABLE) &&
4129 	    (pb->b_flags & B_PHYS) &&
4130 	    !(ps->ps_flags & MD_MPS_WOW)) {
4131 		if (ps->ps_flags & MD_MPS_ON_OVERLAP)
4132 			mirror_overlap_tree_remove(ps);
4133 		md_unit_readerexit(ui);
4134 		daemon_request(&md_mstr_daemon, handle_wow,
4135 		    (daemon_queue_t *)ps, REQ_OLD);
4136 		return;
4137 	}
4138 
4139 	ps->ps_frags = 1;
4140 	do {
4141 		cs = kmem_cache_alloc(mirror_child_cache, MD_ALLOCFLAGS);
4142 		mirror_child_init(cs);
4143 		cb = &cs->cs_buf;
4144 		more = mirror_map_write(un, cs, ps, (flag & MD_STR_WAR));
4145 
4146 		/*
4147 		 * This handles the case where we're requesting
4148 		 * a write to block 0 on a label partition.  (more < 0)
4149 		 * means that the request size was smaller than the
4150 		 * size of the label.  If so this request is done.
4151 		 */
4152 		if (more < 0) {
4153 			if (ps->ps_flags & MD_MPS_ON_OVERLAP)
4154 				mirror_overlap_tree_remove(ps);
4155 			md_kstat_runq_exit(ui);
4156 			kmem_cache_free(mirror_child_cache, cs);
4157 			kmem_cache_free(mirror_parent_cache, ps);
4158 			md_unit_readerexit(ui);
4159 			md_biodone(pb);
4160 			return;
4161 		}
4162 		if (more) {
4163 			mutex_enter(&ps->ps_mx);
4164 			ps->ps_frags++;
4165 			mutex_exit(&ps->ps_mx);
4166 		}
4167 		md_call_strategy(cb, flag, private);
4168 	} while (more);
4169 
4170 	if (!(flag & MD_STR_NOTTOP) && panicstr) {
4171 		while (!(ps->ps_flags & MD_MPS_DONE)) {
4172 			md_daemon(1, &md_done_daemon);
4173 			drv_usecwait(10);
4174 		}
4175 		kmem_cache_free(mirror_parent_cache, ps);
4176 	}
4177 }
4178 
4179 static void
4180 mirror_read_strategy(buf_t *pb, int flag, void *private)
4181 {
4182 	md_mps_t	*ps;
4183 	md_mcs_t	*cs;
4184 	size_t		more;
4185 	mm_unit_t	*un;
4186 	mdi_unit_t	*ui;
4187 	size_t		current_count;
4188 	diskaddr_t	current_blkno;
4189 	off_t		current_offset;
4190 	buf_t		*cb;		/* child buf pointer */
4191 	set_t		setno;
4192 
4193 	ui = MDI_UNIT(getminor(pb->b_edev));
4194 
4195 	md_kstat_waitq_enter(ui);
4196 
4197 	un = (mm_unit_t *)md_unit_readerlock(ui);
4198 
4199 	if (!(flag & MD_STR_NOTTOP)) {
4200 		if (md_checkbuf(ui, (md_unit_t *)un, pb)) {
4201 			md_kstat_waitq_exit(ui);
4202 			return;
4203 		}
4204 	}
4205 
4206 	if (private == NULL) {
4207 		ps = kmem_cache_alloc(mirror_parent_cache, MD_ALLOCFLAGS);
4208 		mirror_parent_init(ps);
4209 	} else {
4210 		ps = private;
4211 		private = NULL;
4212 	}
4213 
4214 	if (flag & MD_STR_MAPPED)
4215 		ps->ps_flags |= MD_MPS_MAPPED;
4216 	if (flag & MD_NOBLOCK)
4217 		ps->ps_flags |= MD_MPS_NOBLOCK;
4218 	if (flag & MD_STR_WMUPDATE)
4219 		ps->ps_flags |= MD_MPS_WMUPDATE;
4220 
4221 	/*
4222 	 * Check to see if this is a DMR driven read. If so we need to use the
4223 	 * specified side (in un->un_dmr_last_read) for the source of the data.
4224 	 */
4225 	if (flag & MD_STR_DMR)
4226 		ps->ps_flags |= MD_MPS_DMR;
4227 
4228 	/*
4229 	 * Save essential information from the original buffhdr
4230 	 * in the md_save structure.
4231 	 */
4232 	ps->ps_un = un;
4233 	ps->ps_ui = ui;
4234 	ps->ps_bp = pb;
4235 	ps->ps_addr = pb->b_un.b_addr;
4236 	ps->ps_firstblk = pb->b_lblkno;
4237 	ps->ps_lastblk = pb->b_lblkno + lbtodb(pb->b_bcount) - 1;
4238 	ps->ps_changecnt = un->un_changecnt;
4239 
4240 	current_count = btodb(pb->b_bcount);
4241 	current_blkno = pb->b_lblkno;
4242 	current_offset = 0;
4243 
4244 	/*
4245 	 * If flag has MD_STR_WAR set this means that the read is issued by a
4246 	 * resync thread, which may or may not be performing an optimised resync.
4247 	 *
4248 	 * If MD_UN_OPT_NOT_DONE is set this means that the optimized resync
4249 	 * code has not completed; either a resync has not started since snarf,
4250 	 * or there is an optimized resync in progress.
4251 	 *
4252 	 * We need to generate a write after this read in the following two
4253 	 * cases,
4254 	 *
4255 	 * 1. Any Resync-Generated read
4256 	 *
4257 	 * 2. Any read to a DIRTY REGION if there is an optimized resync
4258 	 *    pending or in progress.
4259 	 *
4260 	 * The write after read is done in these cases to ensure that all sides
4261 	 * of the mirror are in sync with the read data and that it is not
4262 	 * possible for an application to read the same block multiple times
4263 	 * and get different data.
4264 	 *
4265 	 * This would be possible if the block was in a dirty region.
4266 	 *
4267 	 * If we're performing a directed read we don't write the data out as
4268 	 * the application is responsible for restoring the mirror to a known
4269 	 * state.
4270 	 */
4271 	if (((MD_STATUS(un) & MD_UN_OPT_NOT_DONE) || (flag & MD_STR_WAR)) &&
4272 	    !(flag & MD_STR_DMR)) {
4273 		size_t	start_rr, i, end_rr;
4274 		int	region_dirty = 1;
4275 
4276 		/*
4277 		 * We enter here under three circumstances,
4278 		 *
4279 		 * MD_UN_OPT_NOT_DONE	MD_STR_WAR
4280 		 * 0			1
4281 		 * 1			0
4282 		 * 1			1
4283 		 *
4284 		 * To be optimal we only care to explicitly check for dirty
4285 		 * regions in the second case since if MD_STR_WAR is set we
4286 		 * always do the write after read.
4287 		 */
4288 		if (!(flag & MD_STR_WAR)) {
4289 			BLK_TO_RR(end_rr, ps->ps_lastblk, un);
4290 			BLK_TO_RR(start_rr, ps->ps_firstblk, un);
4291 
4292 			for (i = start_rr; i <= end_rr; i++)
4293 				if ((region_dirty = IS_KEEPDIRTY(i, un)) != 0)
4294 					break;
4295 		}
4296 
4297 		if ((region_dirty) &&
4298 		    !(md_get_setstatus(MD_UN2SET(un)) & MD_SET_STALE)) {
4299 			ps->ps_call = write_after_read;
4300 			/*
4301 			 * Mark this as a RESYNC_READ in ps_flags.
4302 			 * This is used if the read fails during a
4303 			 * resync of a 3-way mirror to ensure that
4304 			 * the retried read to the remaining
4305 			 * good submirror has MD_STR_WAR set. This
4306 			 * is needed to ensure that the resync write
4307 			 * (write-after-read) takes place.
4308 			 */
4309 			ps->ps_flags |= MD_MPS_RESYNC_READ;
4310 
4311 			/*
4312 			 * If MD_STR_FLAG_ERR is set in the flags we
4313 			 * set MD_MPS_FLAG_ERROR so that an error on the resync
4314 			 * write (issued by write_after_read) will be flagged
4315 			 * to the biowait'ing resync thread. This allows us to
4316 			 * avoid issuing further resync requests to a device
4317 			 * that has had a write failure.
4318 			 */
4319 			if (flag & MD_STR_FLAG_ERR)
4320 				ps->ps_flags |= MD_MPS_FLAG_ERROR;
4321 
4322 			setno = MD_UN2SET(un);
4323 			/*
4324 			 * Drop the readerlock to avoid
4325 			 * deadlock
4326 			 */
4327 			md_unit_readerexit(ui);
4328 			wait_for_overlaps(ps, MD_OVERLAP_NO_REPEAT);
4329 			un = md_unit_readerlock(ui);
4330 			/*
4331 			 * Ensure that we are owner
4332 			 */
4333 			if (MD_MNSET_SETNO(setno)) {
4334 				/*
4335 				 * For a non-resync read that requires a
4336 				 * write-after-read to be done, set a flag
4337 				 * in the parent structure, so that the
4338 				 * write_strategy routine can omit the
4339 				 * test that the write is still within the
4340 				 * resync region
4341 				 */
4342 				if (!(flag & MD_STR_WAR))
4343 					ps->ps_flags |= MD_MPS_DIRTY_RD;
4344 
4345 				/*
4346 				 * Before reading the buffer, see if
4347 				 * there is an owner.
4348 				 */
4349 				if (MD_MN_NO_MIRROR_OWNER(un))  {
4350 					ps->ps_call = NULL;
4351 					mirror_overlap_tree_remove(ps);
4352 					md_kstat_waitq_exit(ui);
4353 					md_unit_readerexit(ui);
4354 					daemon_request(
4355 					    &md_mirror_daemon,
4356 					    become_owner,
4357 					    (daemon_queue_t *)ps,
4358 					    REQ_OLD);
4359 					return;
4360 				}
4361 				/*
4362 				 * For a resync read, check to see if I/O is
4363 				 * outside of the current resync region, or
4364 				 * the resync has finished. If so
4365 				 * just terminate the I/O
4366 				 */
4367 				if ((flag & MD_STR_WAR) &&
4368 				    (!(un->c.un_status & MD_UN_WAR) ||
4369 				    (!IN_RESYNC_REGION(un, ps)))) {
4370 #ifdef DEBUG
4371 					if (mirror_debug_flag)
4372 						printf("Abort resync read "
4373 						    "%x: %lld\n",
4374 						    MD_SID(un),
4375 						    ps->ps_firstblk);
4376 #endif
4377 					mirror_overlap_tree_remove(ps);
4378 					kmem_cache_free(mirror_parent_cache,
4379 					    ps);
4380 					md_kstat_waitq_exit(ui);
4381 					md_unit_readerexit(ui);
4382 					md_biodone(pb);
4383 					return;
4384 				}
4385 			}
4386 		}
4387 	}
4388 
4389 	if (flag & MD_STR_DMR) {
4390 		ps->ps_call = directed_read_done;
4391 	}
4392 
4393 	if (!(flag & MD_STR_NOTTOP) && panicstr)
4394 		ps->ps_flags |= MD_MPS_DONTFREE;
4395 
4396 	md_kstat_waitq_to_runq(ui);
4397 
4398 	ps->ps_frags++;
4399 	do {
4400 		cs = kmem_cache_alloc(mirror_child_cache, MD_ALLOCFLAGS);
4401 		mirror_child_init(cs);
4402 		cb = &cs->cs_buf;
4403 		cs->cs_ps = ps;
4404 
4405 		cb = md_bioclone(pb, current_offset, current_count, NODEV,
4406 		    current_blkno, mirror_done, cb, KM_NOSLEEP);
4407 
4408 		more = mirror_map_read(ps, cs, current_blkno,
4409 		    (u_longlong_t)current_count);
4410 		if (more) {
4411 			mutex_enter(&ps->ps_mx);
4412 			ps->ps_frags++;
4413 			mutex_exit(&ps->ps_mx);
4414 		}
4415 
4416 		/*
4417 		 * Do these calculations now,
4418 		 *  so that we pick up a valid b_bcount from the child bp.
4419 		 */
4420 		current_count -= more;
4421 		current_offset += cb->b_bcount;
4422 		current_blkno +=  more;
4423 		md_call_strategy(cb, flag, private);
4424 	} while (more);
4425 
4426 	if (!(flag & MD_STR_NOTTOP) && panicstr) {
4427 		while (!(ps->ps_flags & MD_MPS_DONE)) {
4428 			md_daemon(1, &md_done_daemon);
4429 			drv_usecwait(10);
4430 		}
4431 		kmem_cache_free(mirror_parent_cache, ps);
4432 	}
4433 }
4434 
4435 void
4436 md_mirror_strategy(buf_t *bp, int flag, void *private)
4437 {
4438 	set_t	setno = MD_MIN2SET(getminor(bp->b_edev));
4439 
4440 	/*
4441 	 * When doing IO to a multi owner meta device, check if set is halted.
4442 	 * We do this check without the needed lock held, for performance
4443 	 * reasons.
4444 	 * If an IO just slips through while the set is locked via an
4445 	 * MD_MN_SUSPEND_SET, we don't care about it.
4446 	 * Only check for suspension if we are a top-level i/o request
4447 	 * (MD_STR_NOTTOP is cleared in 'flag').
4448 	 */
4449 	if ((md_set[setno].s_status & (MD_SET_HALTED | MD_SET_MNSET)) ==
4450 	    (MD_SET_HALTED | MD_SET_MNSET)) {
4451 		if ((flag & MD_STR_NOTTOP) == 0) {
4452 			mutex_enter(&md_mx);
4453 			/* Here we loop until the set is no longer halted */
4454 			while (md_set[setno].s_status & MD_SET_HALTED) {
4455 				cv_wait(&md_cv, &md_mx);
4456 			}
4457 			mutex_exit(&md_mx);
4458 		}
4459 	}
4460 
4461 	if ((flag & MD_IO_COUNTED) == 0) {
4462 		if ((flag & MD_NOBLOCK) == 0) {
4463 			if (md_inc_iocount(setno) != 0) {
4464 				bp->b_flags |= B_ERROR;
4465 				bp->b_error = ENXIO;
4466 				bp->b_resid = bp->b_bcount;
4467 				biodone(bp);
4468 				return;
4469 			}
4470 		} else {
4471 			md_inc_iocount_noblock(setno);
4472 		}
4473 	}
4474 
4475 	if (bp->b_flags & B_READ)
4476 		mirror_read_strategy(bp, flag, private);
4477 	else
4478 		mirror_write_strategy(bp, flag, private);
4479 }
4480 
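/*
 * Illustrative sketch only, not part of the original driver: how a caller
 * might hand a buf to md_mirror_strategy(). A top-level request passes a
 * zero flag so that the halted-set check and the i/o accounting above are
 * performed; a layered request that has already been counted would pass
 * MD_STR_NOTTOP | MD_IO_COUNTED instead. The helper name is hypothetical.
 */
#if 0	/* example only */
static void
example_issue_mirror_io(buf_t *bp)
{
	/* Top-level i/o: md_mirror_strategy() performs all the accounting */
	md_mirror_strategy(bp, 0, NULL);
}
#endif
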
4481 /*
4482  * mirror_directed_read:
4483  * --------------------
4484  * Entry-point for the DKIOCDMR ioctl. We issue a read to a specified sub-mirror
4485  * so that the application can determine what (if any) resync needs to be
4486  * performed. The data is copied out to the user-supplied buffer.
4487  *
4488  * Parameters:
4489  *	mdev	- dev_t for the mirror device
4490  *	vdr	- directed read parameters specifying location and submirror
4491  *		  to perform the read from
4492  *	mode	- used to ddi_copyout() any resulting data from the read
4493  *
4494  * Returns:
4495  *	0	success
4496  *	!0	error code
4497  *		EINVAL - invalid request format
4498  */
4499 int
4500 mirror_directed_read(dev_t mdev, vol_directed_rd_t *vdr, int mode)
4501 {
4502 	buf_t		*bp;
4503 	minor_t		mnum = getminor(mdev);
4504 	mdi_unit_t	*ui = MDI_UNIT(mnum);
4505 	mm_unit_t	*un;
4506 	mm_submirror_t	*sm;
4507 	char		*sm_nm;
4508 	uint_t		next_side;
4509 	void		*kbuffer;
4510 
4511 	if (ui == NULL)
4512 		return (ENXIO);
4513 
4514 	if (!(vdr->vdr_flags & DKV_DMR_NEXT_SIDE)) {
4515 		return (EINVAL);
4516 	}
4517 
4518 	/* Check for aligned block access. We disallow non-aligned requests. */
4519 	if (vdr->vdr_offset % DEV_BSIZE) {
4520 		return (EINVAL);
4521 	}
4522 
4523 	/*
4524 	 * Allocate kernel buffer for target of read(). If we had a reliable
4525 	 * (sorry functional) DDI this wouldn't be needed.
4526 	 */
4527 	kbuffer = kmem_alloc(vdr->vdr_nbytes, KM_NOSLEEP);
4528 	if (kbuffer == NULL) {
4529 		cmn_err(CE_WARN, "mirror_directed_read: couldn't allocate %lx"
4530 		    " bytes\n", vdr->vdr_nbytes);
4531 		return (ENOMEM);
4532 	}
4533 
4534 	bp = getrbuf(KM_SLEEP);
4535 
4536 	bp->b_un.b_addr = kbuffer;
4537 	bp->b_flags = B_READ;
4538 	bp->b_bcount = vdr->vdr_nbytes;
4539 	bp->b_lblkno = lbtodb(vdr->vdr_offset);
4540 	bp->b_edev = mdev;
4541 
4542 	un = md_unit_readerlock(ui);
4543 
4544 	/*
4545 	 * If DKV_SIDE_INIT is set we need to determine the first available
4546 	 * side to start reading from. If it isn't set we increment to the
4547 	 * next readable submirror.
4548 	 * If there are no readable submirrors we error out with DKV_DMR_ERROR.
4549 	 * Note: we check for a readable submirror on completion of the i/o so
4550 	 * we should _always_ have one available. If this becomes unavailable
4551 	 * we have missed the 'DKV_DMR_DONE' opportunity. This could happen if
4552 	 * a metadetach is made between the completion of one DKIOCDMR ioctl
4553 	 * and the start of the next (i.e. a sys-admin 'accident' occurred).
4554 	 * The chance of this is small, but not non-existent.
4555 	 */
4556 	if (vdr->vdr_side == DKV_SIDE_INIT) {
4557 		next_side = 0;
4558 	} else {
4559 		next_side = vdr->vdr_side + 1;
4560 	}
4561 	while ((next_side < NMIRROR) &&
4562 	    !SUBMIRROR_IS_READABLE(un, next_side))
4563 		next_side++;
4564 	if (next_side >= NMIRROR) {
4565 		vdr->vdr_flags |= DKV_DMR_ERROR;
4566 		freerbuf(bp);
4567 		vdr->vdr_bytesread = 0;
4568 		md_unit_readerexit(ui);
4569 		return (0);
4570 	}
4571 
4572 	/* Set the side to read from */
4573 	un->un_dmr_last_read = next_side;
4574 
4575 	md_unit_readerexit(ui);
4576 
4577 	/*
4578 	 * Save timestamp for verification purposes. Can be read by debugger
4579 	 * to verify that this ioctl has been executed and to find the number
4580 	 * of DMR reads and the time of the last DMR read.
4581 	 */
4582 	uniqtime(&mirror_dmr_stats.dmr_timestamp);
4583 	mirror_dmr_stats.dmr_count++;
4584 
4585 	/* Issue READ request and wait for completion */
4586 	mirror_read_strategy(bp, MD_STR_DMR|MD_NOBLOCK|MD_STR_NOTTOP, NULL);
4587 
4588 	mutex_enter(&un->un_dmr_mx);
4589 	cv_wait(&un->un_dmr_cv, &un->un_dmr_mx);
4590 	mutex_exit(&un->un_dmr_mx);
4591 
4592 	/*
4593 	 * Check to see if we encountered an error during the read. If so we
4594 	 * can make no guarantee about any possibly returned data.
4595 	 */
4596 	if ((bp->b_flags & B_ERROR) == 0) {
4597 		vdr->vdr_flags &= ~DKV_DMR_ERROR;
4598 		if (bp->b_resid) {
4599 			vdr->vdr_flags |= DKV_DMR_SHORT;
4600 			vdr->vdr_bytesread = vdr->vdr_nbytes - bp->b_resid;
4601 		} else {
4602 			vdr->vdr_flags |= DKV_DMR_SUCCESS;
4603 			vdr->vdr_bytesread = vdr->vdr_nbytes;
4604 		}
4605 		/* Copy the data read back out to the user supplied buffer */
4606 		if (ddi_copyout(kbuffer, vdr->vdr_data, vdr->vdr_bytesread,
4607 		    mode)) {
4608 			kmem_free(kbuffer, vdr->vdr_nbytes);
4609 			return (EFAULT);
4610 		}
4611 
4612 	} else {
4613 		/* Error out with DKV_DMR_ERROR */
4614 		vdr->vdr_flags |= DKV_DMR_ERROR;
4615 		vdr->vdr_flags &= ~(DKV_DMR_SUCCESS|DKV_DMR_SHORT|DKV_DMR_DONE);
4616 	}
4617 	/*
4618 	 * Update the DMR parameters with the side and name of submirror that
4619 	 * we have just read from (un->un_dmr_last_read)
4620 	 */
4621 	un = md_unit_readerlock(ui);
4622 
4623 	vdr->vdr_side = un->un_dmr_last_read;
4624 	sm = &un->un_sm[un->un_dmr_last_read];
4625 	sm_nm = md_shortname(md_getminor(sm->sm_dev));
4626 
4627 	(void) strncpy(vdr->vdr_side_name, sm_nm, sizeof (vdr->vdr_side_name));
4628 
4629 	/*
4630 	 * Determine if we've completed the read cycle. This is true iff the
4631 	 * next computed submirror (side) equals or exceeds NMIRROR. We cannot
4632 	 * use un_nsm as we need to handle a sparse array of submirrors (which
4633 	 * can occur if a submirror is metadetached).
4634 	 */
4635 	next_side = un->un_dmr_last_read + 1;
4636 	while ((next_side < NMIRROR) &&
4637 	    !SUBMIRROR_IS_READABLE(un, next_side))
4638 		next_side++;
4639 	if (next_side >= NMIRROR) {
4640 		/* We've finished */
4641 		vdr->vdr_flags |= DKV_DMR_DONE;
4642 	}
4643 
4644 	md_unit_readerexit(ui);
4645 	freerbuf(bp);
4646 	kmem_free(kbuffer, vdr->vdr_nbytes);
4647 
4648 	return (0);
4649 }
4650 
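/*
 * Illustrative user-level sketch only, not part of the driver: driving the
 * DKIOCDMR ioctl that mirror_directed_read() above services. Starting from
 * DKV_SIDE_INIT, each call reads vdr_nbytes from the next readable submirror
 * until DKV_DMR_DONE is reported. The header list and exact field widths are
 * assumptions; the flag and field names are those used in the code above.
 */
#if 0	/* example only */
#include <sys/types.h>
#include <sys/dkio.h>
#include <fcntl.h>
#include <stdlib.h>
#include <strings.h>
#include <unistd.h>

static int
example_read_all_sides(const char *rawdev, off_t offset, size_t nbytes)
{
	vol_directed_rd_t	vdr;
	char			*buf = malloc(nbytes);
	int			fd = open(rawdev, O_RDONLY);

	if (fd < 0 || buf == NULL)
		return (-1);

	bzero(&vdr, sizeof (vdr));
	vdr.vdr_side = DKV_SIDE_INIT;		/* start with the first side */

	for (;;) {
		vdr.vdr_flags = DKV_DMR_NEXT_SIDE;	/* required on entry */
		vdr.vdr_offset = offset;	/* must be DEV_BSIZE aligned */
		vdr.vdr_nbytes = nbytes;
		vdr.vdr_data = buf;
		if (ioctl(fd, DKIOCDMR, &vdr) != 0)
			break;
		if (vdr.vdr_flags & DKV_DMR_ERROR)
			break;
		/* vdr_bytesread bytes from submirror vdr_side_name are in buf */
		if (vdr.vdr_flags & DKV_DMR_DONE)
			break;		/* all readable submirrors visited */
	}

	free(buf);
	(void) close(fd);
	return (0);
}
#endif
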
4651 /*
4652  * mirror_resync_message:
4653  * ---------------------
4654  * Handle the multi-node resync messages that keep all nodes within a given
4655  * disk-set in sync with their view of a mirror's resync status.
4656  *
4657  * The message types dealt with are:
4658  * MD_MN_MSG_RESYNC_STARTING	- start a resync thread for a unit
4659  * MD_MN_MSG_RESYNC_NEXT	- specifies the next region to be resynced
4660  * MD_MN_MSG_RESYNC_FINISH	- stop the resync thread for a unit
4661  * MD_MN_MSG_RESYNC_PHASE_DONE	- end of a resync phase, opt, submirror or comp
4662  *
4663  * Returns:
4664  *	0	Success
4665  *	>0	Failure error number
4666  */
4667 int
4668 mirror_resync_message(md_mn_rs_params_t *p, IOLOCK *lockp)
4669 {
4670 	mdi_unit_t		*ui;
4671 	mm_unit_t		*un;
4672 	set_t			setno;
4673 	int			is_ABR;
4674 	int			smi;
4675 	int			ci;
4676 	sm_state_t		state;
4677 	int			broke_out;
4678 	mm_submirror_t		*sm;
4679 	mm_submirror_ic_t	*smic;
4680 	md_m_shared_t		*shared;
4681 	md_error_t		mde = mdnullerror;
4682 	md_mps_t		*ps;
4683 	int			rs_active;
4684 	int			rr, rr_start, rr_end;
4685 
4686 	/* Check that the given device is part of a multi-node set */
4687 	setno = MD_MIN2SET(p->mnum);
4688 	if (setno >= md_nsets) {
4689 		return (ENXIO);
4690 	}
4691 	if (!MD_MNSET_SETNO(setno)) {
4692 		return (EINVAL);
4693 	}
4694 
4695 	if ((un = mirror_getun(p->mnum, &p->mde, NO_LOCK, NULL)) == NULL)
4696 		return (EINVAL);
4697 	if ((ui = MDI_UNIT(p->mnum)) == NULL)
4698 		return (EINVAL);
4699 	is_ABR = (ui->ui_tstate & MD_ABR_CAP);
4700 
4701 	/* Obtain the current resync status */
4702 	(void) md_ioctl_readerlock(lockp, ui);
4703 	rs_active = (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) ? 1 : 0;
4704 	md_ioctl_readerexit(lockp);
4705 
4706 	switch ((md_mn_msgtype_t)p->msg_type) {
4707 	case MD_MN_MSG_RESYNC_STARTING:
4708 		/* Start the resync thread for the mirror */
4709 		(void) mirror_resync_unit(p->mnum, NULL, &p->mde, lockp);
4710 		break;
4711 
4712 	case MD_MN_MSG_RESYNC_NEXT:
4713 		/*
4714 		 * We have to release any previously marked overlap regions
4715 		 * so that i/o can resume. Then we need to block the region
4716 		 * from [rs_start..rs_start+rs_size) so that no i/o is issued.
4717 		 * Update un_rs_resync_done and un_rs_resync_2_do.
4718 		 */
4719 		(void) md_ioctl_readerlock(lockp, ui);
4720 		/*
4721 		 * Ignore the message if there is no active resync thread or
4722 		 * if it is for a resync type that we have already completed.
4723 		 * un_resync_completed is set to the last resync completed
4724 		 * when processing a PHASE_DONE message.
4725 		 */
4726 		if (!rs_active || (p->rs_type == un->un_resync_completed))
4727 			break;
4728 		/*
4729 		 * If this message is for the same resync and is for an earlier
4730 		 * resync region, just ignore it. This can only occur if this
4731 		 * node has progressed on to the next resync region before
4732 		 * we receive this message. This can occur if the class for
4733 		 * this message is busy and the originator has to retry thus
4734 		 * allowing this node to move onto the next resync_region.
4735 		 */
4736 		if ((p->rs_type == un->un_rs_type) &&
4737 		    (p->rs_start < un->un_resync_startbl))
4738 			break;
4739 		ps = un->un_rs_prev_overlap;
4740 
4741 		/* Allocate previous overlap reference if needed */
4742 		if (ps == NULL) {
4743 			ps = kmem_cache_alloc(mirror_parent_cache,
4744 			    MD_ALLOCFLAGS);
4745 			ps->ps_un = un;
4746 			ps->ps_ui = ui;
4747 			ps->ps_firstblk = 0;
4748 			ps->ps_lastblk = 0;
4749 			ps->ps_flags = 0;
4750 			md_ioctl_readerexit(lockp);
4751 			(void) md_ioctl_writerlock(lockp, ui);
4752 			un->un_rs_prev_overlap = ps;
4753 			md_ioctl_writerexit(lockp);
4754 		} else
4755 			md_ioctl_readerexit(lockp);
4756 
4757 		if (p->rs_originator != md_mn_mynode_id) {
4758 			/*
4759 			 * Clear our un_resync_bm for the regions completed.
4760 			 * The owner (originator) will take care of itself.
4761 			 */
4762 			BLK_TO_RR(rr_end, ps->ps_lastblk, un);
4763 			BLK_TO_RR(rr_start, p->rs_start, un);
4764 			if (ps->ps_lastblk && rr_end < rr_start) {
4765 				BLK_TO_RR(rr_start, ps->ps_firstblk, un);
4766 				mutex_enter(&un->un_resync_mx);
4767 				/*
4768 				 * Update our resync bitmap to reflect that
4769 				 * another node has synchronized this range.
4770 				 */
4771 				for (rr = rr_start; rr <= rr_end; rr++) {
4772 					CLR_KEEPDIRTY(rr, un);
4773 				}
4774 				mutex_exit(&un->un_resync_mx);
4775 			}
4776 
4777 			/*
4778 			 * On all but the originating node, first update
4779 			 * the resync state, then unblock the previous
4780 			 * region and block the next one. No need
4781 			 * to do this if the region is already blocked.
4782 			 * Update the submirror state and flags from the
4783 			 * originator. This keeps the cluster in sync with
4784 			 * regards to the resync status.
4785 			 */
4786 
4787 			(void) md_ioctl_writerlock(lockp, ui);
4788 			un->un_rs_resync_done = p->rs_done;
4789 			un->un_rs_resync_2_do = p->rs_2_do;
4790 			un->un_rs_type = p->rs_type;
4791 			un->un_resync_startbl = p->rs_start;
4792 			md_ioctl_writerexit(lockp);
4793 			/*
4794 			 * Use un_owner_mx to ensure that an ownership change
4795 			 * cannot happen at the same time as this message
4796 			 */
4797 			mutex_enter(&un->un_owner_mx);
4798 			if (MD_MN_MIRROR_OWNER(un)) {
4799 				ps->ps_firstblk = p->rs_start;
4800 				ps->ps_lastblk = ps->ps_firstblk +
4801 				    p->rs_size - 1;
4802 			} else {
4803 				if ((ps->ps_firstblk != p->rs_start) ||
4804 				    (ps->ps_lastblk != p->rs_start +
4805 				    p->rs_size - 1)) {
4806 					/* Remove previous overlap range */
4807 					if (ps->ps_flags & MD_MPS_ON_OVERLAP)
4808 						mirror_overlap_tree_remove(ps);
4809 
4810 					ps->ps_firstblk = p->rs_start;
4811 					ps->ps_lastblk = ps->ps_firstblk +
4812 					    p->rs_size - 1;
4813 
4814 					mutex_exit(&un->un_owner_mx);
4815 					/* Block this range from all i/o. */
4816 					if (ps->ps_firstblk != 0 ||
4817 					    ps->ps_lastblk != 0)
4818 						wait_for_overlaps(ps,
4819 						    MD_OVERLAP_ALLOW_REPEAT);
4820 					mutex_enter(&un->un_owner_mx);
4821 					/*
4822 					 * Check to see if we have obtained
4823 					 * ownership while waiting for
4824 					 * overlaps. If we have, remove
4825 					 * the resync_region entry from the
4826 					 * overlap tree
4827 					 */
4828 					if (MD_MN_MIRROR_OWNER(un) &&
4829 					    (ps->ps_flags & MD_MPS_ON_OVERLAP))
4830 						mirror_overlap_tree_remove(ps);
4831 				}
4832 			}
4833 			mutex_exit(&un->un_owner_mx);
4834 
4835 			/*
4836 			 * If this is the first RESYNC_NEXT message (i.e.
4837 			 * MD_MN_RS_FIRST_RESYNC_NEXT set in p->rs_flags),
4838 			 * issue RESYNC_START NOTIFY event
4839 			 */
4840 			if (p->rs_flags & MD_MN_RS_FIRST_RESYNC_NEXT) {
4841 				SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_START,
4842 				    SVM_TAG_METADEVICE, MD_UN2SET(un),
4843 				    MD_SID(un));
4844 			}
4845 
4846 			/* Ensure that our local resync thread is running */
4847 			if (un->un_rs_thread == NULL) {
4848 				(void) mirror_resync_unit(p->mnum, NULL,
4849 				    &p->mde, lockp);
4850 			}
4851 		}
4852 
4853 		break;
4854 	case MD_MN_MSG_RESYNC_FINISH:
4855 		/*
4856 		 * Complete the resync by stopping the resync thread.
4857 		 * Also release the previous overlap region field.
4858 		 * Update the resync_progress_thread by cv_signal'ing it so
4859 		 * that we mark the end of the resync as soon as possible. This
4860 		 * avoids an unnecessary delay should the system panic after resync
4861 		 * completion.
4862 		 */
4863 #ifdef DEBUG
4864 		if (!rs_active) {
4865 			if (mirror_debug_flag)
4866 				printf("RESYNC_FINISH (mnum = %x), "
4867 				    "Resync *NOT* active",
4868 				    p->mnum);
4869 		}
4870 #endif
4871 
4872 		if ((un->c.un_status & MD_UN_RESYNC_ACTIVE) &&
4873 		    (p->rs_originator != md_mn_mynode_id)) {
4874 			mutex_enter(&un->un_rs_thread_mx);
4875 			un->c.un_status &= ~MD_UN_RESYNC_CANCEL;
4876 			un->un_rs_thread_flags |= MD_RI_SHUTDOWN;
4877 			un->un_rs_thread_flags &=
4878 			    ~(MD_RI_BLOCK|MD_RI_BLOCK_OWNER);
4879 			cv_signal(&un->un_rs_thread_cv);
4880 			mutex_exit(&un->un_rs_thread_mx);
4881 		}
4882 		if (is_ABR) {
4883 			/* Resync finished, if ABR set owner to NULL */
4884 			mutex_enter(&un->un_owner_mx);
4885 			un->un_mirror_owner = 0;
4886 			mutex_exit(&un->un_owner_mx);
4887 		}
4888 		(void) md_ioctl_writerlock(lockp, ui);
4889 		ps = un->un_rs_prev_overlap;
4890 		if (ps != NULL) {
4891 			/* Remove previous overlap range */
4892 			if (ps->ps_flags & MD_MPS_ON_OVERLAP)
4893 				mirror_overlap_tree_remove(ps);
4894 			/*
4895 			 * Release the overlap range reference
4896 			 */
4897 			un->un_rs_prev_overlap = NULL;
4898 			kmem_cache_free(mirror_parent_cache,
4899 			    ps);
4900 		}
4901 		md_ioctl_writerexit(lockp);
4902 
4903 		/* Mark the resync as complete in the metadb */
4904 		un->un_rs_resync_done = p->rs_done;
4905 		un->un_rs_resync_2_do = p->rs_2_do;
4906 		un->un_rs_type = p->rs_type;
4907 		mutex_enter(&un->un_rs_progress_mx);
4908 		cv_signal(&un->un_rs_progress_cv);
4909 		mutex_exit(&un->un_rs_progress_mx);
4910 
4911 		un = md_ioctl_writerlock(lockp, ui);
4912 		un->c.un_status &= ~MD_UN_RESYNC_ACTIVE;
4913 		/* Deal with any pending grow_unit */
4914 		if (un->c.un_status & MD_UN_GROW_PENDING) {
4915 			if ((mirror_grow_unit(un, &mde) != 0) ||
4916 			    (! mdismderror(&mde, MDE_GROW_DELAYED))) {
4917 				un->c.un_status &= ~MD_UN_GROW_PENDING;
4918 			}
4919 		}
4920 		md_ioctl_writerexit(lockp);
4921 		break;
4922 
4923 	case MD_MN_MSG_RESYNC_PHASE_DONE:
4924 		/*
4925 		 * A phase of the resync (optimized, component or
4926 		 * submirror) is complete. Update the mirror status.
4927 		 * If the flag CLEAR_OPT_NOT_DONE is set, it means that the
4928 		 * mirror owner is performing a resync. If we have just snarfed
4929 		 * this set, then we must clear any of the flags set at snarf
4930 		 * time by unit_setup_resync().
4931 		 * Note that unit_setup_resync() sets up these flags to
4932 		 * indicate that an optimized resync is required. These flags
4933 		 * need to be reset because, if we get here, the mirror owner
4934 		 * will have handled the optimized resync.
4935 		 * The flags that must be cleared are MD_UN_OPT_NOT_DONE and
4936 		 * MD_UN_WAR. In addition, for each submirror,
4937 		 * MD_SM_RESYNC_TARGET must be cleared and SMS_OFFLINE_RESYNC
4938 		 * set to SMS_OFFLINE.
4939 		 */
4940 #ifdef DEBUG
4941 		if (mirror_debug_flag)
4942 			printf("phase done mess received from %d, mnum=%x,"
4943 			    "type=%x, flags=%x\n", p->rs_originator, p->mnum,
4944 			    p->rs_type, p->rs_flags);
4945 #endif
4946 		/*
4947 		 * Ignore the message if there is no active resync thread.
4948 		 */
4949 		if (!rs_active)
4950 			break;
4951 
4952 		broke_out = p->rs_flags & MD_MN_RS_ERR;
4953 		switch (RS_TYPE(p->rs_type)) {
4954 		case MD_RS_OPTIMIZED:
4955 			un = md_ioctl_writerlock(lockp, ui);
4956 			if (p->rs_flags & MD_MN_RS_CLEAR_OPT_NOT_DONE) {
4957 				/* If we are originator, just clear rs_type */
4958 				if (p->rs_originator == md_mn_mynode_id) {
4959 					SET_RS_TYPE_NONE(un->un_rs_type);
4960 					md_ioctl_writerexit(lockp);
4961 					break;
4962 				}
4963 				/*
4964 				 * If CLEAR_OPT_NOT_DONE is set, only clear the
4965 				 * flags if OPT_NOT_DONE is set *and* rs_type
4966 				 * is MD_RS_NONE.
4967 				 */
4968 				if ((un->c.un_status & MD_UN_OPT_NOT_DONE) &&
4969 				    (RS_TYPE(un->un_rs_type) == MD_RS_NONE)) {
4970 					/* No resync in progress */
4971 					un->c.un_status &= ~MD_UN_OPT_NOT_DONE;
4972 					un->c.un_status &= ~MD_UN_WAR;
4973 				} else {
4974 					/*
4975 					 * We are in the middle of an
4976 					 * optimized resync and this message
4977 					 * should be ignored.
4978 					 */
4979 					md_ioctl_writerexit(lockp);
4980 					break;
4981 				}
4982 			} else {
4983 				/*
4984 				 * This is the end of an optimized resync,
4985 				 * clear the OPT_NOT_DONE and OFFLINE_SM flags
4986 				 */
4987 
4988 				un->c.un_status &= ~MD_UN_KEEP_DIRTY;
4989 				if (!broke_out)
4990 					un->c.un_status &= ~MD_UN_WAR;
4991 
4992 				/*
4993 				 * Clear our un_resync_bm for the regions
4994 				 * completed.  The owner (originator) will
4995 				 * take care of itself.
4996 				 */
4997 				if (p->rs_originator != md_mn_mynode_id &&
4998 				    (ps = un->un_rs_prev_overlap) != NULL) {
4999 					BLK_TO_RR(rr_start, ps->ps_firstblk,
5000 					    un);
5001 					BLK_TO_RR(rr_end, ps->ps_lastblk, un);
5002 					mutex_enter(&un->un_resync_mx);
5003 					for (rr = rr_start; rr <= rr_end;
5004 					    rr++) {
5005 						CLR_KEEPDIRTY(rr, un);
5006 					}
5007 					mutex_exit(&un->un_resync_mx);
5008 				}
5009 			}
5010 
5011 			/*
5012 			 * Set resync_completed to last resync type and then
5013 			 * clear resync_type to indicate no resync in progress
5014 			 */
5015 			un->un_resync_completed = un->un_rs_type;
5016 			SET_RS_TYPE_NONE(un->un_rs_type);
5017 
5018 			/*
5019 			 * If resync is as a result of a submirror ONLINE,
5020 			 * reset the submirror state to SMS_RUNNING if the
5021 			 * resync was ok; otherwise set it back to SMS_OFFLINE.
5022 			 */
5023 			for (smi = 0; smi < NMIRROR; smi++) {
5024 				un->un_sm[smi].sm_flags &=
5025 				    ~MD_SM_RESYNC_TARGET;
5026 				if (SMS_BY_INDEX_IS(un, smi,
5027 				    SMS_OFFLINE_RESYNC)) {
5028 					if (p->rs_flags &
5029 					    MD_MN_RS_CLEAR_OPT_NOT_DONE) {
5030 						state = SMS_OFFLINE;
5031 					} else {
5032 						state = (broke_out ?
5033 						    SMS_OFFLINE : SMS_RUNNING);
5034 					}
5035 					mirror_set_sm_state(
5036 					    &un->un_sm[smi],
5037 					    &un->un_smic[smi], state,
5038 					    broke_out);
5039 					mirror_commit(un, NO_SUBMIRRORS,
5040 					    0);
5041 				}
5042 				/*
5043 				 * If we still have an offline submirror, reset
5044 				 * the OFFLINE_SM flag in the mirror status
5045 				 */
5046 				if (SMS_BY_INDEX_IS(un, smi,
5047 				    SMS_OFFLINE))
5048 					un->c.un_status |=
5049 					    MD_UN_OFFLINE_SM;
5050 			}
5051 			md_ioctl_writerexit(lockp);
5052 			break;
5053 		case MD_RS_SUBMIRROR:
5054 			un = md_ioctl_writerlock(lockp, ui);
5055 			smi = RS_SMI(p->rs_type);
5056 			sm = &un->un_sm[smi];
5057 			smic = &un->un_smic[smi];
5058 			/* Clear RESYNC target */
5059 			un->un_sm[smi].sm_flags &= ~MD_SM_RESYNC_TARGET;
5060 			/*
5061 			 * Set resync_completed to last resync type and then
5062 			 * clear resync_type to indicate no resync in progress
5063 			 */
5064 			un->un_resync_completed = un->un_rs_type;
5065 			SET_RS_TYPE_NONE(un->un_rs_type);
5066 			/*
5067 			 * If the resync completed ok, reset the submirror
5068 			 * state to SMS_RUNNING; otherwise reset it to SMS_ATTACHED.
5069 			 */
5070 			state = (broke_out ?
5071 			    SMS_ATTACHED : SMS_RUNNING);
5072 			mirror_set_sm_state(sm, smic, state, broke_out);
5073 			un->c.un_status &= ~MD_UN_WAR;
5074 			mirror_commit(un, SMI2BIT(smi), 0);
5075 			md_ioctl_writerexit(lockp);
5076 			break;
5077 		case MD_RS_COMPONENT:
5078 			un = md_ioctl_writerlock(lockp, ui);
5079 			smi = RS_SMI(p->rs_type);
5080 			ci = RS_CI(p->rs_type);
5081 			sm = &un->un_sm[smi];
5082 			smic = &un->un_smic[smi];
5083 			shared = (md_m_shared_t *)
5084 			    (*(smic->sm_shared_by_indx))
5085 			    (sm->sm_dev, sm, ci);
5086 			un->c.un_status &= ~MD_UN_WAR;
5087 			/* Clear RESYNC target */
5088 			un->un_sm[smi].sm_flags &= ~MD_SM_RESYNC_TARGET;
5089 			/*
5090 			 * Set resync_completed to last resync type and then
5091 			 * clear resync_type to indicate no resync in progress
5092 			 */
5093 			un->un_resync_completed = un->un_rs_type;
5094 			SET_RS_TYPE_NONE(un->un_rs_type);
5095 
5096 			/*
5097 			 * If the resync completed ok, set the component state
5098 			 * to CS_OKAY.
5099 			 */
5100 			if (broke_out)
5101 				shared->ms_flags |= MDM_S_RS_TRIED;
5102 			else {
5103 				/*
5104 				 * As we don't transmit the changes,
5105 				 * no need to drop the lock.
5106 				 */
5107 				set_sm_comp_state(un, smi, ci, CS_OKAY, 0,
5108 				    MD_STATE_NO_XMIT, (IOLOCK *)NULL);
5109 			}
5110 			md_ioctl_writerexit(lockp);
5111 		default:
5112 			break;
5113 		}
5114 		/*
5115 		 * If the purpose of this PHASE_DONE message is just to
5116 		 * indicate to all other nodes that the optimized resync
5117 		 * required (OPT_NOT_DONE) flag is to be cleared, there is
5118 		 * no need to generate a notify event as there has not
5119 		 * actually been a resync.
5120 		 */
5121 		if (!(p->rs_flags & MD_MN_RS_CLEAR_OPT_NOT_DONE)) {
5122 			if (broke_out) {
5123 				SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_FAILED,
5124 				    SVM_TAG_METADEVICE, MD_UN2SET(un),
5125 				    MD_SID(un));
5126 			} else {
5127 				SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_DONE,
5128 				    SVM_TAG_METADEVICE, MD_UN2SET(un),
5129 				    MD_SID(un));
5130 			}
5131 		}
5132 		break;
5133 
5134 	default:
5135 #ifdef DEBUG
5136 		cmn_err(CE_PANIC, "mirror_resync_message: Unknown message type"
5137 		    " %x\n", p->msg_type);
5138 #endif
5139 		return (EINVAL);
5140 	}
5141 	return (0);
5142 }
5143 
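/*
 * Illustrative sketch only, not part of the original driver: the shape of
 * the payload consumed by mirror_resync_message() above for a RESYNC_NEXT
 * message. In the real driver the resync thread builds and transmits this
 * through the commd message layer; this fragment only shows the fields that
 * the handler above reads. The helper name and the parameter types are
 * assumptions.
 */
#if 0	/* example only */
static void
example_fill_resync_next(md_mn_rs_params_t *p, mm_unit_t *un,
    diskaddr_t rs_start, diskaddr_t rs_size, int first)
{
	bzero(p, sizeof (*p));
	p->msg_type = MD_MN_MSG_RESYNC_NEXT;	/* block the next region */
	p->mnum = MD_SID(un);			/* mirror being resynced */
	p->rs_originator = md_mn_mynode_id;	/* node driving the resync */
	p->rs_type = un->un_rs_type;		/* optimized/submirror/comp */
	p->rs_start = rs_start;			/* first block of the region */
	p->rs_size = rs_size;			/* length of the region */
	p->rs_done = un->un_rs_resync_done;	/* progress so far */
	p->rs_2_do = un->un_rs_resync_2_do;	/* work remaining */
	if (first)
		p->rs_flags = MD_MN_RS_FIRST_RESYNC_NEXT;
}
#endif
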
5144 /* Return a -1 if snarf of optimized record failed and set should be released */
5145 static int
5146 mirror_snarf(md_snarfcmd_t cmd, set_t setno)
5147 {
5148 	mddb_recid_t	recid;
5149 	int		gotsomething;
5150 	int		all_mirrors_gotten;
5151 	mm_unit_t	*un;
5152 	mddb_type_t	typ1;
5153 	mddb_de_ic_t    *dep;
5154 	mddb_rb32_t	*rbp;
5155 	size_t		newreqsize;
5156 	mm_unit_t	*big_un;
5157 	mm_unit32_od_t	*small_un;
5158 	int		retval;
5159 	mdi_unit_t	*ui;
5160 
5161 	if (cmd == MD_SNARF_CLEANUP) {
5162 		if (md_get_setstatus(setno) & MD_SET_STALE)
5163 			return (0);
5164 
5165 		recid = mddb_makerecid(setno, 0);
5166 		typ1 = (mddb_type_t)md_getshared_key(setno,
5167 		    mirror_md_ops.md_driver.md_drivername);
5168 		while ((recid = mddb_getnextrec(recid, typ1, MIRROR_REC)) > 0) {
5169 			if (mddb_getrecprivate(recid) & MD_PRV_CLEANUP) {
5170 				un = (mm_unit_t *)mddb_getrecaddr(recid);
5171 				mirror_cleanup(un);
5172 				recid = mddb_makerecid(setno, 0);
5173 			}
5174 		}
5175 		return (0);
5176 	}
5177 
5178 	all_mirrors_gotten = 1;
5179 	gotsomething = 0;
5180 
5181 	recid = mddb_makerecid(setno, 0);
5182 	typ1 = (mddb_type_t)md_getshared_key(setno,
5183 	    mirror_md_ops.md_driver.md_drivername);
5184 
5185 	while ((recid = mddb_getnextrec(recid, typ1, MIRROR_REC)) > 0) {
5186 		if (mddb_getrecprivate(recid) & MD_PRV_GOTIT)
5187 			continue;
5188 
5189 		dep = mddb_getrecdep(recid);
5190 		dep->de_flags = MDDB_F_MIRROR;
5191 		rbp = dep->de_rb;
5192 
5193 		switch (rbp->rb_revision) {
5194 		case MDDB_REV_RB:
5195 		case MDDB_REV_RBFN:
5196 			if ((rbp->rb_private & MD_PRV_CONVD) == 0) {
5197 				/*
5198 				 * This means, we have an old and small
5199 				 * record and this record hasn't already
5200 				 * been converted.  Before we create an
5201 				 * incore metadevice from this we have to
5202 				 * convert it to a big record.
5203 				 */
5204 				small_un =
5205 				    (mm_unit32_od_t *)mddb_getrecaddr(recid);
5206 				newreqsize = sizeof (mm_unit_t);
5207 				big_un = (mm_unit_t *)kmem_zalloc(newreqsize,
5208 				    KM_SLEEP);
5209 				mirror_convert((caddr_t)small_un,
5210 				    (caddr_t)big_un, SMALL_2_BIG);
5211 				kmem_free(small_un, dep->de_reqsize);
5212 
5213 				/*
5214 				 * Update userdata and incore userdata;
5215 				 * incores are at the end of un.
5216 				 */
5217 				dep->de_rb_userdata_ic = big_un;
5218 				dep->de_rb_userdata = big_un;
5219 				dep->de_icreqsize = newreqsize;
5220 				un = big_un;
5221 				rbp->rb_private |= MD_PRV_CONVD;
5222 			} else {
5223 				/*
5224 				 * Unit already converted, just get the
5225 				 * record address.
5226 				 */
5227 				un = (mm_unit_t *)mddb_getrecaddr_resize(recid,
5228 				    sizeof (*un), 0);
5229 			}
5230 			un->c.un_revision &= ~MD_64BIT_META_DEV;
5231 			break;
5232 		case MDDB_REV_RB64:
5233 		case MDDB_REV_RB64FN:
5234 			/* Big device */
5235 			un = (mm_unit_t *)mddb_getrecaddr_resize(recid,
5236 			    sizeof (*un), 0);
5237 			un->c.un_revision |= MD_64BIT_META_DEV;
5238 			un->c.un_flag |= MD_EFILABEL;
5239 			break;
5240 		}
5241 		MDDB_NOTE_FN(rbp->rb_revision, un->c.un_revision);
5242 
5243 		/*
5244 		 * Create minor device node for snarfed entry.
5245 		 */
5246 		(void) md_create_minor_node(setno, MD_SID(un));
5247 
5248 		if (MD_UNIT(MD_SID(un)) != NULL) {
5249 			mddb_setrecprivate(recid, MD_PRV_PENDDEL);
5250 			continue;
5251 		}
5252 		all_mirrors_gotten = 0;
5253 		retval = mirror_build_incore(un, 1);
5254 		if (retval == 0) {
5255 			mddb_setrecprivate(recid, MD_PRV_GOTIT);
5256 			md_create_unit_incore(MD_SID(un), &mirror_md_ops, 0);
5257 			resync_start_timeout(setno);
5258 			gotsomething = 1;
5259 		} else {
5260 			return (retval);
5261 		}
5262 		/*
5263 		 * Set flag to indicate that the mirror has not yet
5264 		 * been through a reconfig. This flag is used for MN sets
5265 		 * when determining whether to update the mirror state from
5266 		 * the Master node.
5267 		 */
5268 		if (MD_MNSET_SETNO(setno)) {
5269 			ui = MDI_UNIT(MD_SID(un));
5270 			ui->ui_tstate |= MD_RESYNC_NOT_DONE;
5271 		}
5272 	}
5273 
5274 	if (!all_mirrors_gotten)
5275 		return (gotsomething);
5276 
5277 	recid = mddb_makerecid(setno, 0);
5278 	while ((recid = mddb_getnextrec(recid, typ1, RESYNC_REC)) > 0)
5279 		if (!(mddb_getrecprivate(recid) & MD_PRV_GOTIT))
5280 			mddb_setrecprivate(recid, MD_PRV_PENDDEL);
5281 
5282 	return (0);
5283 }
5284 
5285 static int
5286 mirror_halt(md_haltcmd_t cmd, set_t setno)
5287 {
5288 	unit_t		i;
5289 	mdi_unit_t	*ui;
5290 	minor_t		mnum;
5291 	int		reset_mirror_flag = 0;
5292 
5293 	if (cmd == MD_HALT_CLOSE)
5294 		return (0);
5295 
5296 	if (cmd == MD_HALT_OPEN)
5297 		return (0);
5298 
5299 	if (cmd == MD_HALT_UNLOAD)
5300 		return (0);
5301 
5302 	if (cmd == MD_HALT_CHECK) {
5303 		for (i = 0; i < md_nunits; i++) {
5304 			mnum = MD_MKMIN(setno, i);
5305 			if ((ui = MDI_UNIT(mnum)) == NULL)
5306 				continue;
5307 			if (ui->ui_opsindex != mirror_md_ops.md_selfindex)
5308 				continue;
5309 			if (md_unit_isopen(ui))
5310 				return (1);
5311 		}
5312 		return (0);
5313 	}
5314 
5315 	if (cmd != MD_HALT_DOIT)
5316 		return (1);
5317 
5318 	for (i = 0; i < md_nunits; i++) {
5319 		mnum = MD_MKMIN(setno, i);
5320 		if ((ui = MDI_UNIT(mnum)) == NULL)
5321 			continue;
5322 		if (ui->ui_opsindex != mirror_md_ops.md_selfindex)
5323 			continue;
5324 		reset_mirror((mm_unit_t *)MD_UNIT(mnum), mnum, 0);
5325 
5326 		/* Set a flag if there is at least one mirror metadevice. */
5327 		reset_mirror_flag = 1;
5328 	}
5329 
5330 	/*
5331 	 * Only wait for the global dr_timeout to finish
5332 	 *  - if there are mirror metadevices in this diskset or
5333 	 *  - if this is the local set since an unload of the md_mirror
5334 	 *    driver could follow a successful mirror halt in the local set.
5335 	 */
5336 	if ((reset_mirror_flag != 0) || (setno == MD_LOCAL_SET)) {
5337 		while ((mirror_md_ops.md_head == NULL) &&
5338 		    (mirror_timeout.dr_timeout_id != 0))
5339 			delay(md_hz);
5340 	}
5341 
5342 	return (0);
5343 }
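/*
 * Illustrative sketch, not part of the driver: mirror_halt() above is meant
 * to be driven as "check first, then do it".  The example_* helper and the
 * MD_MIRROR_EXAMPLES guard (never defined) are hypothetical.
 */
#ifdef	MD_MIRROR_EXAMPLES
static int
example_halt_set(set_t setno)
{
	/* Refuse to halt while any mirror in this set is still open. */
	if (mirror_halt(MD_HALT_CHECK, setno) != 0)
		return (EBUSY);

	/* No opens detected; reset every mirror unit in the set. */
	return (mirror_halt(MD_HALT_DOIT, setno));
}
#endif	/* MD_MIRROR_EXAMPLES */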
5344 
5345 /*ARGSUSED3*/
5346 static int
5347 mirror_open(dev_t *dev, int flag, int otyp, cred_t *cred_p, int md_oflags)
5348 {
5349 	IOLOCK	lock;
5350 	minor_t		mnum = getminor(*dev);
5351 	set_t		setno;
5352 
5353 	/*
5354 	 * When doing an open of a multi-owner metadevice, check whether this
5355 	 * node is a starting node and whether a reconfig cycle is underway.
5356 	 * If so, the system isn't sufficiently set up to handle the
5357 	 * open (which involves I/O during sp_validate), so fail with ENXIO.
5358 	 */
5359 	setno = MD_MIN2SET(mnum);
5360 	if ((md_set[setno].s_status & (MD_SET_MNSET | MD_SET_MN_START_RC)) ==
5361 	    (MD_SET_MNSET | MD_SET_MN_START_RC)) {
5362 			return (ENXIO);
5363 	}
5364 
5365 	if (md_oflags & MD_OFLG_FROMIOCTL) {
5366 		/*
5367 		 * This indicates that the caller is an ioctl service routine.
5368 		 * In this case we initialise our stack-based IOLOCK and pass
5369 		 * this into the internal open routine. This allows multi-owner
5370 		 * metadevices to avoid deadlocking if an error is encountered
5371 		 * during the open() attempt. The failure case is:
5372 		 * s-p -> mirror -> s-p (with error). Attempting to metaclear
5373 		 * this configuration would deadlock as the mirror code has to
5374 		 * send a state-update to the other nodes when it detects the
5375 		 * failure of the underlying submirror with an errored soft-part
5376 		 * on it. As there is a class1 message in progress (metaclear)
5377 		 * set_sm_comp_state() cannot send another class1 message;
5378 		 * instead we do not send a state_update message as the
5379 		 * metaclear is distributed and the failed submirror will be
5380 		 * cleared from the configuration by the metaclear.
5381 		 */
5382 		IOLOCK_INIT(&lock);
5383 		return (mirror_internal_open(getminor(*dev), flag, otyp,
5384 		    md_oflags, &lock));
5385 	} else {
5386 		return (mirror_internal_open(getminor(*dev), flag, otyp,
5387 		    md_oflags, (IOLOCK *)NULL));
5388 	}
5389 }
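/*
 * Illustrative sketch, not part of the driver: as the comment in
 * mirror_open() above explains, an ioctl service routine requests the
 * stack-based IOLOCK behaviour simply by passing MD_OFLG_FROMIOCTL.  The
 * example_* helper and the MD_MIRROR_EXAMPLES guard (never defined) are
 * hypothetical.
 */
#ifdef	MD_MIRROR_EXAMPLES
static int
example_open_from_ioctl(dev_t dev, cred_t *cred_p)
{
	/*
	 * MD_OFLG_FROMIOCTL makes mirror_open() initialise an IOLOCK so a
	 * failed multi-owner open cannot deadlock against an in-progress
	 * class1 message such as a metaclear.
	 */
	return (mirror_open(&dev, FREAD | FWRITE, OTYP_LYR, cred_p,
	    MD_OFLG_FROMIOCTL));
}
#endif	/* MD_MIRROR_EXAMPLES */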
5390 
5391 
5392 /*ARGSUSED1*/
5393 static int
5394 mirror_close(dev_t dev, int flag, int otyp, cred_t *cred_p, int md_cflags)
5395 {
5396 	return (mirror_internal_close(getminor(dev), otyp, md_cflags,
5397 	    (IOLOCK *)NULL));
5398 }
5399 
5400 
5401 /*
5402  * This routine dumps memory to the disk.  It assumes that the memory has
5403  * already been mapped into mainbus space.  It is called at disk interrupt
5404  * priority when the system is in trouble.
5405  *
5406  */
5407 static int
5408 mirror_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk)
5409 {
5410 	mm_unit_t	*un;
5411 	dev_t		mapdev;
5412 	int		result;
5413 	int		smi;
5414 	int		any_succeed = 0;
5415 	int		save_result = 0;
5416 
5417 	/*
5418 	 * We don't need to grab the unit lock because nothing else is
5419 	 * supposed to be happening.
5420 	 * Also, dump is not supposed to sleep.
5421 	 */
5422 	un = (mm_unit_t *)MD_UNIT(getminor(dev));
5423 
5424 	if ((diskaddr_t)blkno >= un->c.un_total_blocks)
5425 		return (EINVAL);
5426 
5427 	if ((diskaddr_t)blkno + nblk > un->c.un_total_blocks)
5428 		return (EINVAL);
5429 
5430 	for (smi = 0; smi < NMIRROR; smi++) {
5431 		if (!SUBMIRROR_IS_WRITEABLE(un, smi))
5432 			continue;
5433 		mapdev = md_dev64_to_dev(un->un_sm[smi].sm_dev);
5434 		result = bdev_dump(mapdev, addr, blkno, nblk);
5435 		if (result)
5436 			save_result = result;
5437 
5438 		if (result == 0)
5439 			any_succeed++;
5440 	}
5441 
5442 	if (any_succeed)
5443 		return (0);
5444 
5445 	return (save_result);
5446 }
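/*
 * Illustrative sketch, not part of the driver: a crash-dump caller of
 * mirror_dump() above sees success if any writeable submirror accepted the
 * dump I/O, otherwise the last bdev_dump() error (or EINVAL for an
 * out-of-range block run).  The example_* helper and the MD_MIRROR_EXAMPLES
 * guard (never defined) are hypothetical.
 */
#ifdef	MD_MIRROR_EXAMPLES
static int
example_crash_dump(dev_t mirror_dev, caddr_t addr, daddr_t blkno, int nblk)
{
	/* No locking: dump runs single-threaded and must not sleep. */
	return (mirror_dump(mirror_dev, addr, blkno, nblk));
}
#endif	/* MD_MIRROR_EXAMPLES */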
5447 
5448 /*
5449  * NAME: mirror_probe_dev
5450  *
5451  * DESCRIPTION: force-opens every component of a mirror.
5452  *
5453  * On entry the unit writerlock is held
5454  */
5455 static int
5456 mirror_probe_dev(mdi_unit_t *ui, minor_t mnum)
5457 {
5458 	int		i;
5459 	int		smi;
5460 	int		ci;
5461 	mm_unit_t	*un;
5462 	int		md_devopen = 0;
5463 	set_t		setno;
5464 	int		sm_cnt;
5465 	int		sm_unavail_cnt;
5466 
5467 	if (md_unit_isopen(ui))
5468 		md_devopen++;
5469 
5470 	un = MD_UNIT(mnum);
5471 	setno = MD_UN2SET(un);
5472 
5473 	sm_cnt = 0;
5474 	sm_unavail_cnt = 0;
5475 	for (i = 0; i < NMIRROR; i++) {
5476 		md_dev64_t tmpdev;
5477 		mdi_unit_t	*sm_ui;
5478 
5479 		if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE)) {
5480 			continue;
5481 		}
5482 
5483 		sm_cnt++;
5484 		tmpdev = un->un_sm[i].sm_dev;
5485 		(void) md_layered_open(mnum, &tmpdev,
5486 		    MD_OFLG_CONT_ERRS | MD_OFLG_PROBEDEV);
5487 		un->un_sm[i].sm_dev = tmpdev;
5488 
5489 		sm_ui = MDI_UNIT(getminor(md_dev64_to_dev(tmpdev)));
5490 
5491 		/*
5492 		 * Logic similar to that in mirror_open_all_devs.  We set or
5493 		 * clear the submirror Unavailable bit.
5494 		 */
5495 		(void) md_unit_writerlock(sm_ui);
5496 		if (submirror_unavailable(un, i, 1)) {
5497 			sm_ui->ui_tstate |= MD_INACCESSIBLE;
5498 			sm_unavail_cnt++;
5499 		} else {
5500 			sm_ui->ui_tstate &= ~MD_INACCESSIBLE;
5501 		}
5502 		md_unit_writerexit(sm_ui);
5503 	}
5504 
5505 	/*
5506 	 * If all of the submirrors are unavailable, the mirror is also
5507 	 * unavailable.
5508 	 */
5509 	if (sm_cnt == sm_unavail_cnt) {
5510 		ui->ui_tstate |= MD_INACCESSIBLE;
5511 	} else {
5512 		ui->ui_tstate &= ~MD_INACCESSIBLE;
5513 	}
5514 
5515 	/*
5516 	 * Now check for probe failures. If failures occur, we set the
5517 	 * appropriate erred state only if the metadevice is in use.
5518 	 * This is specifically to prevent unnecessary resyncs.
5519 	 * For instance, if the disks were accidentally disconnected when
5520 	 * the system booted, then until the metadevice is accessed
5521 	 * (e.g. by a file system mount) the user can shut down, recable and
5522 	 * reboot without incurring a potentially huge resync.
5523 	 */
5524 
5525 	smi = 0;
5526 	ci = 0;
5527 	while (mirror_geterror(un, &smi, &ci, 1, 1) != 0) {
5528 
5529 		if (mirror_other_sources(un, smi, ci, 0) == 1) {
5530 			/*
5531 			 * Note that for a MN set, there is no need to call
5532 			 * SE_NOTIFY as that is done when processing the
5533 			 * state change
5534 			 */
5535 			if (md_devopen) {
5536 				/*
5537 				 * Never called from ioctl context,
5538 				 * so (IOLOCK *)NULL
5539 				 */
5540 				set_sm_comp_state(un, smi, ci, CS_LAST_ERRED,
5541 				    0, MD_STATE_XMIT, (IOLOCK *)NULL);
5542 				if (!MD_MNSET_SETNO(setno)) {
5543 					SE_NOTIFY(EC_SVM_STATE,
5544 					    ESC_SVM_LASTERRED,
5545 					    SVM_TAG_METADEVICE, setno,
5546 					    MD_SID(un));
5547 				}
5548 				continue;
5549 			} else {
5550 				(void) mirror_close_all_devs(un,
5551 				    MD_OFLG_PROBEDEV);
5552 				if (!MD_MNSET_SETNO(setno)) {
5553 					SE_NOTIFY(EC_SVM_STATE,
5554 					    ESC_SVM_OPEN_FAIL,
5555 					    SVM_TAG_METADEVICE, setno,
5556 					    MD_SID(un));
5557 				}
5558 				mirror_openfail_console_info(un, smi, ci);
5559 				return (ENXIO);
5560 			}
5561 		}
5562 
5563 		/*
5564 		 * Note that for a MN set, there is no need to call
5565 		 * SE_NOTIFY as that is done when processing the
5566 		 * state change
5567 		 */
5568 		if (md_devopen) {
5569 			/* Never called from ioctl context, so (IOLOCK *)NULL */
5570 			set_sm_comp_state(un, smi, ci, CS_ERRED, 0,
5571 			    MD_STATE_XMIT, (IOLOCK *)NULL);
5572 			if (!MD_MNSET_SETNO(setno)) {
5573 				SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED,
5574 				    SVM_TAG_METADEVICE, setno,
5575 				    MD_SID(un));
5576 			}
5577 		}
5578 		mirror_openfail_console_info(un, smi, ci);
5579 		ci++;
5580 	}
5581 
5582 	if (MD_MNSET_SETNO(setno)) {
5583 		send_poke_hotspares(setno);
5584 	} else {
5585 		(void) poke_hotspares();
5586 	}
5587 	(void) mirror_close_all_devs(un, MD_OFLG_PROBEDEV);
5588 
5589 	return (0);
5590 }
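/*
 * Illustrative sketch, not part of the driver: mirror_probe_dev() above
 * expects the unit writerlock to be held on entry, so a hypothetical caller
 * would bracket it as shown.  The example_* helper and the
 * MD_MIRROR_EXAMPLES guard (never defined) are hypothetical.
 */
#ifdef	MD_MIRROR_EXAMPLES
static int
example_probe(minor_t mnum)
{
	mdi_unit_t	*ui = MDI_UNIT(mnum);
	int		err;

	if (ui == NULL)
		return (ENXIO);

	/* Take the unit writerlock, probe, then drop the lock. */
	(void) md_unit_writerlock(ui);
	err = mirror_probe_dev(ui, mnum);
	md_unit_writerexit(ui);
	return (err);
}
#endif	/* MD_MIRROR_EXAMPLES */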
5591 
5592 
5593 static int
5594 mirror_imp_set(
5595 	set_t	setno
5596 )
5597 {
5598 
5599 	mddb_recid_t	recid;
5600 	int		gotsomething, i;
5601 	mddb_type_t	typ1;
5602 	mddb_de_ic_t	*dep;
5603 	mddb_rb32_t	*rbp;
5604 	mm_unit32_od_t	*un32;
5605 	mm_unit_t	*un64;
5606 	md_dev64_t	self_devt;
5607 	minor_t		*self_id;	/* minor needs to be updated */
5608 	md_parent_t	*parent_id;	/* parent needs to be updated */
5609 	mddb_recid_t	*record_id;	/* record id needs to be updated */
5610 	mddb_recid_t	*optrec_id;
5611 	md_dev64_t	tmpdev;
5612 
5613 
5614 	gotsomething = 0;
5615 
5616 	typ1 = (mddb_type_t)md_getshared_key(setno,
5617 	    mirror_md_ops.md_driver.md_drivername);
5618 	recid = mddb_makerecid(setno, 0);
5619 
5620 	while ((recid = mddb_getnextrec(recid, typ1, MIRROR_REC)) > 0) {
5621 		if (mddb_getrecprivate(recid) & MD_PRV_GOTIT)
5622 			continue;
5623 
5624 		dep = mddb_getrecdep(recid);
5625 		rbp = dep->de_rb;
5626 
5627 		switch (rbp->rb_revision) {
5628 		case MDDB_REV_RB:
5629 		case MDDB_REV_RBFN:
5630 			/*
5631 			 * Small device
5632 			 */
5633 			un32 = (mm_unit32_od_t *)mddb_getrecaddr(recid);
5634 			self_id = &(un32->c.un_self_id);
5635 			parent_id = &(un32->c.un_parent);
5636 			record_id = &(un32->c.un_record_id);
5637 			optrec_id = &(un32->un_rr_dirty_recid);
5638 
5639 			for (i = 0; i < un32->un_nsm; i++) {
5640 				tmpdev = md_expldev(un32->un_sm[i].sm_dev);
5641 				un32->un_sm[i].sm_dev = md_cmpldev
5642 				    (md_makedevice(md_major, MD_MKMIN(setno,
5643 				    MD_MIN2UNIT(md_getminor(tmpdev)))));
5644 
5645 				if (!md_update_minor(setno, mddb_getsidenum
5646 				    (setno), un32->un_sm[i].sm_key))
5647 				goto out;
5648 			}
5649 			break;
5650 		case MDDB_REV_RB64:
5651 		case MDDB_REV_RB64FN:
5652 			un64 = (mm_unit_t *)mddb_getrecaddr(recid);
5653 			self_id = &(un64->c.un_self_id);
5654 			parent_id = &(un64->c.un_parent);
5655 			record_id = &(un64->c.un_record_id);
5656 			optrec_id = &(un64->un_rr_dirty_recid);
5657 
5658 			for (i = 0; i < un64->un_nsm; i++) {
5659 				tmpdev = un64->un_sm[i].sm_dev;
5660 				un64->un_sm[i].sm_dev = md_makedevice
5661 				    (md_major, MD_MKMIN(setno, MD_MIN2UNIT
5662 				    (md_getminor(tmpdev))));
5663 
5664 				if (!md_update_minor(setno, mddb_getsidenum
5665 				    (setno), un64->un_sm[i].sm_key))
5666 				goto out;
5667 			}
5668 			break;
5669 		}
5670 
5671 		/*
5672 		 * If this is a top-level, friendly-name metadevice,
5673 		 * update its minor in the namespace.
5674 		 */
5675 		if ((*parent_id == MD_NO_PARENT) &&
5676 		    ((rbp->rb_revision == MDDB_REV_RBFN) ||
5677 		    (rbp->rb_revision == MDDB_REV_RB64FN))) {
5678 
5679 			self_devt = md_makedevice(md_major, *self_id);
5680 			if (!md_update_top_device_minor(setno,
5681 			    mddb_getsidenum(setno), self_devt))
5682 				goto out;
5683 		}
5684 
5685 		/*
5686 		 * Update unit with the imported setno
5687 		 *
5688 		 */
5689 		mddb_setrecprivate(recid, MD_PRV_GOTIT);
5690 
5691 		*self_id = MD_MKMIN(setno, MD_MIN2UNIT(*self_id));
5692 		if (*parent_id != MD_NO_PARENT)
5693 			*parent_id = MD_MKMIN(setno, MD_MIN2UNIT(*parent_id));
5694 		*record_id = MAKERECID(setno, DBID(*record_id));
5695 		*optrec_id = MAKERECID(setno, DBID(*optrec_id));
5696 
5697 		gotsomething = 1;
5698 	}
5699 
5700 out:
5701 	return (gotsomething);
5702 }
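/*
 * Illustrative sketch, not part of the driver: mirror_imp_set() above
 * rebases every minor number and database record id into the imported set
 * while preserving the unit and record portions, as isolated below.  The
 * example_* helper and the MD_MIRROR_EXAMPLES guard (never defined) are
 * hypothetical.
 */
#ifdef	MD_MIRROR_EXAMPLES
static void
example_rebase_ids(set_t setno, minor_t *mnump, mddb_recid_t *recidp)
{
	/* Keep the unit number, move it into the imported set. */
	*mnump = MD_MKMIN(setno, MD_MIN2UNIT(*mnump));

	/* Keep the database id, move it into the imported set. */
	*recidp = MAKERECID(setno, DBID(*recidp));
}
#endif	/* MD_MIRROR_EXAMPLES */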
5703 
5704 /*
5705  * NAME: mirror_check_offline
5706  *
5707  * DESCRIPTION: return offline_status = 1 if any submirrors are offline
5708  *
5709  * Called from ioctl, so access to MD_UN_OFFLINE_SM in un_status is
5710  * protected by the global ioctl lock as it is only set by the MD_IOCOFFLINE
5711  * ioctl.
5712  */
5713 int
5714 mirror_check_offline(md_dev64_t dev, int *offline_status)
5715 {
5716 	mm_unit_t		*un;
5717 	md_error_t		mde = mdnullerror;
5718 
5719 	if ((un = mirror_getun(getminor(dev), &mde, NO_LOCK, NULL)) == NULL)
5720 		return (EINVAL);
5721 	*offline_status = 0;
5722 	if (un->c.un_status & MD_UN_OFFLINE_SM)
5723 		*offline_status = 1;
5724 	return (0);
5725 }
5726 
5727 /*
5728  * NAME: mirror_inc_abr_count
5729  *
5730  * DESCRIPTION: increment the count of layered soft parts with ABR set
5731  *
5732  * Called from ioctl, so access to un_abr_count is protected by the global
5733  * ioctl lock. It is only referenced in the MD_IOCOFFLINE ioctl.
5734  */
5735 int
5736 mirror_inc_abr_count(md_dev64_t dev)
5737 {
5738 	mm_unit_t		*un;
5739 	md_error_t		mde = mdnullerror;
5740 
5741 	if ((un = mirror_getun(getminor(dev), &mde, NO_LOCK, NULL)) == NULL)
5742 		return (EINVAL);
5743 	un->un_abr_count++;
5744 	return (0);
5745 }
5746 
5747 /*
5748  * NAME: mirror_dec_abr_count
5749  *
5750  * DESCRIPTION: decrement the count of layered soft parts with ABR set
5751  *
5752  * Called from ioctl, so access to un_abr_count is protected by the global
5753  * ioctl lock. It is only referenced in the MD_IOCOFFLINE ioctl.
5754  */
5755 int
5756 mirror_dec_abr_count(md_dev64_t dev)
5757 {
5758 	mm_unit_t		*un;
5759 	md_error_t		mde = mdnullerror;
5760 
5761 	if ((un = mirror_getun(getminor(dev), &mde, NO_LOCK, NULL)) == NULL)
5762 		return (EINVAL);
5763 	un->un_abr_count--;
5764 	return (0);
5765 }
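/*
 * Illustrative sketch, not part of the driver: mirror_inc_abr_count() and
 * mirror_dec_abr_count() above are intended to be used as a matched pair
 * under the global ioctl lock, e.g. by a layered consumer tracking ABR
 * soft parts.  The example_* helper and the MD_MIRROR_EXAMPLES guard
 * (never defined) are hypothetical.
 */
#ifdef	MD_MIRROR_EXAMPLES
static void
example_abr_track(md_dev64_t mirror_dev, boolean_t abr_set)
{
	/* Bump the count for a new ABR soft part, drop it when it goes. */
	if (abr_set)
		(void) mirror_inc_abr_count(mirror_dev);
	else
		(void) mirror_dec_abr_count(mirror_dev);
}
#endif	/* MD_MIRROR_EXAMPLES */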
5766 
5767 static md_named_services_t mirror_named_services[] = {
5768 	{(intptr_t (*)()) poke_hotspares,		"poke hotspares"    },
5769 	{(intptr_t (*)()) mirror_rename_listkids,	MDRNM_LIST_URKIDS   },
5770 	{mirror_rename_check,				MDRNM_CHECK	    },
5771 	{(intptr_t (*)()) mirror_renexch_update_kids,	MDRNM_UPDATE_KIDS   },
5772 	{(intptr_t (*)()) mirror_exchange_parent_update_to,
5773 			MDRNM_PARENT_UPDATE_TO},
5774 	{(intptr_t (*)()) mirror_exchange_self_update_from_down,
5775 			MDRNM_SELF_UPDATE_FROM_DOWN },
5776 	{(intptr_t (*)())mirror_probe_dev,		"probe open test" },
5777 	{(intptr_t (*)())mirror_check_offline,		MD_CHECK_OFFLINE },
5778 	{(intptr_t (*)())mirror_inc_abr_count,		MD_INC_ABR_COUNT },
5779 	{(intptr_t (*)())mirror_dec_abr_count,		MD_DEC_ABR_COUNT },
5780 	{ NULL,						0		    }
5781 };
5782 
5783 md_ops_t mirror_md_ops = {
5784 	mirror_open,		/* open */
5785 	mirror_close,		/* close */
5786 	md_mirror_strategy,	/* strategy */
5787 	NULL,			/* print */
5788 	mirror_dump,		/* dump */
5789 	NULL,			/* read */
5790 	NULL,			/* write */
5791 	md_mirror_ioctl,	/* mirror_ioctl, */
5792 	mirror_snarf,		/* mirror_snarf */
5793 	mirror_halt,		/* mirror_halt */
5794 	NULL,			/* aread */
5795 	NULL,			/* awrite */
5796 	mirror_imp_set,		/* import set */
5797 	mirror_named_services
5798 };
5799 
5800 /* module-specific initialization */
5801 static void
5802 init_init()
5803 {
5804 	md_mirror_mcs_buf_off = sizeof (md_mcs_t) - sizeof (buf_t);
5805 
5806 	/* Initialize the parent and child save memory pools */
5807 	mirror_parent_cache = kmem_cache_create("md_mirror_parent",
5808 	    sizeof (md_mps_t), 0, mirror_parent_constructor,
5809 	    mirror_parent_destructor, mirror_run_queue, NULL, NULL,
5810 	    0);
5811 
5812 	mirror_child_cache = kmem_cache_create("md_mirror_child",
5813 	    sizeof (md_mcs_t) - sizeof (buf_t) + biosize(), 0,
5814 	    mirror_child_constructor, mirror_child_destructor,
5815 	    mirror_run_queue, NULL, NULL, 0);
5816 
5817 	/*
5818 	 * Ensure wowbuf_size is a multiple of DEV_BSIZE,
5819 	 * then initialize the wowbuf memory pool.
5820 	 */
5821 	md_wowbuf_size = roundup(md_wowbuf_size, DEV_BSIZE);
5822 	if (md_wowbuf_size <= 0)
5823 		md_wowbuf_size = 2 * DEV_BSIZE;
5824 	if (md_wowbuf_size > (32 * DEV_BSIZE))
5825 		md_wowbuf_size = (32 * DEV_BSIZE);
5826 
5827 	md_wowblk_size = md_wowbuf_size + sizeof (wowhdr_t);
5828 	mirror_wowblk_cache = kmem_cache_create("md_mirror_wow",
5829 	    md_wowblk_size, 0, NULL, NULL, NULL, NULL, NULL, 0);
5830 
5831 	mutex_init(&mirror_timeout.dr_mx, NULL, MUTEX_DEFAULT, NULL);
5832 	mutex_init(&hotspare_request.dr_mx, NULL, MUTEX_DEFAULT, NULL);
5833 
5834 	mutex_init(&non_ff_drv_mutex, NULL, MUTEX_DEFAULT, NULL);
5835 }
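/*
 * Illustrative sketch, not part of the driver: init_init() above clamps the
 * write-on-write buffer size to a whole number of DEV_BSIZE blocks between
 * 2 and 32; the arithmetic is isolated below.  The example_* helper and the
 * MD_MIRROR_EXAMPLES guard (never defined) are hypothetical.
 */
#ifdef	MD_MIRROR_EXAMPLES
static int
example_clamp_wowbuf(int requested)
{
	int	sz = roundup(requested, DEV_BSIZE);

	/* Keep the buffer between 2 and 32 disk blocks. */
	if (sz <= 0)
		sz = 2 * DEV_BSIZE;
	if (sz > (32 * DEV_BSIZE))
		sz = (32 * DEV_BSIZE);
	return (sz);
}
#endif	/* MD_MIRROR_EXAMPLES */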
5836 
5837 /* module-specific uninitialization (undo init_init()) */
5838 static void
5839 fini_uninit()
5840 {
5841 	kmem_cache_destroy(mirror_parent_cache);
5842 	kmem_cache_destroy(mirror_child_cache);
5843 	kmem_cache_destroy(mirror_wowblk_cache);
5844 	mirror_parent_cache = mirror_child_cache =
5845 	    mirror_wowblk_cache = NULL;
5846 
5847 	mutex_destroy(&mirror_timeout.dr_mx);
5848 	mutex_destroy(&hotspare_request.dr_mx);
5849 	mutex_destroy(&non_ff_drv_mutex);
5850 }
5851 
5852 /* define the module linkage */
5853 MD_PLUGIN_MISC_MODULE("mirrors module", init_init(), fini_uninit())
5854