xref: /titanic_41/usr/src/uts/common/io/lvm/raid/raid.c (revision 88f8b78a88cbdc6d8c1af5c3e54bc49d25095c98)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 /*
30  * NAME:	raid.c
31  *
32  * DESCRIPTION: Main RAID driver source file containing open, close and I/O
33  *		operations.
34  *
35  * ROUTINES PROVIDED FOR EXTERNAL USE:
36  *  raid_open()			- open the RAID metadevice for access.
37  *  raid_internal_open()	- internal open routine of RAID metadevice.
38  *  md_raid_strategy()		- perform normal I/O operations,
39  *				    such as read and write.
40  *  raid_close()		- close the RAID metadevice.
41  *  raid_internal_close()	- internal close routine of RAID metadevice.
42  *  raid_snarf()		- initialize and clean up MDD records.
43  *  raid_halt()			- reset the RAID metadevice
44  *  raid_line()			- return the line # of this segment
45  *  raid_dcolumn()		- return the data column # of this segment
46  *  raid_pcolumn()		- return the parity column # of this segment
47  */
48 
49 #include <sys/param.h>
50 #include <sys/systm.h>
51 #include <sys/conf.h>
52 #include <sys/file.h>
53 #include <sys/user.h>
54 #include <sys/uio.h>
55 #include <sys/t_lock.h>
56 #include <sys/buf.h>
57 #include <sys/dkio.h>
58 #include <sys/vtoc.h>
59 #include <sys/kmem.h>
60 #include <vm/page.h>
61 #include <sys/cmn_err.h>
62 #include <sys/sysmacros.h>
63 #include <sys/types.h>
64 #include <sys/mkdev.h>
65 #include <sys/stat.h>
66 #include <sys/open.h>
67 #include <sys/modctl.h>
68 #include <sys/ddi.h>
69 #include <sys/sunddi.h>
70 #include <sys/debug.h>
71 #include <sys/lvm/md_raid.h>
72 #include <sys/lvm/mdvar.h>
73 #include <sys/lvm/md_convert.h>
74 
75 #include <sys/sysevent/eventdefs.h>
76 #include <sys/sysevent/svm.h>
77 
78 md_ops_t		raid_md_ops;
79 #ifndef lint
80 static char		_depends_on[] = "drv/md";
81 md_ops_t		*md_interface_ops = &raid_md_ops;
82 #endif	/* lint */
83 
84 extern unit_t		md_nunits;
85 extern unit_t		md_nsets;
86 extern md_set_t		md_set[];
87 extern int		md_status;
88 extern major_t		md_major;
89 extern mdq_anchor_t	md_done_daemon;
90 extern mdq_anchor_t	md_mstr_daemon;
91 extern int		md_sleep_for_test;
92 extern clock_t		md_hz;
93 
94 extern md_event_queue_t	*md_event_queue;
95 
96 
97 int pchunks		= 16;
98 int phigh		= 1024;
99 int plow		= 128;
100 int cchunks		= 64;
101 int chigh		= 1024;
102 int clow		= 512;
103 int bchunks		= 32;
104 int bhigh		= 256;
105 int blow		= 128;
106 
107 int raid_total_io		= 0;
108 int raid_reads			= 0;
109 int raid_writes			= 0;
110 int raid_no_bpmaps		= 0;
111 int raid_512			= 0;
112 int raid_1024			= 0;
113 int raid_1024_8192		= 0;
114 int raid_8192			= 0;
115 int raid_8192_bigger		= 0;
116 int raid_line_lock_wait	= 0;
117 
118 int data_buffer_waits		= 0;
119 int parity_buffer_waits	= 0;
120 
121 /* writer line locks */
122 int raid_writer_locks		= 0; /* total writer locks */
123 int raid_write_waits		= 0; /* total writer locks that waited */
124 int raid_full_line_writes	= 0; /* total full line writes */
125 int raid_write_queue_length	= 0; /* wait queue length */
126 int raid_max_write_q_length	= 0; /* maximum queue length */
127 int raid_write_locks_active	= 0; /* writer locks at any time */
128 int raid_max_write_locks	= 0; /* maximum writer locks active */
129 
130 /* read line locks */
131 int raid_reader_locks		= 0; /* total reader locks held */
132 int raid_reader_locks_active	= 0; /* reader locks held */
133 int raid_max_reader_locks	= 0; /* maximum reader locks held in run */
134 int raid_read_overlaps		= 0; /* number of times 2 reads hit same line */
135 int raid_read_waits		= 0; /* times a reader waited on writer */
136 
137 /* prewrite stats */
138 int raid_prewrite_waits		= 0; /* number of waits for a pw slot */
139 int raid_pw			= 0; /* number of pw slots in use */
140 int raid_prewrite_max		= 0; /* maximum number of pw slots in use */
141 int raid_pw_invalidates		= 0;
142 
143 static clock_t md_wr_wait	= 0;
144 
145 int nv_available	= 0; /* presence of nv-ram support in device */
146 int nv_prewrite		= 1; /* mark prewrites with nv_available */
147 int nv_parity		= 1; /* mark parity with nv_available */
148 
149 kmem_cache_t	*raid_parent_cache = NULL;
150 kmem_cache_t	*raid_child_cache = NULL;
151 kmem_cache_t	*raid_cbuf_cache = NULL;
152 
153 int			raid_internal_open(minor_t mnum, int flag, int otyp,
154 			    int md_oflags);
155 
156 static void		freebuffers(md_raidcs_t *cs);
157 static int		raid_read(mr_unit_t *un, md_raidcs_t *cs);
158 static void		raid_read_io(mr_unit_t *un, md_raidcs_t *cs);
159 static int		raid_write(mr_unit_t *un, md_raidcs_t *cs);
160 static void		raid_write_io(mr_unit_t *un, md_raidcs_t *cs);
161 static void		raid_stage(md_raidcs_t *cs);
162 static void		raid_enqueue(md_raidcs_t *cs);
163 static diskaddr_t	raid_line(diskaddr_t segment, mr_unit_t *un);
164 uint_t			raid_dcolumn(diskaddr_t segment, mr_unit_t *un);
165 static void		getpbuffer(md_raidcs_t *cs);
166 static void		getdbuffer(md_raidcs_t *cs);
167 static void		raid_done(buf_t *bp);
168 static void		raid_io_startup(mr_unit_t *un);
169 
170 static rus_state_t
171 raid_col2unit(rcs_state_t state, rus_state_t unitstate)
172 {
173 	switch (state) {
174 	case RCS_INIT:
175 		return (RUS_INIT);
176 	case RCS_OKAY:
177 		return (RUS_OKAY);
178 	case RCS_RESYNC:
179 		if (unitstate & RUS_LAST_ERRED)
180 			return (RUS_LAST_ERRED);
181 		else
182 			return (RUS_ERRED);
183 	case RCS_ERRED:
184 		return (RUS_ERRED);
185 	case RCS_LAST_ERRED:
186 		return (RUS_ERRED);
187 	default:
188 		break;
189 	}
190 	panic("raid_col2unit");
191 	/*NOTREACHED*/
192 }
193 
194 void
195 raid_set_state(mr_unit_t *un, int col, rcs_state_t newstate, int force)
196 {
197 
198 	rus_state_t	unitstate, origstate;
199 	rcs_state_t	colstate;
200 	rcs_state_t	orig_colstate;
201 	int		errcnt = 0,
202 			okaycnt = 0,
203 			resynccnt = 0;
204 	int		i;
205 	char		*devname;
206 
207 	ASSERT(un);
208 	ASSERT(col < un->un_totalcolumncnt);
209 	ASSERT(newstate &
210 	    (RCS_INIT | RCS_INIT_ERRED | RCS_OKAY | RCS_RESYNC | RCS_ERRED |
211 	    RCS_LAST_ERRED | RCS_REGEN));
212 	ASSERT((newstate &
213 	    ~(RCS_INIT | RCS_INIT_ERRED | RCS_OKAY | RCS_RESYNC | RCS_ERRED |
214 	    RCS_LAST_ERRED | RCS_REGEN))
215 	    == 0);
216 
217 	ASSERT(MDI_UNIT(MD_SID(un)) ? UNIT_WRITER_HELD(un) : 1);
218 
219 	unitstate = un->un_state;
220 	origstate = unitstate;
221 
222 	if (force) {
223 		un->un_column[col].un_devstate = newstate;
224 		un->un_state = raid_col2unit(newstate, unitstate);
225 		uniqtime32(&un->un_column[col].un_devtimestamp);
226 		uniqtime32(&un->un_timestamp);
227 		return;
228 	}
229 
230 	ASSERT(un->un_state &
231 	    (RUS_INIT | RUS_OKAY | RUS_ERRED | RUS_DOI | RUS_LAST_ERRED |
232 	    RUS_REGEN));
233 	ASSERT((un->un_state & ~(RUS_INIT |
234 	    RUS_OKAY | RUS_ERRED | RUS_DOI | RUS_LAST_ERRED | RUS_REGEN)) == 0);
235 
236 	if (un->un_column[col].un_devstate == newstate)
237 		return;
238 
239 	if (newstate == RCS_REGEN) {
240 		if (raid_state_cnt(un, RCS_OKAY) != un->un_totalcolumncnt)
241 			return;
242 		un->un_state = RUS_REGEN;
243 		return;
244 	}
245 
246 	orig_colstate = un->un_column[col].un_devstate;
247 
248 	/*
249 	 * if there is another column in the error state then this
250 	 * column should go to the last errored state
251 	 */
252 	for (i = 0; i < un->un_totalcolumncnt; i++) {
253 		if (i == col)
254 			colstate = newstate;
255 		else
256 			colstate = un->un_column[i].un_devstate;
257 		if (colstate & (RCS_ERRED | RCS_LAST_ERRED | RCS_INIT_ERRED))
258 			errcnt++;
259 		if (colstate & RCS_OKAY)
260 			okaycnt++;
261 		if (colstate & RCS_RESYNC)
262 			resynccnt++;
263 	}
264 	ASSERT(resynccnt < 2);
265 
266 	if (okaycnt == un->un_totalcolumncnt)
267 		unitstate = RUS_OKAY;
268 	else if (errcnt > 1) {
269 		unitstate = RUS_LAST_ERRED;
270 		if (newstate & RCS_ERRED)
271 			newstate = RCS_LAST_ERRED;
272 	} else if (errcnt == 1)
273 		if (!(unitstate & RUS_LAST_ERRED))
274 			unitstate = RUS_ERRED;
275 
276 	if (un->un_state == RUS_DOI)
277 		unitstate = RUS_DOI;
278 
279 	un->un_column[col].un_devstate = newstate;
280 	uniqtime32(&un->un_column[col].un_devtimestamp);
281 	/*
282 	 * if there are last errored column being brought back online
283 	 * by open or snarf, then be sure to clear the RUS_LAST_ERRED
284 	 * bit to allow writes.  If there is a real error then the
285 	 * column will go back into last erred.
286 	 */
287 	if ((raid_state_cnt(un, RCS_LAST_ERRED) == 0) &&
288 	    (raid_state_cnt(un, RCS_ERRED) == 1))
289 		unitstate = RUS_ERRED;
290 
291 	un->un_state = unitstate;
292 	uniqtime32(&un->un_timestamp);
293 
294 	if ((! (origstate & (RUS_ERRED|RUS_LAST_ERRED|RUS_DOI))) &&
295 	    (unitstate & (RUS_ERRED|RUS_LAST_ERRED|RUS_DOI))) {
296 		devname = md_devname(MD_UN2SET(un),
297 			un->un_column[col].un_dev, NULL, 0);
298 
299 		cmn_err(CE_WARN, "md: %s: %s needs maintenance",
300 		    md_shortname(MD_SID(un)), devname);
301 
302 		if (unitstate & RUS_LAST_ERRED) {
303 			cmn_err(CE_WARN, "md: %s: %s last erred",
304 			    md_shortname(MD_SID(un)), devname);
305 
306 		} else if (un->un_column[col].un_devflags &
307 		    MD_RAID_DEV_ISOPEN) {
308 			/*
309 			 * Close the broken device and clear the open flag on
310 			 * it.  We have to check that the device is open,
311 			 * otherwise the first open on it has resulted in the
312 			 * error that is being processed and the actual un_dev
313 			 * will be NODEV64.
314 			 */
315 			md_layered_close(un->un_column[col].un_dev,
316 			    MD_OFLG_NULL);
317 			un->un_column[col].un_devflags &= ~MD_RAID_DEV_ISOPEN;
318 		}
319 	} else if (orig_colstate == RCS_LAST_ERRED && newstate == RCS_ERRED &&
320 	    un->un_column[col].un_devflags & MD_RAID_DEV_ISOPEN) {
321 		/*
322 		 * Similar to logic above except no log messages since we
323 		 * are just transitioning from Last Erred to Erred.
324 		 */
325 		md_layered_close(un->un_column[col].un_dev, MD_OFLG_NULL);
326 		un->un_column[col].un_devflags &= ~MD_RAID_DEV_ISOPEN;
327 	}
328 
329 	/*
330 	 * If a resync has completed, see if there is a Last Erred
331 	 * component that we can change to the Erred state.
332 	 */
333 	if ((orig_colstate == RCS_RESYNC) && (newstate == RCS_OKAY)) {
334 		for (i = 0; i < un->un_totalcolumncnt; i++) {
335 			if (i != col &&
336 			    (un->un_column[i].un_devstate & RCS_LAST_ERRED)) {
337 				raid_set_state(un, i, RCS_ERRED, 0);
338 				break;
339 			}
340 		}
341 	}
342 }
343 
344 /*
345  * NAME:	erred_check_line
346  *
347  * DESCRIPTION: Return the type of write to perform on an erred column based
348  *		upon any resync activity.
349  *
350  *		if a column is being resynced and the write is above the
351  *		resync point may have to write to the target being resynced.
352  *
353  *		Column state may make it impossible to do the write
354  *		in which case RCL_EIO or RCL_ENXIO is returned.
355  *
356  *		If a column cannot be written directly, RCL_ERRED is
357  *		returned and processing should proceed accordingly.
358  *
359  * PARAMETERS:	minor_t		 mnum - minor number identity of metadevice
360  *		md_raidcs_t	 *cs - child save structure
361  *		mr_column_t	 *dcolumn - pointer to data column structure
362  *		mr_column_t	 *pcolumn - pointer to parity column structure
363  *
364  * RETURNS:	RCL_OKAY, RCL_ERRED
365  *
366  * LOCKS:	Expects Line Writer Lock and Unit Resource Lock to be held
367  *		across call.
368  */
369 
370 static int
371 erred_check_line(mr_unit_t *un, md_raidcs_t *cs, mr_column_t *column)
372 {
373 
374 	ASSERT(un != NULL);
375 	ASSERT(cs->cs_flags & MD_RCS_LLOCKD);
376 
377 	if (column->un_devstate & RCS_OKAY)
378 		return (RCL_OKAY);
379 
380 	if (column->un_devstate & RCS_ERRED)
381 		return (RCL_ERRED);  /* do not read from errored disk */
382 
383 	/*
384 	 * for the last errored case their are two considerations.
385 	 * When the last errored column is the only errored column then
386 	 * do treat it like a maintenance column, not doing I/O from
387 	 * it.   When it there are other failures then just attempt
388 	 * to use it.
389 	 */
390 	if (column->un_devstate & RCS_LAST_ERRED)
391 		return (RCL_ERRED);
392 
393 	ASSERT(column->un_devstate & RCS_RESYNC);
394 
395 	/*
396 	 * When a resync from a hotspare is being done (copy resync)
397 	 * then always treat it as an OKAY column, since no regen
398 	 * is required.
399 	 */
400 	if (column->un_devflags & MD_RAID_COPY_RESYNC) {
401 		return (RCL_OKAY);
402 	}
403 
404 	mutex_enter(&un->un_mx);
405 	if (cs->cs_line < un->un_resync_line_index) {
406 		mutex_exit(&un->un_mx);
407 		return (RCL_OKAY);
408 	}
409 	mutex_exit(&un->un_mx);
410 	return (RCL_ERRED);
411 
412 }
413 
414 /*
415  * NAMES:	raid_state_cnt
416  *
417  * DESCRIPTION: counts number of column in a specific state
418  *
419  * PARAMETERS:	md_raid_t *un
420  *		rcs_state state
421  */
422 int
423 raid_state_cnt(mr_unit_t *un, rcs_state_t state)
424 {
425 	int	i, retval = 0;
426 
427 	for (i = 0; i < un->un_totalcolumncnt; i++)
428 		if (un->un_column[i].un_devstate & state)
429 			retval++;
430 	return (retval);
431 }
432 
433 /*
434  * NAMES:	raid_io_overlaps
435  *
436  * DESCRIPTION: checkst for overlap of 2 child save structures
437  *
438  * PARAMETERS:	md_raidcs_t cs1
439  *		md_raidcs_t cs2
440  *
441  * RETURNS:	0 - no overlap
442  *		1 - overlap
443  */
444 int
445 raid_io_overlaps(md_raidcs_t *cs1, md_raidcs_t *cs2)
446 {
447 	if (cs1->cs_blkno > cs2->cs_lastblk)
448 		return (0);
449 	if (cs1->cs_lastblk < cs2->cs_blkno)
450 		return (0);
451 	return (1);
452 }
453 
454 /*
455  * NAMES:	raid_parent_constructor
456  * DESCRIPTION: parent structure constructor routine
457  * PARAMETERS:
458  */
459 /*ARGSUSED1*/
460 static int
461 raid_parent_constructor(void *p, void *d1, int d2)
462 {
463 	mutex_init(&((md_raidps_t *)p)->ps_mx,
464 	    NULL, MUTEX_DEFAULT, NULL);
465 	mutex_init(&((md_raidps_t *)p)->ps_mapin_mx,
466 	    NULL, MUTEX_DEFAULT, NULL);
467 	return (0);
468 }
469 
470 void
471 raid_parent_init(md_raidps_t *ps)
472 {
473 	bzero(ps, offsetof(md_raidps_t, ps_mx));
474 	((md_raidps_t *)ps)->ps_flags = MD_RPS_INUSE;
475 	((md_raidps_t *)ps)->ps_magic = RAID_PSMAGIC;
476 }
477 
478 /*ARGSUSED1*/
479 static void
480 raid_parent_destructor(void *p, void *d)
481 {
482 	mutex_destroy(&((md_raidps_t *)p)->ps_mx);
483 	mutex_destroy(&((md_raidps_t *)p)->ps_mapin_mx);
484 }
485 
486 /*
487  * NAMES:	raid_child_constructor
488  * DESCRIPTION: child structure constructor routine
489  * PARAMETERS:
490  */
491 /*ARGSUSED1*/
492 static int
493 raid_child_constructor(void *p, void *d1, int d2)
494 {
495 	md_raidcs_t	*cs = (md_raidcs_t *)p;
496 	mutex_init(&cs->cs_mx, NULL, MUTEX_DEFAULT, NULL);
497 	bioinit(&cs->cs_dbuf);
498 	bioinit(&cs->cs_pbuf);
499 	bioinit(&cs->cs_hbuf);
500 	return (0);
501 }
502 
503 void
504 raid_child_init(md_raidcs_t *cs)
505 {
506 	bzero(cs, offsetof(md_raidcs_t, cs_mx));
507 
508 	md_bioreset(&cs->cs_dbuf);
509 	md_bioreset(&cs->cs_pbuf);
510 	md_bioreset(&cs->cs_hbuf);
511 
512 	((md_raidcs_t *)cs)->cs_dbuf.b_chain =
513 	    ((md_raidcs_t *)cs)->cs_pbuf.b_chain =
514 	    ((md_raidcs_t *)cs)->cs_hbuf.b_chain =
515 	    (struct buf *)(cs);
516 
517 	cs->cs_magic = RAID_CSMAGIC;
518 	cs->cs_line = MD_DISKADDR_ERROR;
519 	cs->cs_dpwslot = -1;
520 	cs->cs_ppwslot = -1;
521 }
522 
523 /*ARGSUSED1*/
524 static void
525 raid_child_destructor(void *p, void *d)
526 {
527 	biofini(&((md_raidcs_t *)p)->cs_dbuf);
528 	biofini(&((md_raidcs_t *)p)->cs_hbuf);
529 	biofini(&((md_raidcs_t *)p)->cs_pbuf);
530 	mutex_destroy(&((md_raidcs_t *)p)->cs_mx);
531 }
532 
533 /*ARGSUSED1*/
534 static int
535 raid_cbuf_constructor(void *p, void *d1, int d2)
536 {
537 	bioinit(&((md_raidcbuf_t *)p)->cbuf_bp);
538 	return (0);
539 }
540 
541 static void
542 raid_cbuf_init(md_raidcbuf_t *cb)
543 {
544 	bzero(cb, offsetof(md_raidcbuf_t, cbuf_bp));
545 	md_bioreset(&cb->cbuf_bp);
546 	cb->cbuf_magic = RAID_BUFMAGIC;
547 	cb->cbuf_pwslot = -1;
548 	cb->cbuf_flags = CBUF_WRITE;
549 }
550 
551 /*ARGSUSED1*/
552 static void
553 raid_cbuf_destructor(void *p, void *d)
554 {
555 	biofini(&((md_raidcbuf_t *)p)->cbuf_bp);
556 }
557 
558 /*
559  * NAMES:	raid_run_queue
560  * DESCRIPTION: spawn a backend processing daemon for RAID metadevice.
561  * PARAMETERS:
562  */
563 /*ARGSUSED*/
564 static void
565 raid_run_queue(void *d)
566 {
567 	if (!(md_status & MD_GBL_DAEMONS_LIVE))
568 		md_daemon(1, &md_done_daemon);
569 }
570 
571 /*
572  * NAME:	raid_build_pwslot
573  * DESCRIPTION: builds mr_pw_reserve for the column
574  * PARAMETERS:	un is the pointer to the unit structure
575  *		colindex is the column to create the structure for
576  */
577 int
578 raid_build_pw_reservation(mr_unit_t *un, int colindex)
579 {
580 	mr_pw_reserve_t	*pw;
581 	mr_scoreboard_t	*sb;
582 	int		i;
583 
584 	pw = (mr_pw_reserve_t *) kmem_zalloc(sizeof (mr_pw_reserve_t) +
585 	    (sizeof (mr_scoreboard_t) * un->un_pwcnt), KM_SLEEP);
586 	pw->pw_magic = RAID_PWMAGIC;
587 	pw->pw_column = colindex;
588 	pw->pw_free = un->un_pwcnt;
589 	sb = &pw->pw_sb[0];
590 	for (i = 0; i < un->un_pwcnt; i++) {
591 		sb[i].sb_column = colindex;
592 		sb[i].sb_flags = SB_UNUSED;
593 		sb[i].sb_start_blk = 0;
594 		sb[i].sb_last_blk = 0;
595 		sb[i].sb_cs = NULL;
596 	}
597 	un->un_column_ic[colindex].un_pw_reserve = pw;
598 	return (0);
599 }
600 /*
601  * NAME:	raid_free_pw_reservation
602  * DESCRIPTION: RAID metadevice pre-write slot structure destroy routine
603  * PARAMETERS:	mr_unit_t *un - pointer to a unit structure
604  *		int colindex  - index of the column whose pre-write slot struct
605  *			is to be destroyed.
606  */
607 void
608 raid_free_pw_reservation(mr_unit_t *un, int colindex)
609 {
610 	mr_pw_reserve_t	*pw = un->un_column_ic[colindex].un_pw_reserve;
611 
612 	kmem_free(pw, sizeof (mr_pw_reserve_t) +
613 	    (sizeof (mr_scoreboard_t) * un->un_pwcnt));
614 }
615 
616 /*
617  * NAME:	raid_cancel_pwslot
618  * DESCRIPTION: RAID metadevice write routine
619  * PARAMETERS:	md_raidcs_t *cs - pointer to a child structure
620  */
621 static void
622 raid_cancel_pwslot(md_raidcs_t *cs)
623 {
624 	mr_unit_t		*un = cs->cs_un;
625 	mr_pw_reserve_t		*pw;
626 	mr_scoreboard_t		*sb;
627 	mr_column_ic_t		*col;
628 	md_raidcbuf_t		*cbuf;
629 	int			broadcast = 0;
630 
631 	if (cs->cs_ps->ps_flags & MD_RPS_READ)
632 		return;
633 	if (cs->cs_dpwslot != -1) {
634 		col = &un->un_column_ic[cs->cs_dcolumn];
635 		pw = col->un_pw_reserve;
636 		sb = &pw->pw_sb[cs->cs_dpwslot];
637 		sb->sb_flags = SB_AVAIL;
638 		if ((pw->pw_free++ == 0) || (un->un_rflags & MD_RFLAG_NEEDPW))
639 			broadcast++;
640 		sb->sb_cs = NULL;
641 	}
642 
643 	if (cs->cs_ppwslot != -1) {
644 		col = &un->un_column_ic[cs->cs_pcolumn];
645 		pw = col->un_pw_reserve;
646 		sb = &pw->pw_sb[cs->cs_ppwslot];
647 		sb->sb_flags = SB_AVAIL;
648 		if ((pw->pw_free++ == 0) || (un->un_rflags & MD_RFLAG_NEEDPW))
649 			broadcast++;
650 		sb->sb_cs = NULL;
651 	}
652 
653 	for (cbuf = cs->cs_buflist; cbuf; cbuf = cbuf->cbuf_next) {
654 		if (cbuf->cbuf_pwslot == -1)
655 			continue;
656 		col = &un->un_column_ic[cbuf->cbuf_column];
657 		pw = col->un_pw_reserve;
658 		sb = &pw->pw_sb[cbuf->cbuf_pwslot];
659 		sb->sb_flags = SB_AVAIL;
660 		if ((pw->pw_free++ == 0) || (un->un_rflags & MD_RFLAG_NEEDPW))
661 			broadcast++;
662 		sb->sb_cs = NULL;
663 	}
664 	if (broadcast) {
665 		cv_broadcast(&un->un_cv);
666 		return;
667 	}
668 	mutex_enter(&un->un_mx);
669 	if (un->un_rflags & MD_RFLAG_NEEDPW)
670 		cv_broadcast(&un->un_cv);
671 	mutex_exit(&un->un_mx);
672 }
673 
674 static void
675 raid_free_pwinvalidate(md_raidcs_t *cs)
676 {
677 	md_raidcbuf_t		*cbuf;
678 	md_raidcbuf_t		*cbuf_to_free;
679 	mr_unit_t		*un = cs->cs_un;
680 	mdi_unit_t		*ui = MDI_UNIT(MD_SID(un));
681 	mr_pw_reserve_t		*pw;
682 	mr_scoreboard_t		*sb;
683 	int			broadcast = 0;
684 
685 	cbuf = cs->cs_pw_inval_list;
686 	ASSERT(cbuf);
687 	mutex_enter(&un->un_linlck_mx);
688 	while (cbuf) {
689 		pw = un->un_column_ic[cbuf->cbuf_column].un_pw_reserve;
690 		sb = &pw->pw_sb[0];
691 		ASSERT(sb[cbuf->cbuf_pwslot].sb_flags & SB_INVAL_PEND);
692 		sb[cbuf->cbuf_pwslot].sb_flags = SB_UNUSED;
693 		sb[cbuf->cbuf_pwslot].sb_cs = NULL;
694 		if ((pw->pw_free++ == 0) || (un->un_rflags & MD_RFLAG_NEEDPW))
695 			broadcast++;
696 		cbuf_to_free = cbuf;
697 		cbuf = cbuf->cbuf_next;
698 		kmem_free(cbuf_to_free->cbuf_buffer, dbtob(un->un_iosize));
699 		kmem_cache_free(raid_cbuf_cache, cbuf_to_free);
700 	}
701 	cs->cs_pw_inval_list = (md_raidcbuf_t *)NULL;
702 	/*
703 	 * now that there is a free prewrite slot, check to see if there
704 	 * are any io operations waiting first wake up the raid_io_startup
705 	 * then signal the the processes waiting in raid_write.
706 	 */
707 	if (ui->ui_io_lock->io_list_front)
708 		raid_io_startup(un);
709 	mutex_exit(&un->un_linlck_mx);
710 	if (broadcast) {
711 		cv_broadcast(&un->un_cv);
712 		return;
713 	}
714 	mutex_enter(&un->un_mx);
715 	if (un->un_rflags & MD_RFLAG_NEEDPW)
716 		cv_broadcast(&un->un_cv);
717 	mutex_exit(&un->un_mx);
718 }
719 
720 
721 static int
722 raid_get_pwslot(md_raidcs_t *cs, int column)
723 {
724 	mr_scoreboard_t	*sb;
725 	mr_pw_reserve_t	*pw;
726 	mr_unit_t	*un = cs->cs_un;
727 	diskaddr_t	start_blk = cs->cs_blkno;
728 	diskaddr_t	last_blk = cs->cs_lastblk;
729 	int		i;
730 	int		pwcnt = un->un_pwcnt;
731 	int		avail = -1;
732 	int		use = -1;
733 	int		flags;
734 
735 
736 	/* start with the data column */
737 	pw = cs->cs_un->un_column_ic[column].un_pw_reserve;
738 	sb = &pw->pw_sb[0];
739 	ASSERT(pw->pw_free > 0);
740 	for (i = 0; i < pwcnt; i++) {
741 		flags = sb[i].sb_flags;
742 		if (flags & SB_INVAL_PEND)
743 			continue;
744 
745 		if ((avail == -1) && (flags & (SB_AVAIL | SB_UNUSED)))
746 			avail = i;
747 
748 		if ((start_blk > sb[i].sb_last_blk) ||
749 		    (last_blk < sb[i].sb_start_blk))
750 			continue;
751 
752 		/* OVERLAP */
753 		ASSERT(! (sb[i].sb_flags & SB_INUSE));
754 
755 		/*
756 		 * raid_invalidate_pwslot attempts to zero out prewrite entry
757 		 * in parallel with other disk reads/writes related to current
758 		 * transaction. however cs_frags accounting for this case is
759 		 * broken because raid_write_io resets cs_frags i.e. ignoring
760 		 * that it could have been been set to > 0 value by
761 		 * raid_invalidate_pwslot. While this can be fixed an
762 		 * additional problem is that we don't seem to handle
763 		 * correctly the case of getting a disk error for prewrite
764 		 * entry invalidation.
765 		 * It does not look like we really need
766 		 * to invalidate prewrite slots because raid_replay sorts
767 		 * prewrite id's in ascending order and during recovery the
768 		 * latest prewrite entry for the same block will be replay
769 		 * last. That's why i ifdef'd out the call to
770 		 * raid_invalidate_pwslot. --aguzovsk@east
771 		 */
772 
773 		if (use == -1) {
774 			use = i;
775 		}
776 	}
777 
778 	ASSERT(avail != -1);
779 	pw->pw_free--;
780 	if (use == -1)
781 		use = avail;
782 
783 	ASSERT(! (sb[use].sb_flags & SB_INUSE));
784 	sb[use].sb_flags = SB_INUSE;
785 	sb[use].sb_cs = cs;
786 	sb[use].sb_start_blk = start_blk;
787 	sb[use].sb_last_blk = last_blk;
788 	ASSERT((use >= 0) && (use < un->un_pwcnt));
789 	return (use);
790 }
791 
792 static int
793 raid_check_pw(md_raidcs_t *cs)
794 {
795 
796 	mr_unit_t	*un = cs->cs_un;
797 	int		i;
798 
799 	ASSERT(! (cs->cs_flags & MD_RCS_HAVE_PW_SLOTS));
800 	/*
801 	 * check to be sure there is a prewrite slot available
802 	 * if not just return.
803 	 */
804 	if (cs->cs_flags & MD_RCS_LINE) {
805 		for (i = 0; i < un->un_totalcolumncnt; i++)
806 			if (un->un_column_ic[i].un_pw_reserve->pw_free <= 0)
807 				return (1);
808 		return (0);
809 	}
810 
811 	if (un->un_column_ic[cs->cs_dcolumn].un_pw_reserve->pw_free <= 0)
812 		return (1);
813 	if (un->un_column_ic[cs->cs_pcolumn].un_pw_reserve->pw_free <= 0)
814 		return (1);
815 	return (0);
816 }
817 static int
818 raid_alloc_pwslot(md_raidcs_t *cs)
819 {
820 	mr_unit_t	*un = cs->cs_un;
821 	md_raidcbuf_t	*cbuf;
822 
823 	ASSERT(! (cs->cs_flags & MD_RCS_HAVE_PW_SLOTS));
824 	if (raid_check_pw(cs))
825 		return (1);
826 
827 	mutex_enter(&un->un_mx);
828 	un->un_pwid++;
829 	cs->cs_pwid = un->un_pwid;
830 	mutex_exit(&un->un_mx);
831 
832 	cs->cs_dpwslot = raid_get_pwslot(cs, cs->cs_dcolumn);
833 	for (cbuf = cs->cs_buflist; cbuf; cbuf = cbuf->cbuf_next) {
834 		cbuf->cbuf_pwslot = raid_get_pwslot(cs, cbuf->cbuf_column);
835 	}
836 	cs->cs_ppwslot = raid_get_pwslot(cs, cs->cs_pcolumn);
837 
838 	cs->cs_flags |= MD_RCS_HAVE_PW_SLOTS;
839 
840 	return (0);
841 }
842 
843 /*
844  * NAMES:	raid_build_incore
845  * DESCRIPTION: RAID metadevice incore structure building routine
846  * PARAMETERS:	void *p - pointer to a unit structure
847  *		int snarfing - a flag to indicate snarfing is required
848  */
849 int
850 raid_build_incore(void *p, int snarfing)
851 {
852 	mr_unit_t	*un = (mr_unit_t *)p;
853 	minor_t		mnum = MD_SID(un);
854 	mddb_recid_t	hs_recid = 0;
855 	int		i;
856 	int		preserve_flags;
857 	mr_column_t	*column;
858 	int		iosize;
859 	md_dev64_t	hs, dev;
860 	int		resync_cnt = 0,
861 			error_cnt = 0;
862 
863 	hs = NODEV64;
864 	dev = NODEV64;
865 
866 	/* clear out bogus pointer incase we return(1) prior to alloc */
867 	un->mr_ic = NULL;
868 
869 	if (MD_STATUS(un) & MD_UN_BEING_RESET) {
870 		mddb_setrecprivate(un->c.un_record_id, MD_PRV_PENDCLEAN);
871 		return (1);
872 	}
873 
874 	if (MD_UNIT(mnum) != NULL)
875 		return (0);
876 
877 	if (snarfing)
878 		MD_STATUS(un) = 0;
879 
880 	un->mr_ic = (mr_unit_ic_t *)kmem_zalloc(sizeof (*un->mr_ic),
881 	    KM_SLEEP);
882 
883 	un->un_column_ic = (mr_column_ic_t *)
884 	    kmem_zalloc(sizeof (mr_column_ic_t) *
885 		un->un_totalcolumncnt, KM_SLEEP);
886 
887 	for (i = 0; i < un->un_totalcolumncnt; i++) {
888 
889 		column	= &un->un_column[i];
890 		preserve_flags = column->un_devflags &
891 		    (MD_RAID_COPY_RESYNC | MD_RAID_REGEN_RESYNC);
892 		column->un_devflags &=
893 		    ~(MD_RAID_ALT_ISOPEN | MD_RAID_DEV_ISOPEN |
894 		    MD_RAID_WRITE_ALT);
895 		if (raid_build_pw_reservation(un, i) != 0) {
896 			/* could not build pwslot */
897 			return (1);
898 		}
899 
900 		if (snarfing) {
901 			set_t		setno = MD_MIN2SET(mnum);
902 			dev =  md_getdevnum(setno, mddb_getsidenum(setno),
903 			    column->un_orig_key, MD_NOTRUST_DEVT);
904 			/*
905 			 * Comment out instead of remove so we have history
906 			 * In the pre-SVM releases stored devt is used so
907 			 * as long as there is one snarf is always happy
908 			 * even the component is powered off.  This is not
909 			 * the case in current SVM implementation.  NODEV64
910 			 * can be returned and in this case since we resolve
911 			 * the devt at 'open' time (first use of metadevice)
912 			 * we will allow snarf continue.
913 			 *
914 			 * if (dev == NODEV64)
915 			 *	return (1);
916 			 */
917 
918 			/*
919 			 * Setup un_orig_dev from device id info if the device
920 			 * is valid (not NODEV64).
921 			 */
922 			if (dev != NODEV64)
923 				column->un_orig_dev = dev;
924 
925 			if (column->un_devstate & RCS_RESYNC)
926 				resync_cnt++;
927 			if (column->un_devstate & (RCS_ERRED | RCS_LAST_ERRED))
928 				error_cnt++;
929 
930 			if (HOTSPARED(un, i)) {
931 				(void) md_hot_spare_ifc(HS_MKDEV,
932 				    0, 0, 0, &column->un_hs_id, NULL,
933 				    &hs, NULL);
934 				/*
935 				 * Same here
936 				 *
937 				 * if (hs == NODEV64)
938 				 *	return (1);
939 				 */
940 			}
941 
942 			if (HOTSPARED(un, i)) {
943 				if (column->un_devstate &
944 				    (RCS_OKAY | RCS_LAST_ERRED)) {
945 					column->un_dev = hs;
946 					column->un_pwstart =
947 					    column->un_hs_pwstart;
948 					column->un_devstart =
949 					    column->un_hs_devstart;
950 					preserve_flags &=
951 					    ~(MD_RAID_COPY_RESYNC |
952 					    MD_RAID_REGEN_RESYNC);
953 				} else  if (column->un_devstate & RCS_RESYNC) {
954 					/*
955 					 * if previous system was 4.0 set
956 					 * the direction flags
957 					 */
958 					if ((preserve_flags &
959 					    (MD_RAID_COPY_RESYNC |
960 					    MD_RAID_REGEN_RESYNC)) == 0) {
961 					if (column->un_alt_dev != NODEV64)
962 						preserve_flags |=
963 						MD_RAID_COPY_RESYNC;
964 					else
965 					    preserve_flags |=
966 						MD_RAID_REGEN_RESYNC;
967 					}
968 				}
969 			} else { /* no hot spares */
970 				column->un_dev = dev;
971 				column->un_pwstart = column->un_orig_pwstart;
972 				column->un_devstart = column->un_orig_devstart;
973 				if (column->un_devstate & RCS_RESYNC) {
974 					preserve_flags |= MD_RAID_REGEN_RESYNC;
975 					preserve_flags &= ~MD_RAID_COPY_RESYNC;
976 				}
977 			}
978 			if (! (column->un_devstate & RCS_RESYNC)) {
979 				preserve_flags &=
980 				    ~(MD_RAID_REGEN_RESYNC |
981 				    MD_RAID_COPY_RESYNC);
982 			}
983 
984 			column->un_devflags = preserve_flags;
985 			column->un_alt_dev = NODEV64;
986 			column->un_alt_pwstart = 0;
987 			column->un_alt_devstart = 0;
988 			un->un_resync_line_index = 0;
989 			un->un_resync_index = 0;
990 			un->un_percent_done = 0;
991 		}
992 	}
993 
994 	if (resync_cnt && error_cnt) {
995 		for (i = 0; i < un->un_totalcolumncnt; i++) {
996 			column  = &un->un_column[i];
997 			if (HOTSPARED(un, i) &&
998 			    (column->un_devstate & RCS_RESYNC) &&
999 			    (column->un_devflags & MD_RAID_COPY_RESYNC))
1000 				/* hotspare has data */
1001 				continue;
1002 
1003 			if (HOTSPARED(un, i) &&
1004 			    (column->un_devstate & RCS_RESYNC)) {
1005 				/* hotspare does not have data */
1006 				raid_hs_release(HS_FREE, un, &hs_recid, i);
1007 				column->un_dev = column->un_orig_dev;
1008 				column->un_pwstart = column->un_orig_pwstart;
1009 				column->un_devstart = column->un_orig_devstart;
1010 				mddb_setrecprivate(hs_recid, MD_PRV_PENDCOM);
1011 			}
1012 
1013 			if (column->un_devstate & RCS_ERRED)
1014 				column->un_devstate = RCS_LAST_ERRED;
1015 
1016 			if (column->un_devstate & RCS_RESYNC)
1017 				column->un_devstate = RCS_ERRED;
1018 		}
1019 	}
1020 	mddb_setrecprivate(un->c.un_record_id, MD_PRV_PENDCOM);
1021 
1022 	un->un_pwid = 1; /* or some other possible value */
1023 	un->un_magic = RAID_UNMAGIC;
1024 	iosize = un->un_iosize;
1025 	un->un_pbuffer = kmem_alloc(dbtob(iosize), KM_SLEEP);
1026 	un->un_dbuffer = kmem_alloc(dbtob(iosize), KM_SLEEP);
1027 	mutex_init(&un->un_linlck_mx, NULL, MUTEX_DEFAULT, NULL);
1028 	cv_init(&un->un_linlck_cv, NULL, CV_DEFAULT, NULL);
1029 	un->un_linlck_chn = NULL;
1030 	MD_UNIT(mnum) = un;
1031 
1032 
1033 	return (0);
1034 }
1035 
1036 /*
1037  * NAMES:	reset_raid
1038  * DESCRIPTION: RAID metadevice reset routine
1039  * PARAMETERS:	mr_unit_t *un - pointer to a unit structure
1040  *		minor_t mnum - RAID metadevice minor number
1041  *		int removing - a flag to imply removing device name from
1042  *			MDDB database.
1043  */
1044 void
1045 reset_raid(mr_unit_t *un, minor_t mnum, int removing)
1046 {
1047 	int		i, n = 0;
1048 	sv_dev_t	*sv;
1049 	mr_column_t	*column;
1050 	int		column_cnt = un->un_totalcolumncnt;
1051 	mddb_recid_t	*recids, vtoc_id;
1052 	int		hserr;
1053 
1054 	ASSERT((MDI_UNIT(mnum)->ui_io_lock->io_list_front == NULL) &&
1055 	    (MDI_UNIT(mnum)->ui_io_lock->io_list_back == NULL));
1056 
1057 	md_destroy_unit_incore(mnum, &raid_md_ops);
1058 
1059 	MD_UNIT(mnum) = NULL;
1060 
1061 	if (un->un_pbuffer) {
1062 		kmem_free(un->un_pbuffer, dbtob(un->un_iosize));
1063 		un->un_pbuffer = NULL;
1064 	}
1065 	if (un->un_dbuffer) {
1066 		kmem_free(un->un_dbuffer, dbtob(un->un_iosize));
1067 		un->un_dbuffer = NULL;
1068 	}
1069 
1070 	/* free all pre-write slots created during build incore */
1071 	for (i = 0; i < un->un_totalcolumncnt; i++)
1072 		raid_free_pw_reservation(un, i);
1073 
1074 	kmem_free(un->un_column_ic, sizeof (mr_column_ic_t) *
1075 		un->un_totalcolumncnt);
1076 
1077 	kmem_free(un->mr_ic, sizeof (*un->mr_ic));
1078 
1079 	if (!removing)
1080 		return;
1081 
1082 	sv = (sv_dev_t *)kmem_zalloc((column_cnt + 1) * sizeof (sv_dev_t),
1083 	    KM_SLEEP);
1084 
1085 	recids = (mddb_recid_t *)
1086 	    kmem_zalloc((column_cnt + 2) * sizeof (mddb_recid_t), KM_SLEEP);
1087 
1088 	for (i = 0; i < column_cnt; i++) {
1089 		md_unit_t	*comp_un;
1090 		md_dev64_t	comp_dev;
1091 
1092 		column = &un->un_column[i];
1093 		sv[i].setno = MD_MIN2SET(mnum);
1094 		sv[i].key = column->un_orig_key;
1095 		if (HOTSPARED(un, i)) {
1096 			if (column->un_devstate & (RCS_ERRED | RCS_LAST_ERRED))
1097 				hserr = HS_BAD;
1098 			else
1099 				hserr = HS_FREE;
1100 			raid_hs_release(hserr, un, &recids[n++], i);
1101 		}
1102 		/*
1103 		 * deparent any metadevices.
1104 		 * NOTE: currently soft partitions are the only metadevices
1105 		 * allowed in RAID metadevices.
1106 		 */
1107 		comp_dev = column->un_dev;
1108 		if (md_getmajor(comp_dev) == md_major) {
1109 			comp_un = MD_UNIT(md_getminor(comp_dev));
1110 			recids[n++] = MD_RECID(comp_un);
1111 			md_reset_parent(comp_dev);
1112 		}
1113 	}
1114 	/* decrement the reference count of the old hsp */
1115 	if (un->un_hsp_id != -1)
1116 		(void) md_hot_spare_ifc(HSP_DECREF, un->un_hsp_id, 0, 0,
1117 		    &recids[n++], NULL, NULL, NULL);
1118 	recids[n] = 0;
1119 	MD_STATUS(un) |= MD_UN_BEING_RESET;
1120 	vtoc_id = un->c.un_vtoc_id;
1121 
1122 	raid_commit(un, recids);
1123 
1124 
1125 	/* Remove the unit structure */
1126 	mddb_deleterec_wrapper(un->c.un_record_id);
1127 
1128 	/* Remove the vtoc, if present */
1129 	if (vtoc_id)
1130 		mddb_deleterec_wrapper(vtoc_id);
1131 	md_rem_names(sv, column_cnt);
1132 	kmem_free(sv, (column_cnt + 1) * sizeof (sv_dev_t));
1133 	kmem_free(recids, (column_cnt + 2) * sizeof (mddb_recid_t));
1134 
1135 	SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_DELETE, SVM_TAG_METADEVICE,
1136 	    MD_MIN2SET(mnum), mnum);
1137 }
1138 
1139 /*
1140  * NAMES:	raid_error_parent
1141  * DESCRIPTION: mark a parent structure in error
1142  * PARAMETERS:	md_raidcs_t *cs - pointer to child structure
1143  *		int	error - error value to set
1144  * NOTE:	(TBR) - this routine currently is not in use.
1145  */
1146 static void
1147 raid_error_parent(md_raidps_t *ps, int error)
1148 {
1149 	mutex_enter(&ps->ps_mx);
1150 	ps->ps_flags |= MD_RPS_ERROR;
1151 	ps->ps_error = error;
1152 	mutex_exit(&ps->ps_mx);
1153 }
1154 
/*
 * 'todo' bits accepted by raid_free_parent():
 *	RFP_RLS_LOCK		release the unit's I/O reader lock when done.
 *	RFP_DECR_PWFRAGS	decrement ps_pwfrags
 *	RFP_DECR_FRAGS		decrement ps_frags
 *	RFP_DECR_READFRAGS	decrement both; a read keeps FRAGS and
 *				PWFRAGS in lockstep
 */
#define	RFP_RLS_LOCK		0x01
#define	RFP_DECR_PWFRAGS	0x02
#define	RFP_DECR_FRAGS		0x04
#define	RFP_DECR_READFRAGS	(RFP_DECR_PWFRAGS | RFP_DECR_FRAGS)
1166 
1167 /*
1168  * NAMES:	raid_free_parent
1169  * DESCRIPTION: free a parent structure
1170  * PARAMETERS:	md_raidcs_t *cs - pointer to child structure
1171  *		int	todo - indicates what needs to be done
1172  */
1173 static void
1174 raid_free_parent(md_raidps_t *ps, int todo)
1175 {
1176 	mdi_unit_t	*ui = ps->ps_ui;
1177 
1178 	ASSERT(ps->ps_magic == RAID_PSMAGIC);
1179 	ASSERT(ps->ps_flags & MD_RPS_INUSE);
1180 	mutex_enter(&ps->ps_mx);
1181 	if (todo & RFP_DECR_PWFRAGS) {
1182 		ASSERT(ps->ps_pwfrags);
1183 		ps->ps_pwfrags--;
1184 		if (ps->ps_pwfrags == 0 && (! (ps->ps_flags & MD_RPS_IODONE))) {
1185 			if (ps->ps_flags & MD_RPS_ERROR) {
1186 				ps->ps_bp->b_flags |= B_ERROR;
1187 				ps->ps_bp->b_error = ps->ps_error;
1188 			}
1189 			md_kstat_done(ui, ps->ps_bp, 0);
1190 			biodone(ps->ps_bp);
1191 			ps->ps_flags |= MD_RPS_IODONE;
1192 		}
1193 	}
1194 
1195 	if (todo & RFP_DECR_FRAGS) {
1196 		ASSERT(ps->ps_frags);
1197 		ps->ps_frags--;
1198 	}
1199 
1200 	if (ps->ps_frags != 0) {
1201 		mutex_exit(&ps->ps_mx);
1202 		return;
1203 	}
1204 
1205 	ASSERT((ps->ps_frags == 0) && (ps->ps_pwfrags == 0));
1206 	mutex_exit(&ps->ps_mx);
1207 
1208 	if (todo & RFP_RLS_LOCK)
1209 		md_io_readerexit(ui);
1210 
1211 	if (panicstr) {
1212 		ps->ps_flags |= MD_RPS_DONE;
1213 		return;
1214 	}
1215 
1216 	if (ps->ps_flags & MD_RPS_HSREQ)
1217 		(void) raid_hotspares();
1218 
1219 	ASSERT(todo & RFP_RLS_LOCK);
1220 	ps->ps_flags &= ~MD_RPS_INUSE;
1221 
1222 	md_dec_iocount(MD_MIN2SET(ps->ps_un->c.un_self_id));
1223 
1224 	kmem_cache_free(raid_parent_cache, ps);
1225 }
1226 
1227 /*
1228  * NAMES:	raid_free_child
1229  * DESCRIPTION: free a parent structure
1230  * PARAMETERS:	md_raidcs_t *cs - pointer to child structure
1231  *		int drop_locks	- 0 for no locks held
1232  * NOTE:	(TBR) - this routine currently is not in use.
1233  */
1234 static void
1235 raid_free_child(md_raidcs_t *cs, int drop_locks)
1236 {
1237 	mr_unit_t	*un = cs->cs_un;
1238 	md_raidcbuf_t	*cbuf, *cbuf1;
1239 
1240 	if (cs->cs_pw_inval_list)
1241 		raid_free_pwinvalidate(cs);
1242 
1243 	if (drop_locks) {
1244 		ASSERT(cs->cs_flags & MD_RCS_LLOCKD &&
1245 		    (cs->cs_flags & (MD_RCS_READER | MD_RCS_WRITER)));
1246 		md_unit_readerexit(MDI_UNIT(MD_SID(un)));
1247 		raid_line_exit(cs);
1248 	} else {
1249 		ASSERT(!(cs->cs_flags & MD_RCS_LLOCKD));
1250 	}
1251 
1252 	freebuffers(cs);
1253 	cbuf = cs->cs_buflist;
1254 	while (cbuf) {
1255 		cbuf1 = cbuf->cbuf_next;
1256 		kmem_cache_free(raid_cbuf_cache, cbuf);
1257 		cbuf = cbuf1;
1258 	}
1259 	if (cs->cs_dbuf.b_flags & B_REMAPPED)
1260 		bp_mapout(&cs->cs_dbuf);
1261 	kmem_cache_free(raid_child_cache, cs);
1262 }
1263 
1264 /*
1265  * NAME:	raid_regen_parity
1266  *
1267  * DESCRIPTION:	This routine is used to regenerate the parity blocks
1268  *		for the entire raid device.  It is called from
1269  *		both the regen thread and the IO path.
1270  *
1271  *		On error the entire device is marked as in error by
1272  *		placing the erroring device in error and all other
1273  *		devices in last_errored.
1274  *
1275  * PARAMETERS:	md_raidcs_t	*cs
1276  */
1277 void
1278 raid_regen_parity(md_raidcs_t *cs)
1279 {
1280 	mr_unit_t	*un = cs->cs_un;
1281 	mdi_unit_t	*ui = MDI_UNIT(un->c.un_self_id);
1282 	caddr_t		buffer;
1283 	caddr_t		parity_buffer;
1284 	buf_t		*bp;
1285 	uint_t		*dbuf, *pbuf;
1286 	uint_t		colcnt = un->un_totalcolumncnt;
1287 	int		column;
1288 	int		parity_column = cs->cs_pcolumn;
1289 	size_t		bcount;
1290 	int		j;
1291 
1292 	/*
1293 	 * This routine uses the data and parity buffers allocated to a
1294 	 * write.  In the case of a read the buffers are allocated and
1295 	 * freed at the end.
1296 	 */
1297 
1298 	ASSERT(IO_READER_HELD(un));
1299 	ASSERT(cs->cs_flags & MD_RCS_LLOCKD);
1300 	ASSERT(UNIT_READER_HELD(un));
1301 
1302 	if (raid_state_cnt(un, RCS_OKAY) != colcnt)
1303 		return;
1304 
1305 	if (cs->cs_flags & MD_RCS_READER) {
1306 		getpbuffer(cs);
1307 		getdbuffer(cs);
1308 	}
1309 	ASSERT(cs->cs_dbuffer && cs->cs_pbuffer);
1310 	bcount = cs->cs_bcount;
1311 	buffer = cs->cs_dbuffer;
1312 	parity_buffer = cs->cs_pbuffer;
1313 	bzero(parity_buffer, bcount);
1314 	bp = &cs->cs_dbuf;
1315 	for (column = 0; column < colcnt; column++) {
1316 		if (column == parity_column)
1317 			continue;
1318 		reset_buf(bp, B_READ | B_BUSY, bcount);
1319 		bp->b_un.b_addr = buffer;
1320 		bp->b_edev = md_dev64_to_dev(un->un_column[column].un_dev);
1321 		bp->b_lblkno = cs->cs_blkno + un->un_column[column].un_devstart;
1322 		bp->b_bcount = bcount;
1323 		bp->b_bufsize = bcount;
1324 		(void) md_call_strategy(bp, MD_STR_NOTTOP, NULL);
1325 		if (biowait(bp))
1326 			goto bail;
1327 		pbuf = (uint_t *)(void *)parity_buffer;
1328 		dbuf = (uint_t *)(void *)buffer;
1329 		for (j = 0; j < (bcount / (sizeof (uint_t))); j++) {
1330 			*pbuf = *pbuf ^ *dbuf;
1331 			pbuf++;
1332 			dbuf++;
1333 		}
1334 	}
1335 
1336 	reset_buf(bp, B_WRITE | B_BUSY, cs->cs_bcount);
1337 	bp->b_un.b_addr = parity_buffer;
1338 	bp->b_edev = md_dev64_to_dev(un->un_column[parity_column].un_dev);
1339 	bp->b_lblkno = cs->cs_blkno + un->un_column[parity_column].un_devstart;
1340 	bp->b_bcount = bcount;
1341 	bp->b_bufsize = bcount;
1342 	(void) md_call_strategy(bp, MD_STR_NOTTOP, NULL);
1343 	if (biowait(bp))
1344 		goto bail;
1345 
1346 	if (cs->cs_flags & MD_RCS_READER) {
1347 		freebuffers(cs);
1348 		cs->cs_pbuffer = NULL;
1349 		cs->cs_dbuffer = NULL;
1350 	}
1351 	bp->b_chain = (struct buf *)cs;
1352 	return;
1353 bail:
1354 	if (cs->cs_flags & MD_RCS_READER) {
1355 		freebuffers(cs);
1356 		cs->cs_pbuffer = NULL;
1357 		cs->cs_dbuffer = NULL;
1358 	}
1359 	md_unit_readerexit(ui);
1360 	un = md_unit_writerlock(ui);
1361 	raid_set_state(un, column, RCS_ERRED, 0);
1362 	for (column = 0; column < colcnt; column++)
1363 		raid_set_state(un, column, RCS_ERRED, 0);
1364 	raid_commit(un, NULL);
1365 	md_unit_writerexit(ui);
1366 	un = md_unit_readerlock(ui);
1367 	bp->b_chain = (struct buf *)cs;
1368 }
1369 
1370 /*
1371  * NAMES:	raid_error_state
1372  * DESCRIPTION: check unit and column states' impact on I/O error
1373  *		NOTE:	the state now may not be the state when the
1374  *			I/O completed due to race conditions.
1375  * PARAMETERS:	mr_unit_t *un - pointer to raid unit structure
1376  *		md_raidcs_t *cs - pointer to child structure
1377  *		buf_t	  *bp - pointer to buffer structure
1378  */
1379 static int
1380 raid_error_state(mr_unit_t *un, buf_t *bp)
1381 {
1382 	int		column;
1383 	int		i;
1384 
1385 	ASSERT(IO_READER_HELD(un));
1386 	ASSERT(UNIT_WRITER_HELD(un));
1387 
1388 	column = -1;
1389 	for (i = 0; i < un->un_totalcolumncnt; i++) {
1390 		if (un->un_column[i].un_dev == md_expldev(bp->b_edev)) {
1391 			column = i;
1392 			break;
1393 		}
1394 		if (un->un_column[i].un_alt_dev == md_expldev(bp->b_edev)) {
1395 			column = i;
1396 			break;
1397 		}
1398 	}
1399 
1400 	/* in case a replace snuck in while waiting on unit writer lock */
1401 
1402 	if (column == -1) {
1403 		return (0);
1404 	}
1405 
1406 	(void) raid_set_state(un, column, RCS_ERRED, 0);
1407 	ASSERT(un->un_state & (RUS_ERRED | RUS_LAST_ERRED));
1408 
1409 	raid_commit(un, NULL);
1410 	if (un->un_state & RUS_ERRED) {
1411 		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED, SVM_TAG_METADEVICE,
1412 		    MD_UN2SET(un), MD_SID(un));
1413 	} else if (un->un_state & RUS_LAST_ERRED) {
1414 		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_LASTERRED, SVM_TAG_METADEVICE,
1415 		    MD_UN2SET(un), MD_SID(un));
1416 	}
1417 
1418 	return (EIO);
1419 }
1420 
1421 /*
1422  * NAME:	raid_mapin_buf
1423  * DESCRIPTION:	wait for the input buffer header to be maped in
1424  * PARAMETERS:	md_raidps_t *ps
1425  */
1426 static void
1427 raid_mapin_buf(md_raidcs_t *cs)
1428 {
1429 	md_raidps_t	*ps = cs->cs_ps;
1430 
1431 	/*
1432 	 * check to see if the buffer is maped.  If all is ok return the
1433 	 * offset of the data and return.  Since it is expensive to grab
1434 	 * a mutex this is only done if the mapin is not complete.
1435 	 * Once the mutex is aquired it is possible that the mapin was
1436 	 * not done so recheck and if necessary do the mapin.
1437 	 */
1438 	if (ps->ps_mapin > 0) {
1439 		cs->cs_addr = ps->ps_addr + cs->cs_offset;
1440 		return;
1441 	}
1442 	mutex_enter(&ps->ps_mapin_mx);
1443 	if (ps->ps_mapin > 0) {
1444 		cs->cs_addr = ps->ps_addr + cs->cs_offset;
1445 		mutex_exit(&ps->ps_mapin_mx);
1446 		return;
1447 	}
1448 	bp_mapin(ps->ps_bp);
1449 	/*
1450 	 * get the new b_addr out of the parent since bp_mapin just changed it
1451 	 */
1452 	ps->ps_addr = ps->ps_bp->b_un.b_addr;
1453 	cs->cs_addr = ps->ps_addr + cs->cs_offset;
1454 	ps->ps_mapin++;
1455 	mutex_exit(&ps->ps_mapin_mx);
1456 }
1457 
1458 /*
1459  * NAMES:	raid_read_no_retry
1460  * DESCRIPTION: I/O retry routine for a RAID metadevice read
1461  *		read failed attempting to regenerate the data,
1462  *		no retry possible, error occured in raid_raidregenloop().
1463  * PARAMETERS:	mr_unit_t   *un - pointer to raid unit structure
1464  *		md_raidcs_t *cs - pointer to child structure
1465  */
1466 /*ARGSUSED*/
1467 static void
1468 raid_read_no_retry(mr_unit_t *un, md_raidcs_t *cs)
1469 {
1470 	md_raidps_t	*ps = cs->cs_ps;
1471 
1472 	raid_error_parent(ps, EIO);
1473 	raid_free_child(cs, 1);
1474 
1475 	/* decrement readfrags */
1476 	raid_free_parent(ps, RFP_DECR_READFRAGS | RFP_RLS_LOCK);
1477 }
1478 
1479 /*
1480  * NAMES:	raid_read_retry
1481  * DESCRIPTION: I/O retry routine for a RAID metadevice read
1482  * PARAMETERS:	md_raidcs_t *cs - pointer to child structure
1483  */
1484 static void
1485 raid_read_retry(mr_unit_t *un, md_raidcs_t *cs)
1486 {
1487 	/* re-initialize the buf_t structure for raid_read() */
1488 	cs->cs_dbuf.b_chain = (struct buf *)cs;
1489 	cs->cs_dbuf.b_back = &cs->cs_dbuf;
1490 	cs->cs_dbuf.b_forw = &cs->cs_dbuf;
1491 	cs->cs_dbuf.b_flags = B_BUSY;	/* initialize flags */
1492 	cs->cs_dbuf.b_error = 0;	/* initialize error */
1493 	cs->cs_dbuf.b_offset = -1;
1494 	/* Initialize semaphores */
1495 	sema_init(&cs->cs_dbuf.b_io, 0, NULL,
1496 	    SEMA_DEFAULT, NULL);
1497 	sema_init(&cs->cs_dbuf.b_sem, 0, NULL,
1498 	    SEMA_DEFAULT, NULL);
1499 
1500 	cs->cs_pbuf.b_chain = (struct buf *)cs;
1501 	cs->cs_pbuf.b_back = &cs->cs_pbuf;
1502 	cs->cs_pbuf.b_forw = &cs->cs_pbuf;
1503 	cs->cs_pbuf.b_flags = B_BUSY;	/* initialize flags */
1504 	cs->cs_pbuf.b_error = 0;	/* initialize error */
1505 	cs->cs_pbuf.b_offset = -1;
1506 	sema_init(&cs->cs_pbuf.b_io, 0, NULL,
1507 	    SEMA_DEFAULT, NULL);
1508 	sema_init(&cs->cs_pbuf.b_sem, 0, NULL,
1509 	    SEMA_DEFAULT, NULL);
1510 
1511 	cs->cs_flags &= ~MD_RCS_ERROR;	/* reset child error flag */
1512 	cs->cs_flags |= MD_RCS_RECOVERY;  /* set RECOVERY flag */
1513 
1514 	/*
1515 	 * re-scheduling I/O with raid_read_io() is simpler. basically,
1516 	 * raid_read_io() is invoked again with same child structure.
1517 	 * (NOTE: we aren`t supposed to do any error recovery when an I/O
1518 	 * error occured in raid_raidregenloop().
1519 	 */
1520 	raid_mapin_buf(cs);
1521 	raid_read_io(un, cs);
1522 }
1523 
1524 /*
1525  * NAMES:	raid_rderr
1526  * DESCRIPTION: I/O error handling routine for a RAID metadevice read
1527  * PARAMETERS:	md_raidcs_t *cs - pointer to child structure
1528  * LOCKS:	must obtain unit writer lock while calling raid_error_state
1529  *		since a unit or column state transition may take place.
1530  *		must obtain unit reader lock to retry I/O.
1531  */
1532 /*ARGSUSED*/
1533 static void
1534 raid_rderr(md_raidcs_t *cs)
1535 {
1536 	md_raidps_t	*ps;
1537 	mdi_unit_t	*ui;
1538 	mr_unit_t	*un;
1539 	int		error = 0;
1540 
1541 	ps = cs->cs_ps;
1542 	ui = ps->ps_ui;
1543 	un = (mr_unit_t *)md_unit_writerlock(ui);
1544 	ASSERT(un != 0);
1545 
1546 	if (cs->cs_dbuf.b_flags & B_ERROR)
1547 		error = raid_error_state(un, &cs->cs_dbuf);
1548 	if (cs->cs_pbuf.b_flags & B_ERROR)
1549 		error |= raid_error_state(un, &cs->cs_pbuf);
1550 
1551 	md_unit_writerexit(ui);
1552 
1553 	ps->ps_flags |= MD_RPS_HSREQ;
1554 
1555 	un = (mr_unit_t *)md_unit_readerlock(ui);
1556 	ASSERT(un != 0);
1557 	/* now attempt the appropriate retry routine */
1558 	(*(cs->cs_retry_call))(un, cs);
1559 }
1560 
1561 
1562 /*
1563  * NAMES:	raid_read_error
1564  * DESCRIPTION: I/O error handling routine for a RAID metadevice read
1565  * PARAMETERS:	md_raidcs_t *cs - pointer to child structure
1566  */
1567 /*ARGSUSED*/
1568 static void
1569 raid_read_error(md_raidcs_t *cs)
1570 {
1571 	md_raidps_t	*ps;
1572 	mdi_unit_t	*ui;
1573 	mr_unit_t	*un;
1574 	set_t		setno;
1575 
1576 	ps = cs->cs_ps;
1577 	ui = ps->ps_ui;
1578 	un = cs->cs_un;
1579 
1580 	setno = MD_UN2SET(un);
1581 
1582 	if ((cs->cs_dbuf.b_flags & B_ERROR) &&
1583 	    (COLUMN_STATE(un, cs->cs_dcolumn) != RCS_ERRED) &&
1584 	    (COLUMN_STATE(un, cs->cs_dcolumn) != RCS_LAST_ERRED))
1585 		cmn_err(CE_WARN, "md %s: read error on %s",
1586 		    md_shortname(MD_SID(un)),
1587 		    md_devname(setno, md_expldev(cs->cs_dbuf.b_edev), NULL, 0));
1588 
1589 	if ((cs->cs_pbuf.b_flags & B_ERROR) &&
1590 	    (COLUMN_STATE(un, cs->cs_pcolumn) != RCS_ERRED) &&
1591 	    (COLUMN_STATE(un, cs->cs_pcolumn) != RCS_LAST_ERRED))
1592 		cmn_err(CE_WARN, "md %s: read error on %s",
1593 		    md_shortname(MD_SID(un)),
1594 		    md_devname(setno, md_expldev(cs->cs_pbuf.b_edev), NULL, 0));
1595 
1596 	md_unit_readerexit(ui);
1597 
1598 	ASSERT(cs->cs_frags == 0);
1599 
1600 	/* now schedule processing for possible state change */
1601 	daemon_request(&md_mstr_daemon, raid_rderr,
1602 		(daemon_queue_t *)cs, REQ_OLD);
1603 
1604 }
1605 
1606 /*
1607  * NAMES:	getdbuffer
1608  * DESCRIPTION: data buffer allocation for a child structure
1609  * PARAMETERS:	md_raidcs_t *cs - pointer to child structure
1610  *
1611  * NOTE: always get dbuffer before pbuffer
1612  *	 and get both buffers before pwslot
1613  *	 otherwise a deadlock could be introduced.
1614  */
1615 static void
1616 getdbuffer(md_raidcs_t *cs)
1617 {
1618 	mr_unit_t	*un;
1619 
1620 	cs->cs_dbuffer = kmem_alloc(cs->cs_bcount + DEV_BSIZE, KM_NOSLEEP);
1621 	if (cs->cs_dbuffer != NULL)
1622 		return;
1623 	un = cs->cs_ps->ps_un;
1624 	mutex_enter(&un->un_mx);
1625 	while (un->un_dbuffer == NULL) {
1626 		STAT_INC(data_buffer_waits);
1627 		un->un_rflags |= MD_RFLAG_NEEDBUF;
1628 		cv_wait(&un->un_cv, &un->un_mx);
1629 	}
1630 	cs->cs_dbuffer = un->un_dbuffer;
1631 	cs->cs_flags |= MD_RCS_UNDBUF;
1632 	un->un_dbuffer = NULL;
1633 	mutex_exit(&un->un_mx);
1634 }
1635 
1636 /*
1637  * NAMES:	getpbuffer
1638  * DESCRIPTION: parity buffer allocation for a child structure
1639  * PARAMETERS:	md_raidcs_t *cs - pointer to child structure
1640  *
1641  * NOTE: always get dbuffer before pbuffer
1642  *	 and get both buffers before pwslot
1643  *	 otherwise a deadlock could be introduced.
1644  */
1645 static void
1646 getpbuffer(md_raidcs_t *cs)
1647 {
1648 	mr_unit_t *un;
1649 
1650 	cs->cs_pbuffer = kmem_alloc(cs->cs_bcount + DEV_BSIZE, KM_NOSLEEP);
1651 	if (cs->cs_pbuffer != NULL)
1652 		return;
1653 	un = cs->cs_ps->ps_un;
1654 	mutex_enter(&un->un_mx);
1655 	while (un->un_pbuffer == NULL) {
1656 		STAT_INC(parity_buffer_waits);
1657 		un->un_rflags |= MD_RFLAG_NEEDBUF;
1658 		cv_wait(&un->un_cv, &un->un_mx);
1659 	}
1660 	cs->cs_pbuffer = un->un_pbuffer;
1661 	cs->cs_flags |= MD_RCS_UNPBUF;
1662 	un->un_pbuffer = NULL;
1663 	mutex_exit(&un->un_mx);
1664 }
1665 static void
1666 getresources(md_raidcs_t *cs)
1667 {
1668 	md_raidcbuf_t	*cbuf;
1669 	/*
1670 	 * NOTE: always get dbuffer before pbuffer
1671 	 *	 and get both buffers before pwslot
1672 	 *	 otherwise a deadlock could be introduced.
1673 	 */
1674 	getdbuffer(cs);
1675 	getpbuffer(cs);
1676 	for (cbuf = cs->cs_buflist; cbuf; cbuf = cbuf->cbuf_next)
1677 		cbuf->cbuf_buffer =
1678 		    kmem_alloc(cs->cs_bcount + DEV_BSIZE, KM_SLEEP);
1679 }
1680 /*
1681  * NAMES:	freebuffers
1682  * DESCRIPTION: child structure buffer freeing routine
1683  * PARAMETERS:	md_raidcs_t *cs - pointer to child structure
1684  */
1685 static void
1686 freebuffers(md_raidcs_t *cs)
1687 {
1688 	mr_unit_t	*un;
1689 	md_raidcbuf_t	*cbuf;
1690 
1691 	/* free buffers used for full line write */
1692 	for (cbuf = cs->cs_buflist; cbuf; cbuf = cbuf->cbuf_next) {
1693 		if (cbuf->cbuf_buffer == NULL)
1694 			continue;
1695 		kmem_free(cbuf->cbuf_buffer, cbuf->cbuf_bcount + DEV_BSIZE);
1696 		cbuf->cbuf_buffer = NULL;
1697 		cbuf->cbuf_bcount = 0;
1698 	}
1699 
1700 	if (cs->cs_flags & (MD_RCS_UNDBUF | MD_RCS_UNPBUF)) {
1701 		un = cs->cs_un;
1702 		mutex_enter(&un->un_mx);
1703 	}
1704 	if (cs->cs_dbuffer) {
1705 		if (cs->cs_flags & MD_RCS_UNDBUF)
1706 			un->un_dbuffer = cs->cs_dbuffer;
1707 		else
1708 			kmem_free(cs->cs_dbuffer, cs->cs_bcount + DEV_BSIZE);
1709 	}
1710 	if (cs->cs_pbuffer) {
1711 		if (cs->cs_flags & MD_RCS_UNPBUF)
1712 			un->un_pbuffer = cs->cs_pbuffer;
1713 		else
1714 			kmem_free(cs->cs_pbuffer, cs->cs_bcount + DEV_BSIZE);
1715 	}
1716 	if (cs->cs_flags & (MD_RCS_UNDBUF | MD_RCS_UNPBUF)) {
1717 		un->un_rflags &= ~MD_RFLAG_NEEDBUF;
1718 		cv_broadcast(&un->un_cv);
1719 		mutex_exit(&un->un_mx);
1720 	}
1721 }
1722 
1723 /*
1724  * NAMES:	raid_line_reader_lock, raid_line_writer_lock
1725  * DESCRIPTION: RAID metadevice line reader and writer lock routines
1726  *		data column # and parity column #.
1727  * PARAMETERS:	md_raidcs_t *cs - pointer to child structure
1728  */
1729 
1730 void
1731 raid_line_reader_lock(md_raidcs_t *cs, int resync_thread)
1732 {
1733 	mr_unit_t	*un;
1734 	md_raidcs_t	*cs1;
1735 
1736 	ASSERT(cs->cs_line != MD_DISKADDR_ERROR);
1737 	un = cs->cs_un;
1738 	cs->cs_flags |= MD_RCS_READER;
1739 	STAT_CHECK(raid_line_lock_wait, MUTEX_HELD(&un->un_linlck_mx));
1740 	if (!panicstr)
1741 		mutex_enter(&un->un_linlck_mx);
1742 	cs1 = un->un_linlck_chn;
1743 	while (cs1 != NULL) {
1744 		for (cs1 = un->un_linlck_chn; cs1; cs1 = cs1->cs_linlck_next)
1745 			if (raid_io_overlaps(cs, cs1) == 1)
1746 				if (cs1->cs_flags & MD_RCS_WRITER)
1747 					break;
1748 
1749 		if (cs1 != NULL) {
1750 			if (panicstr)
1751 				panic("md; raid line write lock held");
1752 			un->un_linlck_flg = 1;
1753 			cv_wait(&un->un_linlck_cv, &un->un_linlck_mx);
1754 			STAT_INC(raid_read_waits);
1755 		}
1756 	}
1757 	STAT_MAX(raid_max_reader_locks, raid_reader_locks_active);
1758 	STAT_INC(raid_reader_locks);
1759 	cs1 = un->un_linlck_chn;
1760 	if (cs1 != NULL)
1761 		cs1->cs_linlck_prev = cs;
1762 	cs->cs_linlck_next = cs1;
1763 	cs->cs_linlck_prev = NULL;
1764 	un->un_linlck_chn = cs;
1765 	cs->cs_flags |= MD_RCS_LLOCKD;
1766 	if (resync_thread) {
1767 		diskaddr_t lastblk = cs->cs_blkno + cs->cs_blkcnt - 1;
1768 		diskaddr_t line = (lastblk + 1) / un->un_segsize;
1769 		ASSERT(raid_state_cnt(un, RCS_RESYNC));
1770 		mutex_enter(&un->un_mx);
1771 		un->un_resync_line_index = line;
1772 		mutex_exit(&un->un_mx);
1773 	}
1774 	if (!panicstr)
1775 		mutex_exit(&un->un_linlck_mx);
1776 }
1777 
1778 int
1779 raid_line_writer_lock(md_raidcs_t *cs, int lock)
1780 {
1781 	mr_unit_t	*un;
1782 	md_raidcs_t	*cs1;
1783 
1784 	ASSERT(cs->cs_line != MD_DISKADDR_ERROR);
1785 	cs->cs_flags |= MD_RCS_WRITER;
1786 	un = cs->cs_ps->ps_un;
1787 
1788 	STAT_CHECK(raid_line_lock_wait, MUTEX_HELD(&un->un_linlck_mx));
1789 	if (lock && !panicstr)
1790 		mutex_enter(&un->un_linlck_mx);
1791 	ASSERT(MUTEX_HELD(&un->un_linlck_mx));
1792 
1793 	cs1 = un->un_linlck_chn;
1794 	for (cs1 = un->un_linlck_chn; cs1; cs1 = cs1->cs_linlck_next)
1795 		if (raid_io_overlaps(cs, cs1))
1796 			break;
1797 
1798 	if (cs1 != NULL) {
1799 		if (panicstr)
1800 			panic("md: line writer lock inaccessible");
1801 		goto no_lock_exit;
1802 	}
1803 
1804 	if (raid_alloc_pwslot(cs)) {
1805 		if (panicstr)
1806 			panic("md: no prewrite slots");
1807 		STAT_INC(raid_prewrite_waits);
1808 		goto no_lock_exit;
1809 	}
1810 
1811 	cs1 = un->un_linlck_chn;
1812 	if (cs1 != NULL)
1813 		cs1->cs_linlck_prev = cs;
1814 	cs->cs_linlck_next = cs1;
1815 	cs->cs_linlck_prev = NULL;
1816 	un->un_linlck_chn = cs;
1817 	cs->cs_flags |= MD_RCS_LLOCKD;
1818 	cs->cs_flags &= ~MD_RCS_WAITING;
1819 	STAT_INC(raid_writer_locks);
1820 	STAT_MAX(raid_max_write_locks, raid_write_locks_active);
1821 	if (lock && !panicstr)
1822 		mutex_exit(&un->un_linlck_mx);
1823 	return (0);
1824 
1825 no_lock_exit:
1826 	/* if this is already queued then do not requeue it */
1827 	ASSERT(! (cs->cs_flags & MD_RCS_LLOCKD));
1828 	if (!lock || (cs->cs_flags & MD_RCS_WAITING))
1829 		return (1);
1830 	cs->cs_flags |= MD_RCS_WAITING;
1831 	cs->cs_un = un;
1832 	raid_enqueue(cs);
1833 	if (lock && !panicstr)
1834 		mutex_exit(&un->un_linlck_mx);
1835 	return (1);
1836 }
1837 
1838 static void
1839 raid_startio(md_raidcs_t *cs)
1840 {
1841 	mdi_unit_t	*ui = cs->cs_ps->ps_ui;
1842 	mr_unit_t	*un = cs->cs_un;
1843 
1844 	un = md_unit_readerlock(ui);
1845 	raid_write_io(un, cs);
1846 }
1847 
1848 void
1849 raid_io_startup(mr_unit_t *un)
1850 {
1851 	md_raidcs_t	*waiting_list, *cs1;
1852 	md_raidcs_t	*previous = NULL, *next = NULL;
1853 	mdi_unit_t	*ui =  MDI_UNIT(un->c.un_self_id);
1854 	kmutex_t	*io_list_mutex = &ui->ui_io_lock->io_list_mutex;
1855 
1856 	ASSERT(MUTEX_HELD(&un->un_linlck_mx));
1857 	mutex_enter(io_list_mutex);
1858 
1859 	/*
1860 	 * check to be sure there are no reader locks outstanding.  If
1861 	 * there are not then pass on the writer lock.
1862 	 */
1863 	waiting_list = ui->ui_io_lock->io_list_front;
1864 	while (waiting_list) {
1865 		ASSERT(waiting_list->cs_flags & MD_RCS_WAITING);
1866 		ASSERT(! (waiting_list->cs_flags & MD_RCS_LLOCKD));
1867 		for (cs1 = un->un_linlck_chn; cs1; cs1 = cs1->cs_linlck_next)
1868 			if (raid_io_overlaps(waiting_list, cs1) == 1)
1869 				break;
1870 		/*
1871 		 * there was an IOs that overlaps this io so go onto
1872 		 * the next io in the waiting list
1873 		 */
1874 		if (cs1) {
1875 			previous = waiting_list;
1876 			waiting_list = waiting_list->cs_linlck_next;
1877 			continue;
1878 		}
1879 
1880 		/*
1881 		 * There are no IOs that overlap this, so remove it from
1882 		 * the waiting queue, and start it
1883 		 */
1884 
1885 		if (raid_check_pw(waiting_list)) {
1886 			ASSERT(waiting_list->cs_flags & MD_RCS_WAITING);
1887 			previous = waiting_list;
1888 			waiting_list = waiting_list->cs_linlck_next;
1889 			continue;
1890 		}
1891 		ASSERT(waiting_list->cs_flags & MD_RCS_WAITING);
1892 
1893 		next = waiting_list->cs_linlck_next;
1894 		if (previous)
1895 			previous->cs_linlck_next = next;
1896 		else
1897 			ui->ui_io_lock->io_list_front = next;
1898 
1899 		if (ui->ui_io_lock->io_list_front == NULL)
1900 			ui->ui_io_lock->io_list_back = NULL;
1901 
1902 		if (ui->ui_io_lock->io_list_back == waiting_list)
1903 			ui->ui_io_lock->io_list_back = previous;
1904 
1905 		waiting_list->cs_linlck_next = NULL;
1906 		waiting_list->cs_flags &= ~MD_RCS_WAITING;
1907 		STAT_DEC(raid_write_queue_length);
1908 		if (raid_line_writer_lock(waiting_list, 0))
1909 			panic("region locking corrupted");
1910 
1911 		ASSERT(waiting_list->cs_flags & MD_RCS_LLOCKD);
1912 		daemon_request(&md_mstr_daemon, raid_startio,
1913 		    (daemon_queue_t *)waiting_list, REQ_OLD);
1914 		waiting_list = next;
1915 
1916 	}
1917 	mutex_exit(io_list_mutex);
1918 }
1919 
1920 void
1921 raid_line_exit(md_raidcs_t *cs)
1922 {
1923 	mr_unit_t	*un;
1924 
1925 	un = cs->cs_ps->ps_un;
1926 	STAT_CHECK(raid_line_lock_wait, MUTEX_HELD(&un->un_linlck_mx));
1927 	mutex_enter(&un->un_linlck_mx);
1928 	if (cs->cs_flags & MD_RCS_READER)
1929 		STAT_DEC(raid_reader_locks_active);
1930 	else
1931 		STAT_DEC(raid_write_locks_active);
1932 
1933 	if (cs->cs_linlck_prev)
1934 		cs->cs_linlck_prev->cs_linlck_next = cs->cs_linlck_next;
1935 	else
1936 		un->un_linlck_chn = cs->cs_linlck_next;
1937 	if (cs->cs_linlck_next)
1938 		cs->cs_linlck_next->cs_linlck_prev = cs->cs_linlck_prev;
1939 
1940 	cs->cs_flags &= ~MD_RCS_LLOCKD;
1941 
1942 	if (un->un_linlck_flg)
1943 		cv_broadcast(&un->un_linlck_cv);
1944 
1945 	un->un_linlck_flg = 0;
1946 	cs->cs_line = MD_DISKADDR_ERROR;
1947 
1948 	raid_cancel_pwslot(cs);
1949 	/*
1950 	 * now that the lock is droped go ahead and see if there are any
1951 	 * other writes that can be started up
1952 	 */
1953 	raid_io_startup(un);
1954 
1955 	mutex_exit(&un->un_linlck_mx);
1956 }
1957 
1958 /*
1959  * NAMES:	raid_line, raid_pcolumn, raid_dcolumn
1960  * DESCRIPTION: RAID metadevice APIs for mapping segment # to line #,
1961  *		data column # and parity column #.
1962  * PARAMETERS:	int segment - segment number
1963  *		mr_unit_t *un - pointer to an unit structure
1964  * RETURNS:	raid_line returns line #
1965  *		raid_dcolumn returns data column #
1966  *		raid_pcolumn returns parity column #
1967  */
1968 static diskaddr_t
1969 raid_line(diskaddr_t segment, mr_unit_t *un)
1970 {
1971 	diskaddr_t	adj_seg;
1972 	diskaddr_t	line;
1973 	diskaddr_t	max_orig_segment;
1974 
1975 	max_orig_segment = (un->un_origcolumncnt - 1) * un->un_segsincolumn;
1976 	if (segment >= max_orig_segment) {
1977 		adj_seg = segment - max_orig_segment;
1978 		line = adj_seg % un->un_segsincolumn;
1979 	} else {
1980 		line = segment / (un->un_origcolumncnt - 1);
1981 	}
1982 	return (line);
1983 }
1984 
1985 uint_t
1986 raid_dcolumn(diskaddr_t segment, mr_unit_t *un)
1987 {
1988 	diskaddr_t	adj_seg;
1989 	diskaddr_t	line;
1990 	diskaddr_t	max_orig_segment;
1991 	uint_t		column;
1992 
1993 	max_orig_segment = (un->un_origcolumncnt - 1) * un->un_segsincolumn;
1994 	if (segment >= max_orig_segment) {
1995 		adj_seg = segment - max_orig_segment;
1996 		column = un->un_origcolumncnt  +
1997 			(uint_t)(adj_seg / un->un_segsincolumn);
1998 	} else {
1999 		line = segment / (un->un_origcolumncnt - 1);
2000 		column = (uint_t)((segment % (un->un_origcolumncnt - 1) + line)
2001 		    % un->un_origcolumncnt);
2002 	}
2003 	return (column);
2004 }
2005 
2006 uint_t
2007 raid_pcolumn(diskaddr_t segment, mr_unit_t *un)
2008 {
2009 	diskaddr_t	adj_seg;
2010 	diskaddr_t	line;
2011 	diskaddr_t	max_orig_segment;
2012 	uint_t		column;
2013 
2014 	max_orig_segment = (un->un_origcolumncnt - 1) * un->un_segsincolumn;
2015 	if (segment >= max_orig_segment) {
2016 		adj_seg = segment - max_orig_segment;
2017 		line = adj_seg % un->un_segsincolumn;
2018 	} else {
2019 		line = segment / (un->un_origcolumncnt - 1);
2020 	}
2021 	column = (uint_t)((line + (un->un_origcolumncnt - 1))
2022 				% un->un_origcolumncnt);
2023 	return (column);
2024 }
2025 
2026 
2027 /*
2028  * Is called in raid_iosetup to probe each column to insure
2029  * that all the columns are in 'okay' state and meet the
2030  * 'full line' requirement.  If any column is in error,
2031  * we don't want to enable the 'full line' flag.  Previously,
2032  * we would do so and disable it only when a error is
2033  * detected after the first 'full line' io which is too late
2034  * and leads to the potential data corruption.
2035  */
2036 static int
2037 raid_check_cols(mr_unit_t *un)
2038 {
2039 	buf_t		bp;
2040 	char		*buf;
2041 	mr_column_t	*colptr;
2042 	minor_t		mnum = MD_SID(un);
2043 	int		i;
2044 	int		err = 0;
2045 
2046 	buf = kmem_zalloc((uint_t)DEV_BSIZE, KM_SLEEP);
2047 
2048 	for (i = 0; i < un->un_totalcolumncnt; i++) {
2049 		md_dev64_t tmpdev;
2050 
2051 		colptr = &un->un_column[i];
2052 
2053 		tmpdev = colptr->un_dev;
2054 		/*
2055 		 * Open by device id
2056 		 * If this device is hotspared
2057 		 * use the hotspare key
2058 		 */
2059 		tmpdev = md_resolve_bydevid(mnum, tmpdev, HOTSPARED(un, i) ?
2060 			colptr->un_hs_key : colptr->un_orig_key);
2061 
2062 		if (tmpdev == NODEV64) {
2063 			err = 1;
2064 			break;
2065 		}
2066 
2067 		colptr->un_dev = tmpdev;
2068 
2069 		bzero((caddr_t)&bp, sizeof (buf_t));
2070 		bp.b_back = &bp;
2071 		bp.b_forw = &bp;
2072 		bp.b_flags = (B_READ | B_BUSY);
2073 		sema_init(&bp.b_io, 0, NULL,
2074 		    SEMA_DEFAULT, NULL);
2075 		sema_init(&bp.b_sem, 0, NULL,
2076 		    SEMA_DEFAULT, NULL);
2077 		bp.b_edev = md_dev64_to_dev(colptr->un_dev);
2078 		bp.b_lblkno = colptr->un_pwstart;
2079 		bp.b_bcount = DEV_BSIZE;
2080 		bp.b_bufsize = DEV_BSIZE;
2081 		bp.b_un.b_addr = (caddr_t)buf;
2082 		(void) md_call_strategy(&bp, 0, NULL);
2083 		if (biowait(&bp)) {
2084 			err = 1;
2085 			break;
2086 		}
2087 	}
2088 
2089 	kmem_free(buf, DEV_BSIZE);
2090 	return (err);
2091 }
2092 
2093 /*
2094  * NAME:	raid_iosetup
2095  * DESCRIPTION: RAID metadevice specific I/O set up routine which does
2096  *		all the necessary calculations to determine the location
2097  *		of the segement for the I/O.
2098  * PARAMETERS:	mr_unit_t *un - unit number of RAID metadevice
2099  *		diskaddr_t	blkno - block number of the I/O attempt
2100  *		size_t		blkcnt - block count for this I/O
2101  *		md_raidcs_t *cs - child structure for each segmented I/O
2102  *
2103  * NOTE:	The following is an example of a raid disk layer out:
2104  *
2105  *		Total Column = 5
2106  *		Original Column = 4
2107  *		Segment Per Column = 10
2108  *
2109  *			Col#0	Col#1	Col#2	Col#3	Col#4	Col#5	Col#6
2110  *		-------------------------------------------------------------
2111  *		line#0	Seg#0	Seg#1	Seg#2	Parity	Seg#30	Seg#40
2112  *		line#1	Parity	Seg#3	Seg#4	Seg#5	Seg#31
2113  *		line#2	Seg#8	Parity	Seg#6	Seg#7	Seg#32
2114  *		line#3	Seg#10	Seg#11	Parity	Seg#9	Seg#33
2115  *		line#4	Seg#12	Seg#13	Seg#14	Parity	Seg#34
2116  *		line#5	Parity	Seg#15	Seg#16	Seg#17	Seg#35
2117  *		line#6	Seg#20	Parity	Seg#18	Seg#19	Seg#36
2118  *		line#7	Seg#22	Seg#23	Parity	Seg#21	Seg#37
2119  *		line#8	Seg#24	Seg#25	Seg#26	Parity	Seg#38
2120  *		line#9	Parity	Seg#27	Seg#28	Seg#29	Seg#39
2121  */
2122 static size_t
2123 raid_iosetup(
2124 	mr_unit_t	*un,
2125 	diskaddr_t	blkno,
2126 	size_t		blkcnt,
2127 	md_raidcs_t	*cs
2128 )
2129 {
2130 	diskaddr_t	segment;
2131 	diskaddr_t	segstart;
2132 	diskaddr_t	segoff;
2133 	size_t		leftover;
2134 	diskaddr_t	line;
2135 	uint_t		iosize;
2136 	uint_t		colcnt;
2137 
2138 	/* caculate the segment# and offset for the block */
2139 	segment = blkno / un->un_segsize;
2140 	segstart = segment * un->un_segsize;
2141 	segoff = blkno - segstart;
2142 	iosize = un->un_iosize - 1;
2143 	colcnt = un->un_totalcolumncnt - 1;
2144 	line = raid_line(segment, un);
2145 	cs->cs_dcolumn = raid_dcolumn(segment, un);
2146 	cs->cs_pcolumn = raid_pcolumn(segment, un);
2147 	cs->cs_dflags = un->un_column[cs->cs_dcolumn].un_devflags;
2148 	cs->cs_pflags = un->un_column[cs->cs_pcolumn].un_devflags;
2149 	cs->cs_line = line;
2150 
2151 	if ((cs->cs_ps->ps_flags & MD_RPS_WRITE) &&
2152 	    (UNIT_STATE(un) & RCS_OKAY) &&
2153 	    (segoff == 0) &&
2154 	    (un->un_totalcolumncnt == un->un_origcolumncnt) &&
2155 	    (un->un_segsize < un->un_iosize) &&
2156 	    (un->un_iosize <= un->un_maxio) &&
2157 	    (blkno == line * un->un_segsize * colcnt) &&
2158 	    (blkcnt >= ((un->un_totalcolumncnt -1) * un->un_segsize)) &&
2159 	    (raid_state_cnt(un, RCS_OKAY) == un->un_origcolumncnt) &&
2160 	    (raid_check_cols(un) == 0)) {
2161 
2162 		md_raidcbuf_t	**cbufp;
2163 		md_raidcbuf_t	*cbuf;
2164 		int		i, j;
2165 
2166 		STAT_INC(raid_full_line_writes);
2167 		leftover = blkcnt - (un->un_segsize * colcnt);
2168 		ASSERT(blkcnt >= (un->un_segsize * colcnt));
2169 		cs->cs_blkno = line * un->un_segsize;
2170 		cs->cs_blkcnt = un->un_segsize;
2171 		cs->cs_lastblk = cs->cs_blkno + cs->cs_blkcnt - 1;
2172 		cs->cs_bcount = dbtob(cs->cs_blkcnt);
2173 		cs->cs_flags |= MD_RCS_LINE;
2174 
2175 		cbufp = &cs->cs_buflist;
2176 		for (i = 0; i < un->un_totalcolumncnt; i++) {
2177 			j = cs->cs_dcolumn + i;
2178 			j = j % un->un_totalcolumncnt;
2179 
2180 			if ((j == cs->cs_dcolumn) || (j == cs->cs_pcolumn))
2181 				continue;
2182 			cbuf = kmem_cache_alloc(raid_cbuf_cache,
2183 			    MD_ALLOCFLAGS);
2184 			raid_cbuf_init(cbuf);
2185 			cbuf->cbuf_un = cs->cs_un;
2186 			cbuf->cbuf_ps = cs->cs_ps;
2187 			cbuf->cbuf_column = j;
2188 			cbuf->cbuf_bcount = dbtob(un->un_segsize);
2189 			*cbufp = cbuf;
2190 			cbufp = &cbuf->cbuf_next;
2191 		}
2192 		return (leftover);
2193 	}
2194 
2195 	leftover = blkcnt - (un->un_segsize - segoff);
2196 	if (blkcnt > (un->un_segsize - segoff))
2197 		blkcnt -= leftover;
2198 	else
2199 		leftover = 0;
2200 
2201 	if (blkcnt > (size_t)iosize) {
2202 		leftover += (blkcnt - iosize);
2203 		blkcnt = iosize;
2204 	}
2205 
2206 	/* calculate the line# and column# for the segment */
2207 	cs->cs_flags &= ~MD_RCS_LINE;
2208 	cs->cs_blkno = line * un->un_segsize + segoff;
2209 	cs->cs_blkcnt = (uint_t)blkcnt;
2210 	cs->cs_lastblk = cs->cs_blkno + cs->cs_blkcnt - 1;
2211 	cs->cs_bcount = dbtob((uint_t)blkcnt);
2212 	return (leftover);
2213 }
2214 
2215 /*
2216  * NAME:	raid_done
2217  * DESCRIPTION: RAID metadevice I/O done interrupt routine
2218  * PARAMETERS:	struct buf *bp - pointer to a buffer structure
2219  */
2220 static void
2221 raid_done(struct buf *bp)
2222 {
2223 	md_raidcs_t	*cs;
2224 	int		flags, frags;
2225 
2226 	sema_v(&bp->b_io);
2227 	cs = (md_raidcs_t *)bp->b_chain;
2228 
2229 	ASSERT(cs != NULL);
2230 
2231 	mutex_enter(&cs->cs_mx);
2232 	if (bp->b_flags & B_ERROR) {
2233 		cs->cs_flags |= MD_RCS_ERROR;
2234 		cs->cs_flags &= ~(MD_RCS_ISCALL);
2235 	}
2236 
2237 	flags = cs->cs_flags;
2238 	frags = --cs->cs_frags;
2239 	mutex_exit(&cs->cs_mx);
2240 	if (frags != 0) {
2241 		return;
2242 	}
2243 
2244 	if (flags & MD_RCS_ERROR) {
2245 		if (cs->cs_error_call) {
2246 			daemon_request(&md_done_daemon, cs->cs_error_call,
2247 				(daemon_queue_t *)cs, REQ_OLD);
2248 		}
2249 		return;
2250 	}
2251 
2252 	if (flags & MD_RCS_ISCALL) {
2253 		cs->cs_flags &= ~(MD_RCS_ISCALL);
2254 		(*(cs->cs_call))(cs);
2255 		return;
2256 	}
2257 	daemon_request(&md_done_daemon, cs->cs_call,
2258 					(daemon_queue_t *)cs, REQ_OLD);
2259 }
/*
 * Flags accepted by raidio().
 *
 * The low byte (RIO_COLMASK) optionally carries an explicit column
 * number plus one; zero means "use the data or parity column of the
 * child", as selected by RIO_DATA / RIO_PARITY.
 *
 * The flag RIO_EXTRA is used when dealing with a column in the process
 * of being resynced. During the resync, writes may have to take place
 * on both the original component and a hotspare component.
 */
#define	RIO_DATA	0x00100		/* use data buffer & data column */
#define	RIO_PARITY	0x00200		/* use parity buffer & parity column */
#define	RIO_WRITE	0x00400		/* issue a write */
#define	RIO_READ	0x00800		/* issue a read */
#define	RIO_PWIO	0x01000		/* do the I/O to the prewrite entry */
#define	RIO_ALT		0x02000		/* do write to alternate device */
#define	RIO_EXTRA	0x04000		/* use extra buffer */

#define	RIO_COLMASK	0x000ff

/*
 * Write to the prewrite entry.  The expansion is parenthesized so that
 * the macro behaves as a single value in any expression, for example
 * (flags & RIO_PREWRITE).
 */
#define	RIO_PREWRITE	(RIO_WRITE | RIO_PWIO)
2276 
2277 /*
2278  * NAME:	raidio
2279  * DESCRIPTION: RAID metadevice write routine
2280  * PARAMETERS:	md_raidcs_t *cs - pointer to a child structure
2281  */
2282 static void
2283 raidio(md_raidcs_t *cs, int flags)
2284 {
2285 	buf_t		*bp;
2286 	int		column;
2287 	int		flag;
2288 	void		*private;
2289 	mr_unit_t	*un;
2290 	int		iosize;
2291 	diskaddr_t	pwstart;
2292 	diskaddr_t	devstart;
2293 	md_dev64_t	dev;
2294 
2295 	un = cs->cs_un;
2296 
2297 	ASSERT(IO_READER_HELD(un));
2298 	ASSERT(UNIT_READER_HELD(un));
2299 
2300 	if (flags & RIO_DATA) {
2301 		if (flags & RIO_EXTRA)
2302 			bp = &cs->cs_hbuf;
2303 		else
2304 			bp = &cs->cs_dbuf;
2305 		bp->b_un.b_addr = cs->cs_dbuffer;
2306 		column = cs->cs_dcolumn;
2307 	} else {
2308 		if (flags & RIO_EXTRA)
2309 			bp = &cs->cs_hbuf;
2310 		else
2311 			bp = &cs->cs_pbuf;
2312 		bp->b_un.b_addr = cs->cs_pbuffer;
2313 		column = cs->cs_pcolumn;
2314 	}
2315 	if (flags & RIO_COLMASK)
2316 		column = (flags & RIO_COLMASK) - 1;
2317 
2318 	bp->b_bcount = cs->cs_bcount;
2319 	bp->b_bufsize = cs->cs_bcount;
2320 	iosize = un->un_iosize;
2321 
2322 	/* check if the hotspared device will be used */
2323 	if (flags & RIO_ALT && (flags & RIO_WRITE)) {
2324 		pwstart = un->un_column[column].un_alt_pwstart;
2325 		devstart = un->un_column[column].un_alt_devstart;
2326 		dev = un->un_column[column].un_alt_dev;
2327 	} else {
2328 		pwstart = un->un_column[column].un_pwstart;
2329 		devstart = un->un_column[column].un_devstart;
2330 		dev = un->un_column[column].un_dev;
2331 	}
2332 
2333 	/* if not writing to log skip log header */
2334 	if ((flags & RIO_PWIO) == 0) {
2335 		bp->b_lblkno = devstart + cs->cs_blkno;
2336 		bp->b_un.b_addr += DEV_BSIZE;
2337 	} else {
2338 		bp->b_bcount += DEV_BSIZE;
2339 		bp->b_bufsize = bp->b_bcount;
2340 		if (flags & RIO_DATA) {
2341 			bp->b_lblkno = cs->cs_dpwslot * iosize + pwstart;
2342 		} else { /* not DATA -> PARITY */
2343 			bp->b_lblkno = cs->cs_ppwslot * iosize + pwstart;
2344 		}
2345 	}
2346 
2347 	bp->b_flags &= ~(B_READ | B_WRITE | B_ERROR | nv_available);
2348 	bp->b_flags |= B_BUSY;
2349 	if (flags & RIO_READ) {
2350 		bp->b_flags |= B_READ;
2351 	} else {
2352 		bp->b_flags |= B_WRITE;
2353 		if ((nv_available && nv_parity && (flags & RIO_PARITY)) ||
2354 		    (nv_available && nv_prewrite && (flags & RIO_PWIO)))
2355 			bp->b_flags |= nv_available;
2356 	}
2357 	bp->b_iodone = (int (*)())raid_done;
2358 	bp->b_edev = md_dev64_to_dev(dev);
2359 
2360 	ASSERT((bp->b_edev != 0) && (bp->b_edev != NODEV));
2361 
2362 	private = cs->cs_strategy_private;
2363 	flag = cs->cs_strategy_flag;
2364 
2365 	md_call_strategy(bp, flag, private);
2366 }
2367 
2368 /*
2369  * NAME:	genstandardparity
2370  * DESCRIPTION: This routine
2371  * PARAMETERS:	md_raidcs_t *cs - pointer to a child structure
2372  */
2373 static void
2374 genstandardparity(md_raidcs_t *cs)
2375 {
2376 	uint_t		*dbuf, *pbuf;
2377 	size_t		wordcnt;
2378 	uint_t		dsum = 0;
2379 	uint_t		psum = 0;
2380 
2381 	ASSERT((cs->cs_bcount & 0x3) == 0);
2382 
2383 	wordcnt = cs->cs_bcount / sizeof (uint_t);
2384 
2385 	dbuf = (uint_t *)(void *)(cs->cs_dbuffer + DEV_BSIZE);
2386 	pbuf = (uint_t *)(void *)(cs->cs_pbuffer + DEV_BSIZE);
2387 
2388 	/* Word aligned */
2389 	if (((uintptr_t)cs->cs_addr & 0x3) == 0) {
2390 		uint_t	*uwbuf = (uint_t *)(void *)(cs->cs_addr);
2391 		uint_t	uval;
2392 
2393 		while (wordcnt--) {
2394 			uval = *uwbuf++;
2395 			psum ^= (*pbuf = ((*pbuf ^ *dbuf) ^ uval));
2396 			++pbuf;
2397 			*dbuf = uval;
2398 			dsum ^= uval;
2399 			++dbuf;
2400 		}
2401 	} else {
2402 		uchar_t	*ubbuf = (uchar_t *)(cs->cs_addr);
2403 		union {
2404 			uint_t	wb;
2405 			uchar_t	bb[4];
2406 		} cb;
2407 
2408 		while (wordcnt--) {
2409 			cb.bb[0] = *ubbuf++;
2410 			cb.bb[1] = *ubbuf++;
2411 			cb.bb[2] = *ubbuf++;
2412 			cb.bb[3] = *ubbuf++;
2413 			psum ^= (*pbuf = ((*pbuf ^ *dbuf) ^ cb.wb));
2414 			++pbuf;
2415 			*dbuf = cb.wb;
2416 			dsum ^= cb.wb;
2417 			++dbuf;
2418 		}
2419 	}
2420 
2421 	RAID_FILLIN_RPW(cs->cs_dbuffer, cs->cs_un, dsum, cs->cs_pcolumn,
2422 			cs->cs_blkno, cs->cs_blkcnt, cs->cs_pwid,
2423 			2, cs->cs_dcolumn, RAID_PWMAGIC);
2424 
2425 	RAID_FILLIN_RPW(cs->cs_pbuffer, cs->cs_un, psum, cs->cs_dcolumn,
2426 			cs->cs_blkno, cs->cs_blkcnt, cs->cs_pwid,
2427 			2, cs->cs_pcolumn, RAID_PWMAGIC);
2428 }
2429 
2430 static void
2431 genlineparity(md_raidcs_t *cs)
2432 {
2433 
2434 	mr_unit_t	*un = cs->cs_un;
2435 	md_raidcbuf_t	*cbuf;
2436 	uint_t		*pbuf, *dbuf;
2437 	uint_t		*uwbuf;
2438 	uchar_t		*ubbuf;
2439 	size_t		wordcnt;
2440 	uint_t		psum = 0, dsum = 0;
2441 	size_t		count = un->un_segsize * DEV_BSIZE;
2442 	uint_t		col;
2443 	buf_t		*bp;
2444 
2445 	ASSERT((cs->cs_bcount & 0x3) == 0);
2446 
2447 	pbuf = (uint_t *)(void *)(cs->cs_pbuffer + DEV_BSIZE);
2448 	dbuf = (uint_t *)(void *)(cs->cs_dbuffer + DEV_BSIZE);
2449 	uwbuf = (uint_t *)(void *)(cs->cs_addr);
2450 	ubbuf = (uchar_t *)(void *)(cs->cs_addr);
2451 
2452 	wordcnt = count / sizeof (uint_t);
2453 
2454 	/* Word aligned */
2455 	if (((uintptr_t)cs->cs_addr & 0x3) == 0) {
2456 		uint_t	 uval;
2457 
2458 		while (wordcnt--) {
2459 			uval = *uwbuf++;
2460 			*dbuf = uval;
2461 			*pbuf = uval;
2462 			dsum ^= uval;
2463 			++pbuf;
2464 			++dbuf;
2465 		}
2466 	} else {
2467 		union {
2468 			uint_t	wb;
2469 			uchar_t	bb[4];
2470 		} cb;
2471 
2472 		while (wordcnt--) {
2473 			cb.bb[0] = *ubbuf++;
2474 			cb.bb[1] = *ubbuf++;
2475 			cb.bb[2] = *ubbuf++;
2476 			cb.bb[3] = *ubbuf++;
2477 			*dbuf = cb.wb;
2478 			*pbuf = cb.wb;
2479 			dsum ^= cb.wb;
2480 			++pbuf;
2481 			++dbuf;
2482 		}
2483 	}
2484 
2485 	RAID_FILLIN_RPW(cs->cs_dbuffer, un, dsum, cs->cs_pcolumn,
2486 			cs->cs_blkno, cs->cs_blkcnt, cs->cs_pwid,
2487 			un->un_totalcolumncnt, cs->cs_dcolumn, RAID_PWMAGIC);
2488 
2489 	raidio(cs, RIO_PREWRITE | RIO_DATA);
2490 
2491 	for (cbuf = cs->cs_buflist; cbuf; cbuf = cbuf->cbuf_next) {
2492 
2493 		dsum = 0;
2494 		pbuf = (uint_t *)(void *)(cs->cs_pbuffer + DEV_BSIZE);
2495 		dbuf = (uint_t *)(void *)(cbuf->cbuf_buffer + DEV_BSIZE);
2496 
2497 		wordcnt = count / sizeof (uint_t);
2498 
2499 		col = cbuf->cbuf_column;
2500 
2501 		/* Word aligned */
2502 		if (((uintptr_t)cs->cs_addr & 0x3) == 0) {
2503 			uint_t	uval;
2504 
2505 			/*
2506 			 * Only calculate psum when working on the last
2507 			 * data buffer.
2508 			 */
2509 			if (cbuf->cbuf_next == NULL) {
2510 				psum = 0;
2511 				while (wordcnt--) {
2512 					uval = *uwbuf++;
2513 					*dbuf = uval;
2514 					psum ^= (*pbuf ^= uval);
2515 					dsum ^= uval;
2516 					++dbuf;
2517 					++pbuf;
2518 				}
2519 			} else {
2520 				while (wordcnt--) {
2521 					uval = *uwbuf++;
2522 					*dbuf = uval;
2523 					*pbuf ^= uval;
2524 					dsum ^= uval;
2525 					++dbuf;
2526 					++pbuf;
2527 				}
2528 			}
2529 		} else {
2530 			union {
2531 				uint_t	wb;
2532 				uchar_t	bb[4];
2533 			} cb;
2534 
2535 			/*
2536 			 * Only calculate psum when working on the last
2537 			 * data buffer.
2538 			 */
2539 			if (cbuf->cbuf_next == NULL) {
2540 				psum = 0;
2541 				while (wordcnt--) {
2542 					cb.bb[0] = *ubbuf++;
2543 					cb.bb[1] = *ubbuf++;
2544 					cb.bb[2] = *ubbuf++;
2545 					cb.bb[3] = *ubbuf++;
2546 					*dbuf = cb.wb;
2547 					psum ^= (*pbuf ^= cb.wb);
2548 					dsum ^= cb.wb;
2549 					++dbuf;
2550 					++pbuf;
2551 				}
2552 			} else {
2553 				while (wordcnt--) {
2554 					cb.bb[0] = *ubbuf++;
2555 					cb.bb[1] = *ubbuf++;
2556 					cb.bb[2] = *ubbuf++;
2557 					cb.bb[3] = *ubbuf++;
2558 					*dbuf = cb.wb;
2559 					*pbuf ^= cb.wb;
2560 					dsum ^= cb.wb;
2561 					++dbuf;
2562 					++pbuf;
2563 				}
2564 			}
2565 		}
2566 		RAID_FILLIN_RPW(cbuf->cbuf_buffer, un, dsum, cs->cs_pcolumn,
2567 				cs->cs_blkno, cs->cs_blkcnt, cs->cs_pwid,
2568 				un->un_totalcolumncnt, col, RAID_PWMAGIC);
2569 
2570 		/*
2571 		 * fill in buffer for write to prewrite area
2572 		 */
2573 		bp = &cbuf->cbuf_bp;
2574 		bp->b_un.b_addr = cbuf->cbuf_buffer;
2575 		bp->b_bcount = cbuf->cbuf_bcount + DEV_BSIZE;
2576 		bp->b_bufsize = bp->b_bcount;
2577 		bp->b_lblkno = (cbuf->cbuf_pwslot * un->un_iosize) +
2578 		    un->un_column[col].un_pwstart;
2579 		bp->b_flags = B_WRITE | B_BUSY;
2580 		if (nv_available && nv_prewrite)
2581 			bp->b_flags |= nv_available;
2582 		bp->b_iodone = (int (*)())raid_done;
2583 		bp->b_edev = md_dev64_to_dev(un->un_column[col].un_dev);
2584 		bp->b_chain = (struct buf *)cs;
2585 		md_call_strategy(bp,
2586 			cs->cs_strategy_flag, cs->cs_strategy_private);
2587 	}
2588 
2589 	RAID_FILLIN_RPW(cs->cs_pbuffer, un, psum, cs->cs_dcolumn,
2590 			cs->cs_blkno, cs->cs_blkcnt, cs->cs_pwid,
2591 			un->un_totalcolumncnt, cs->cs_pcolumn, RAID_PWMAGIC);
2592 
2593 	raidio(cs, RIO_PREWRITE | RIO_PARITY);
2594 }
2595 
2596 /*
2597  * NAME:	raid_readregenloop
2598  * DESCRIPTION: RAID metadevice write routine
2599  * PARAMETERS:	md_raidcs_t *cs - pointer to a child structure
2600  */
2601 static void
2602 raid_readregenloop(md_raidcs_t *cs)
2603 {
2604 	mr_unit_t	*un;
2605 	md_raidps_t	*ps;
2606 	uint_t		*dbuf;
2607 	uint_t		*pbuf;
2608 	size_t		wordcnt;
2609 
2610 	un = cs->cs_un;
2611 
2612 	/*
2613 	 * XOR the parity with data bytes, must skip the
2614 	 * pre-write entry header in all data/parity buffers
2615 	 */
2616 	wordcnt = cs->cs_bcount / sizeof (uint_t);
2617 	dbuf = (uint_t *)(void *)(cs->cs_dbuffer + DEV_BSIZE);
2618 	pbuf = (uint_t *)(void *)(cs->cs_pbuffer + DEV_BSIZE);
2619 	while (wordcnt--)
2620 		*dbuf++ ^= *pbuf++;
2621 
2622 	/* bump up the loop count */
2623 	cs->cs_loop++;
2624 
2625 	/* skip the errored component */
2626 	if (cs->cs_loop == cs->cs_dcolumn)
2627 		cs->cs_loop++;
2628 
2629 	if (cs->cs_loop != un->un_totalcolumncnt) {
2630 		cs->cs_frags = 1;
2631 		raidio(cs, RIO_PARITY | RIO_READ | (cs->cs_loop + 1));
2632 		return;
2633 	}
2634 	/* reaching the end sof loop */
2635 	ps = cs->cs_ps;
2636 	bcopy(cs->cs_dbuffer + DEV_BSIZE, cs->cs_addr, cs->cs_bcount);
2637 	raid_free_child(cs, 1);
2638 
2639 	/* decrement readfrags */
2640 	raid_free_parent(ps, RFP_DECR_READFRAGS | RFP_RLS_LOCK);
2641 }
2642 
2643 /*
2644  * NAME:	raid_read_io
2645  * DESCRIPTION: RAID metadevice read I/O routine
2646  * PARAMETERS:	mr_unit_t *un - pointer to a unit structure
2647  *		md_raidcs_t *cs - pointer to a child structure
2648  */
2649 static void
2650 raid_read_io(mr_unit_t *un, md_raidcs_t *cs)
2651 {
2652 	int	flag;
2653 	void	*private;
2654 	buf_t	*bp;
2655 	buf_t	*pb = cs->cs_ps->ps_bp;
2656 	mr_column_t	*column;
2657 
2658 	flag = cs->cs_strategy_flag;
2659 	private = cs->cs_strategy_private;
2660 	column = &un->un_column[cs->cs_dcolumn];
2661 
2662 	/*
2663 	 * The component to be read is good, simply set up bp structure
2664 	 * and call low level md routine doing the read.
2665 	 */
2666 
2667 	if (COLUMN_ISOKAY(un, cs->cs_dcolumn) ||
2668 	    (COLUMN_ISLASTERR(un, cs->cs_dcolumn) &&
2669 		    (cs->cs_flags & MD_RCS_RECOVERY) == 0)) {
2670 		dev_t ddi_dev; /* needed for bioclone, so not md_dev64_t */
2671 		ddi_dev = md_dev64_to_dev(column->un_dev);
2672 
2673 		bp = &cs->cs_dbuf;
2674 		bp = md_bioclone(pb, cs->cs_offset, cs->cs_bcount, ddi_dev,
2675 				column->un_devstart + cs->cs_blkno,
2676 				(int (*)())raid_done, bp, KM_NOSLEEP);
2677 
2678 		bp->b_chain = (buf_t *)cs;
2679 
2680 		cs->cs_frags = 1;
2681 		cs->cs_error_call = raid_read_error;
2682 		cs->cs_retry_call = raid_read_retry;
2683 		cs->cs_flags |= MD_RCS_ISCALL;
2684 		cs->cs_stage = RAID_READ_DONE;
2685 		cs->cs_call = raid_stage;
2686 
2687 		ASSERT(bp->b_edev != 0);
2688 
2689 		md_call_strategy(bp, flag, private);
2690 		return;
2691 	}
2692 
2693 	/*
2694 	 * The component to be read is bad, have to go through
2695 	 * raid specific method to read data from other members.
2696 	 */
2697 	cs->cs_loop = 0;
2698 	/*
2699 	 * NOTE: always get dbuffer before pbuffer
2700 	 *	 and get both buffers before pwslot
2701 	 *	 otherwise a deadlock could be introduced.
2702 	 */
2703 	raid_mapin_buf(cs);
2704 	getdbuffer(cs);
2705 	getpbuffer(cs);
2706 	if (cs->cs_loop == cs->cs_dcolumn)
2707 		cs->cs_loop++;
2708 
2709 	/* zero out data buffer for use as a data sink */
2710 	bzero(cs->cs_dbuffer + DEV_BSIZE, cs->cs_bcount);
2711 	cs->cs_stage = RAID_NONE;
2712 	cs->cs_call = raid_readregenloop;
2713 	cs->cs_error_call = raid_read_error;
2714 	cs->cs_retry_call = raid_read_no_retry;
2715 	cs->cs_frags = 1;
2716 
2717 	/* use parity buffer to read other columns */
2718 	raidio(cs, RIO_PARITY | RIO_READ | (cs->cs_loop + 1));
2719 }
2720 
2721 /*
2722  * NAME:	raid_read
2723  * DESCRIPTION: RAID metadevice write routine
2724  * PARAMETERS:	mr_unit_t *un - pointer to a unit structure
2725  *		md_raidcs_t *cs - pointer to a child structure
2726  */
2727 static int
2728 raid_read(mr_unit_t *un, md_raidcs_t *cs)
2729 {
2730 	int		error = 0;
2731 	md_raidps_t	*ps;
2732 	mdi_unit_t	*ui;
2733 	minor_t		mnum;
2734 
2735 	ASSERT(IO_READER_HELD(un));
2736 	ps = cs->cs_ps;
2737 	ui = ps->ps_ui;
2738 	raid_line_reader_lock(cs, 0);
2739 	un = (mr_unit_t *)md_unit_readerlock(ui);
2740 	ASSERT(UNIT_STATE(un) != RUS_INIT);
2741 	mnum = MD_SID(un);
2742 	cs->cs_un = un;
2743 
2744 	/* make sure the read doesn't go beyond the end of the column */
2745 	if (cs->cs_blkno + cs->cs_blkcnt >
2746 	    un->un_segsize * un->un_segsincolumn) {
2747 		error = ENXIO;
2748 	}
2749 	if (error)
2750 		goto rerror;
2751 
2752 	if (un->un_state & RUS_REGEN) {
2753 		raid_regen_parity(cs);
2754 		un = MD_UNIT(mnum);
2755 		cs->cs_un = un;
2756 	}
2757 
2758 	raid_read_io(un, cs);
2759 	return (0);
2760 
2761 rerror:
2762 	raid_error_parent(ps, error);
2763 	raid_free_child(cs, 1);
2764 	/* decrement readfrags */
2765 	raid_free_parent(ps, RFP_DECR_READFRAGS | RFP_RLS_LOCK);
2766 	return (0);
2767 }
2768 
2769 /*
2770  * NAME:	raid_write_err_retry
2771  * DESCRIPTION: RAID metadevice write retry routine
2772  *		write was for parity or data only;
2773  *		complete write with error, no recovery possible
2774  * PARAMETERS:	mr_unit_t *un - pointer to a unit structure
2775  *		md_raidcs_t *cs - pointer to a child structure
2776  */
2777 /*ARGSUSED*/
2778 static void
2779 raid_write_err_retry(mr_unit_t *un, md_raidcs_t *cs)
2780 {
2781 	md_raidps_t	*ps = cs->cs_ps;
2782 	int		flags = RFP_DECR_FRAGS | RFP_RLS_LOCK;
2783 
2784 	/* decrement pwfrags if needed, and frags */
2785 	if (!(cs->cs_flags & MD_RCS_PWDONE))
2786 		flags |= RFP_DECR_PWFRAGS;
2787 	raid_error_parent(ps, EIO);
2788 	raid_free_child(cs, 1);
2789 	raid_free_parent(ps, flags);
2790 }
2791 
2792 /*
2793  * NAME:	raid_write_err_retry
2794  * DESCRIPTION: RAID metadevice write retry routine
2795  *		 write is too far along to retry and parent
2796  *		 has already been signaled with iodone.
2797  * PARAMETERS:	mr_unit_t *un - pointer to a unit structure
2798  *		md_raidcs_t *cs - pointer to a child structure
2799  */
2800 /*ARGSUSED*/
2801 static void
2802 raid_write_no_retry(mr_unit_t *un, md_raidcs_t *cs)
2803 {
2804 	md_raidps_t	*ps = cs->cs_ps;
2805 	int		flags = RFP_DECR_FRAGS | RFP_RLS_LOCK;
2806 
2807 	/* decrement pwfrags if needed, and frags */
2808 	if (!(cs->cs_flags & MD_RCS_PWDONE))
2809 		flags |= RFP_DECR_PWFRAGS;
2810 	raid_free_child(cs, 1);
2811 	raid_free_parent(ps, flags);
2812 }
2813 
2814 /*
2815  * NAME:	raid_write_retry
2816  * DESCRIPTION: RAID metadevice write retry routine
2817  * PARAMETERS:	mr_unit_t *un - pointer to a unit structure
2818  *		md_raidcs_t *cs - pointer to a child structure
2819  */
2820 static void
2821 raid_write_retry(mr_unit_t *un, md_raidcs_t *cs)
2822 {
2823 	md_raidps_t	*ps;
2824 
2825 	ps = cs->cs_ps;
2826 
2827 	/* re-initialize the buf_t structure for raid_write() */
2828 	cs->cs_dbuf.b_chain = (struct buf *)cs;
2829 	cs->cs_dbuf.b_back = &cs->cs_dbuf;
2830 	cs->cs_dbuf.b_forw = &cs->cs_dbuf;
2831 	cs->cs_dbuf.b_flags = B_BUSY;	/* initialize flags */
2832 	cs->cs_dbuf.b_error = 0;	/* initialize error */
2833 	cs->cs_dbuf.b_offset = -1;
2834 	/* Initialize semaphores */
2835 	sema_init(&cs->cs_dbuf.b_io, 0, NULL,
2836 	    SEMA_DEFAULT, NULL);
2837 	sema_init(&cs->cs_dbuf.b_sem, 0, NULL,
2838 	    SEMA_DEFAULT, NULL);
2839 
2840 	cs->cs_pbuf.b_chain = (struct buf *)cs;
2841 	cs->cs_pbuf.b_back = &cs->cs_pbuf;
2842 	cs->cs_pbuf.b_forw = &cs->cs_pbuf;
2843 	cs->cs_pbuf.b_flags = B_BUSY;	/* initialize flags */
2844 	cs->cs_pbuf.b_error = 0;	/* initialize error */
2845 	cs->cs_pbuf.b_offset = -1;
2846 	sema_init(&cs->cs_pbuf.b_io, 0, NULL,
2847 	    SEMA_DEFAULT, NULL);
2848 	sema_init(&cs->cs_pbuf.b_sem, 0, NULL,
2849 	    SEMA_DEFAULT, NULL);
2850 
2851 	cs->cs_hbuf.b_chain = (struct buf *)cs;
2852 	cs->cs_hbuf.b_back = &cs->cs_hbuf;
2853 	cs->cs_hbuf.b_forw = &cs->cs_hbuf;
2854 	cs->cs_hbuf.b_flags = B_BUSY;	/* initialize flags */
2855 	cs->cs_hbuf.b_error = 0;	/* initialize error */
2856 	cs->cs_hbuf.b_offset = -1;
2857 	sema_init(&cs->cs_hbuf.b_io, 0, NULL,
2858 	    SEMA_DEFAULT, NULL);
2859 	sema_init(&cs->cs_hbuf.b_sem, 0, NULL,
2860 	    SEMA_DEFAULT, NULL);
2861 
2862 	cs->cs_flags &= ~(MD_RCS_ERROR);
2863 	/*
2864 	 * If we have already done'ed the i/o but have done prewrite
2865 	 * on this child, then reset PWDONE flag and bump pwfrags before
2866 	 * restarting i/o.
2867 	 * If pwfrags is zero, we have already 'iodone'd the i/o so
2868 	 * leave things alone.  We don't want to re-'done' it.
2869 	 */
2870 	mutex_enter(&ps->ps_mx);
2871 	if (cs->cs_flags & MD_RCS_PWDONE) {
2872 		cs->cs_flags &= ~MD_RCS_PWDONE;
2873 		ps->ps_pwfrags++;
2874 	}
2875 	mutex_exit(&ps->ps_mx);
2876 	raid_write_io(un, cs);
2877 }
2878 
2879 /*
2880  * NAME:	raid_wrerr
2881  * DESCRIPTION: RAID metadevice write routine
2882  * PARAMETERS:	md_raidcs_t *cs - pointer to a child structure
2883  * LOCKS:	must obtain unit writer lock while calling raid_error_state
2884  *		since a unit or column state transition may take place.
2885  *		must obtain unit reader lock to retry I/O.
2886  */
2887 static void
2888 raid_wrerr(md_raidcs_t *cs)
2889 {
2890 	md_raidps_t	*ps;
2891 	mdi_unit_t	*ui;
2892 	mr_unit_t	*un;
2893 	md_raidcbuf_t	*cbuf;
2894 
2895 	ps = cs->cs_ps;
2896 	ui = ps->ps_ui;
2897 
2898 	un = (mr_unit_t *)md_unit_writerlock(ui);
2899 	ASSERT(un != 0);
2900 
2901 	if (cs->cs_dbuf.b_flags & B_ERROR)
2902 		(void) raid_error_state(un, &cs->cs_dbuf);
2903 	if (cs->cs_pbuf.b_flags & B_ERROR)
2904 		(void) raid_error_state(un, &cs->cs_pbuf);
2905 	if (cs->cs_hbuf.b_flags & B_ERROR)
2906 		(void) raid_error_state(un, &cs->cs_hbuf);
2907 	for (cbuf = cs->cs_buflist; cbuf; cbuf = cbuf->cbuf_next)
2908 		if (cbuf->cbuf_bp.b_flags & B_ERROR)
2909 			(void) raid_error_state(un, &cbuf->cbuf_bp);
2910 
2911 	md_unit_writerexit(ui);
2912 
2913 	ps->ps_flags |= MD_RPS_HSREQ;
2914 
2915 	un = (mr_unit_t *)md_unit_readerlock(ui);
2916 
2917 	/* now attempt the appropriate retry routine */
2918 	(*(cs->cs_retry_call))(un, cs);
2919 }
2920 /*
2921  * NAMES:	raid_write_error
2922  * DESCRIPTION: I/O error handling routine for a RAID metadevice write
2923  * PARAMETERS:	md_raidcs_t *cs - pointer to child structure
2924  */
2925 /*ARGSUSED*/
2926 static void
2927 raid_write_error(md_raidcs_t *cs)
2928 {
2929 	md_raidps_t	*ps;
2930 	mdi_unit_t	*ui;
2931 	mr_unit_t	*un;
2932 	md_raidcbuf_t	*cbuf;
2933 	set_t		setno;
2934 
2935 	ps = cs->cs_ps;
2936 	ui = ps->ps_ui;
2937 	un = cs->cs_un;
2938 
2939 	setno = MD_UN2SET(un);
2940 
2941 	/*
2942 	 * locate each buf that is in error on this io and then
2943 	 * output an error message
2944 	 */
2945 	if ((cs->cs_dbuf.b_flags & B_ERROR) &&
2946 	    (COLUMN_STATE(un, cs->cs_dcolumn) != RCS_ERRED) &&
2947 	    (COLUMN_STATE(un, cs->cs_dcolumn) != RCS_LAST_ERRED))
2948 		cmn_err(CE_WARN, "md %s: write error on %s",
2949 		    md_shortname(MD_SID(un)),
2950 		    md_devname(setno, md_expldev(cs->cs_dbuf.b_edev), NULL, 0));
2951 
2952 	if ((cs->cs_pbuf.b_flags & B_ERROR) &&
2953 	    (COLUMN_STATE(un, cs->cs_pcolumn) != RCS_ERRED) &&
2954 	    (COLUMN_STATE(un, cs->cs_pcolumn) != RCS_LAST_ERRED))
2955 		cmn_err(CE_WARN, "md %s: write error on %s",
2956 		    md_shortname(MD_SID(un)),
2957 		    md_devname(setno, md_expldev(cs->cs_pbuf.b_edev), NULL, 0));
2958 
2959 	for (cbuf = cs->cs_buflist; cbuf; cbuf = cbuf->cbuf_next)
2960 		if ((cbuf->cbuf_bp.b_flags & B_ERROR) &&
2961 		    (COLUMN_STATE(un, cbuf->cbuf_column) != RCS_ERRED) &&
2962 		    (COLUMN_STATE(un, cbuf->cbuf_column) != RCS_LAST_ERRED))
2963 			cmn_err(CE_WARN, "md %s: write error on %s",
2964 			    md_shortname(MD_SID(un)),
2965 			    md_devname(setno, md_expldev(cbuf->cbuf_bp.b_edev),
2966 					NULL, 0));
2967 
2968 	md_unit_readerexit(ui);
2969 
2970 	ASSERT(cs->cs_frags == 0);
2971 
2972 	/* now schedule processing for possible state change */
2973 	daemon_request(&md_mstr_daemon, raid_wrerr,
2974 		(daemon_queue_t *)cs, REQ_OLD);
2975 
2976 }
2977 
2978 /*
2979  * NAME:	raid_write_ponly
2980  * DESCRIPTION: RAID metadevice write routine
2981  *		in the case where only the parity column can be written
2982  * PARAMETERS:	md_raidcs_t *cs - pointer to a child structure
2983  */
2984 static void
2985 raid_write_ponly(md_raidcs_t *cs)
2986 {
2987 	md_raidps_t	*ps;
2988 	mr_unit_t	*un = cs->cs_un;
2989 
2990 	ps = cs->cs_ps;
2991 	/* decrement pwfrags if needed, but not frags */
2992 	ASSERT(!(cs->cs_flags & MD_RCS_PWDONE));
2993 	raid_free_parent(ps, RFP_DECR_PWFRAGS);
2994 	cs->cs_flags |= MD_RCS_PWDONE;
2995 	cs->cs_frags = 1;
2996 	cs->cs_stage = RAID_WRITE_PONLY_DONE;
2997 	cs->cs_call = raid_stage;
2998 	cs->cs_error_call = raid_write_error;
2999 	cs->cs_retry_call = raid_write_no_retry;
3000 	if (WRITE_ALT(un, cs->cs_pcolumn)) {
3001 		cs->cs_frags++;
3002 		raidio(cs, RIO_ALT | RIO_EXTRA | RIO_PARITY | RIO_WRITE);
3003 	}
3004 	raidio(cs, RIO_PARITY | RIO_WRITE);
3005 }
3006 
3007 /*
3008  * NAME:	raid_write_ploop
3009  * DESCRIPTION: RAID metadevice write routine, constructs parity from
3010  *		data in other columns.
3011  * PARAMETERS:	md_raidcs_t *cs - pointer to a child structure
3012  */
3013 static void
3014 raid_write_ploop(md_raidcs_t *cs)
3015 {
3016 	mr_unit_t *un = cs->cs_un;
3017 	uint_t *dbuf;
3018 	uint_t *pbuf;
3019 	size_t wordcnt;
3020 	uint_t psum = 0;
3021 
3022 	wordcnt = cs->cs_bcount / sizeof (uint_t);
3023 	dbuf = (uint_t *)(void *)(cs->cs_dbuffer + DEV_BSIZE);
3024 	pbuf = (uint_t *)(void *)(cs->cs_pbuffer + DEV_BSIZE);
3025 	while (wordcnt--)
3026 		*pbuf++ ^= *dbuf++;
3027 	cs->cs_loop++;
3028 
3029 	/*
3030 	 * build parity from scratch using new data,
3031 	 * skip reading the data and parity columns.
3032 	 */
3033 	while (cs->cs_loop == cs->cs_dcolumn || cs->cs_loop == cs->cs_pcolumn)
3034 		cs->cs_loop++;
3035 
3036 	if (cs->cs_loop != un->un_totalcolumncnt) {
3037 		cs->cs_frags = 1;
3038 		raidio(cs, RIO_DATA | RIO_READ | (cs->cs_loop + 1));
3039 		return;
3040 	}
3041 
3042 	/* construct checksum for parity buffer */
3043 	wordcnt = cs->cs_bcount / sizeof (uint_t);
3044 	pbuf = (uint_t *)(void *)(cs->cs_pbuffer + DEV_BSIZE);
3045 	while (wordcnt--) {
3046 		psum ^= *pbuf;
3047 		pbuf++;
3048 	}
3049 	RAID_FILLIN_RPW(cs->cs_pbuffer, un, psum, -1,
3050 			cs->cs_blkno, cs->cs_blkcnt, cs->cs_pwid,
3051 			1, cs->cs_pcolumn, RAID_PWMAGIC);
3052 
3053 	cs->cs_stage = RAID_NONE;
3054 	cs->cs_call = raid_write_ponly;
3055 	cs->cs_error_call = raid_write_error;
3056 	cs->cs_retry_call = raid_write_err_retry;
3057 	cs->cs_frags = 1;
3058 	if (WRITE_ALT(un, cs->cs_pcolumn)) {
3059 		cs->cs_frags++;
3060 		raidio(cs, RIO_ALT | RIO_EXTRA | RIO_PARITY | RIO_PREWRITE);
3061 	}
3062 	raidio(cs, RIO_PARITY | RIO_PREWRITE);
3063 }
3064 
3065 /*
3066  * NAME:	raid_write_donly
3067  * DESCRIPTION: RAID metadevice write routine
3068  *		Completed writing data to prewrite entry
3069  *		in the case where only the data column can be written
3070  * PARAMETERS:	md_raidcs_t *cs - pointer to a child structure
3071  */
3072 static void
3073 raid_write_donly(md_raidcs_t *cs)
3074 {
3075 	md_raidps_t	*ps;
3076 	mr_unit_t	*un = cs->cs_un;
3077 
3078 	ps = cs->cs_ps;
3079 	/* WARNING: don't release unit reader lock here... */
3080 	/* decrement pwfrags if needed, but not frags */
3081 	ASSERT(!(cs->cs_flags & MD_RCS_PWDONE));
3082 	raid_free_parent(ps, RFP_DECR_PWFRAGS);
3083 	cs->cs_flags |= MD_RCS_PWDONE;
3084 	cs->cs_frags = 1;
3085 	cs->cs_stage = RAID_WRITE_DONLY_DONE;
3086 	cs->cs_call = raid_stage;
3087 	cs->cs_error_call = raid_write_error;
3088 	cs->cs_retry_call = raid_write_err_retry;
3089 	if (WRITE_ALT(un, cs->cs_dcolumn)) {
3090 		cs->cs_frags++;
3091 		raidio(cs, RIO_ALT | RIO_EXTRA | RIO_DATA | RIO_WRITE);
3092 	}
3093 	raidio(cs, RIO_DATA | RIO_WRITE);
3094 }
3095 
3096 /*
3097  * NAME:	raid_write_got_old
3098  * DESCRIPTION: RAID metadevice write routine
3099  *		completed read of old data and old parity
3100  * PARAMETERS:	md_raidcs_t *cs - pointer to a child structure
3101  */
3102 static void
3103 raid_write_got_old(md_raidcs_t *cs)
3104 {
3105 	mr_unit_t *un = cs->cs_un;
3106 
3107 	ASSERT(IO_READER_HELD(cs->cs_un));
3108 	ASSERT(UNIT_READER_HELD(cs->cs_un));
3109 
3110 	raid_mapin_buf(cs);
3111 	genstandardparity(cs);
3112 	cs->cs_frags = 2;
3113 	cs->cs_call = raid_stage;
3114 	cs->cs_stage = RAID_PREWRITE_DONE;
3115 	cs->cs_error_call = raid_write_error;
3116 	cs->cs_retry_call = raid_write_retry;
3117 
3118 	if (WRITE_ALT(un, cs->cs_dcolumn)) {
3119 		cs->cs_frags++;
3120 		raidio(cs, RIO_ALT | RIO_EXTRA | RIO_DATA | RIO_PREWRITE);
3121 	}
3122 
3123 	if (WRITE_ALT(un, cs->cs_pcolumn)) {
3124 		cs->cs_frags++;
3125 		raidio(cs, RIO_ALT | RIO_EXTRA | RIO_PARITY | RIO_PREWRITE);
3126 	}
3127 	ASSERT(cs->cs_frags < 4);
3128 	raidio(cs,  RIO_DATA | RIO_PREWRITE);
3129 	raidio(cs,  RIO_PARITY | RIO_PREWRITE);
3130 }
3131 
3132 /*
3133  * NAME:	raid_write_io
3134  * DESCRIPTION: RAID metadevice write I/O routine
3135  * PARAMETERS:	mr_unit_t *un -  pointer to a unit structure
3136  *		md_raidcs_t *cs - pointer to a child structure
3137  */
3138 
3139 /*ARGSUSED*/
3140 static void
3141 raid_write_io(mr_unit_t *un, md_raidcs_t *cs)
3142 {
3143 	md_raidps_t	*ps = cs->cs_ps;
3144 	uint_t		*dbuf;
3145 	uint_t		*ubuf;
3146 	size_t		wordcnt;
3147 	uint_t		dsum = 0;
3148 	int		pcheck;
3149 	int		dcheck;
3150 
3151 	ASSERT((un->un_column[cs->cs_pcolumn].un_devstate &
3152 	    RCS_INIT) == 0);
3153 	ASSERT((un->un_column[cs->cs_dcolumn].un_devstate &
3154 	    RCS_INIT) == 0);
3155 	ASSERT(IO_READER_HELD(un));
3156 	ASSERT(UNIT_READER_HELD(un));
3157 	ASSERT(cs->cs_flags & MD_RCS_HAVE_PW_SLOTS);
3158 	if (cs->cs_flags & MD_RCS_LINE) {
3159 
3160 		mr_unit_t	*un = cs->cs_un;
3161 
3162 		ASSERT(un->un_origcolumncnt == un->un_totalcolumncnt);
3163 		raid_mapin_buf(cs);
3164 		cs->cs_frags = un->un_origcolumncnt;
3165 		cs->cs_call = raid_stage;
3166 		cs->cs_error_call = raid_write_error;
3167 		cs->cs_retry_call = raid_write_no_retry;
3168 		cs->cs_stage = RAID_LINE_PWDONE;
3169 		genlineparity(cs);
3170 		return;
3171 	}
3172 
3173 	pcheck = erred_check_line(un, cs, &un->un_column[cs->cs_pcolumn]);
3174 	dcheck = erred_check_line(un, cs, &un->un_column[cs->cs_dcolumn]);
3175 	cs->cs_resync_check = pcheck << RCL_PARITY_OFFSET || dcheck;
3176 
3177 	if (pcheck == RCL_ERRED && dcheck == RCL_ERRED) {
3178 		int err = EIO;
3179 
3180 		if ((un->un_column[cs->cs_pcolumn].un_devstate ==
3181 		    RCS_LAST_ERRED) ||
3182 		    (un->un_column[cs->cs_dcolumn].un_devstate ==
3183 		    RCS_LAST_ERRED))
3184 			err = ENXIO;
3185 		raid_error_parent(ps, err);
3186 		ASSERT(!(cs->cs_flags & MD_RCS_PWDONE));
3187 		raid_free_child(cs, 1);
3188 		raid_free_parent(ps,  RFP_DECR_FRAGS
3189 		    | RFP_RLS_LOCK | RFP_DECR_PWFRAGS);
3190 		return;
3191 	}
3192 
3193 	if (pcheck & RCL_ERRED) {
3194 		/*
3195 		 * handle case of only having data drive
3196 		 */
3197 		raid_mapin_buf(cs);
3198 		wordcnt = cs->cs_bcount / sizeof (uint_t);
3199 
3200 		dbuf = (uint_t *)(void *)(cs->cs_dbuffer + DEV_BSIZE);
3201 		ubuf = (uint_t *)(void *)(cs->cs_addr);
3202 
3203 		while (wordcnt--) {
3204 			*dbuf = *ubuf;
3205 			dsum ^= *ubuf;
3206 			dbuf++;
3207 			ubuf++;
3208 		}
3209 		RAID_FILLIN_RPW(cs->cs_dbuffer, un, dsum, -1,
3210 				cs->cs_blkno, cs->cs_blkcnt, cs->cs_pwid,
3211 				1, cs->cs_dcolumn, RAID_PWMAGIC);
3212 		cs->cs_frags = 1;
3213 		cs->cs_stage = RAID_NONE;
3214 		cs->cs_call = raid_write_donly;
3215 		cs->cs_error_call = raid_write_error;
3216 		cs->cs_retry_call = raid_write_err_retry;
3217 		if (WRITE_ALT(un, cs->cs_dcolumn)) {
3218 			cs->cs_frags++;
3219 			raidio(cs, RIO_DATA | RIO_ALT | RIO_EXTRA |
3220 			    RIO_PREWRITE);
3221 		}
3222 		raidio(cs, RIO_DATA | RIO_PREWRITE);
3223 		return;
3224 	}
3225 
3226 	if (dcheck & RCL_ERRED) {
3227 		/*
3228 		 * handle case of only having parity drive
3229 		 * build parity from scratch using new data,
3230 		 * skip reading the data and parity columns.
3231 		 */
3232 		raid_mapin_buf(cs);
3233 		cs->cs_loop = 0;
3234 		while (cs->cs_loop == cs->cs_dcolumn ||
3235 		    cs->cs_loop == cs->cs_pcolumn)
3236 			cs->cs_loop++;
3237 
3238 		/* copy new data in to begin building parity */
3239 		bcopy(cs->cs_addr, cs->cs_pbuffer + DEV_BSIZE, cs->cs_bcount);
3240 		cs->cs_stage = RAID_NONE;
3241 		cs->cs_call = raid_write_ploop;
3242 		cs->cs_error_call = raid_write_error;
3243 		cs->cs_retry_call = raid_write_err_retry;
3244 		cs->cs_frags = 1;
3245 		raidio(cs, RIO_DATA | RIO_READ | (cs->cs_loop + 1));
3246 		return;
3247 	}
3248 	/*
3249 	 * handle normal cases
3250 	 * read old data and old parity
3251 	 */
3252 	cs->cs_frags = 2;
3253 	cs->cs_stage = RAID_NONE;
3254 	cs->cs_call = raid_write_got_old;
3255 	cs->cs_error_call = raid_write_error;
3256 	cs->cs_retry_call = raid_write_retry;
3257 	ASSERT(ps->ps_magic == RAID_PSMAGIC);
3258 	raidio(cs, RIO_DATA | RIO_READ);
3259 	raidio(cs, RIO_PARITY | RIO_READ);
3260 }
3261 
3262 static void
3263 raid_enqueue(md_raidcs_t *cs)
3264 {
3265 	mdi_unit_t	*ui = cs->cs_ps->ps_ui;
3266 	kmutex_t	*io_list_mutex = &ui->ui_io_lock->io_list_mutex;
3267 	md_raidcs_t	*cs1;
3268 
3269 	mutex_enter(io_list_mutex);
3270 	ASSERT(! (cs->cs_flags & MD_RCS_LLOCKD));
3271 	if (ui->ui_io_lock->io_list_front == NULL) {
3272 		ui->ui_io_lock->io_list_front = cs;
3273 		ui->ui_io_lock->io_list_back = cs;
3274 	} else {
3275 		cs1 = ui->ui_io_lock->io_list_back;
3276 		cs1->cs_linlck_next = cs;
3277 		ui->ui_io_lock->io_list_back = cs;
3278 	}
3279 	STAT_INC(raid_write_waits);
3280 	STAT_MAX(raid_max_write_q_length, raid_write_queue_length);
3281 	cs->cs_linlck_next = NULL;
3282 	mutex_exit(io_list_mutex);
3283 }
3284 
3285 /*
3286  * NAME:	raid_write
3287  * DESCRIPTION: RAID metadevice write routine
3288  * PARAMETERS:	mr_unit_t *un -  pointer to a unit structure
3289  *		md_raidcs_t *cs - pointer to a child structure
3290  */
3291 
3292 /*ARGSUSED*/
3293 static int
3294 raid_write(mr_unit_t *un, md_raidcs_t *cs)
3295 {
3296 	int		error = 0;
3297 	md_raidps_t	*ps;
3298 	mdi_unit_t	*ui;
3299 	minor_t		mnum;
3300 	clock_t		timeout;
3301 
3302 	ASSERT(IO_READER_HELD(un));
3303 	ps = cs->cs_ps;
3304 	ui = ps->ps_ui;
3305 
3306 	ASSERT(UNIT_STATE(un) != RUS_INIT);
3307 	if (UNIT_STATE(un) == RUS_LAST_ERRED)
3308 		error = EIO;
3309 
3310 	/* make sure the write doesn't go beyond the column */
3311 	if (cs->cs_blkno + cs->cs_blkcnt > un->un_segsize * un->un_segsincolumn)
3312 		error = ENXIO;
3313 	if (error)
3314 		goto werror;
3315 
3316 	getresources(cs);
3317 
3318 	/*
3319 	 * this is an advisory loop that keeps the waiting lists short
3320 	 * to reduce cpu time.  Since there is a race introduced by not
3321 	 * aquiring all the correct mutexes, use a cv_timedwait to be
3322 	 * sure the write always will wake up and start.
3323 	 */
3324 	while (raid_check_pw(cs)) {
3325 		mutex_enter(&un->un_mx);
3326 		(void) drv_getparm(LBOLT, &timeout);
3327 		timeout += md_wr_wait;
3328 		un->un_rflags |= MD_RFLAG_NEEDPW;
3329 		STAT_INC(raid_prewrite_waits);
3330 		(void) cv_timedwait(&un->un_cv, &un->un_mx, timeout);
3331 		un->un_rflags &= ~MD_RFLAG_NEEDPW;
3332 		mutex_exit(&un->un_mx);
3333 	}
3334 
3335 	if (raid_line_writer_lock(cs, 1))
3336 		return (0);
3337 
3338 	un = (mr_unit_t *)md_unit_readerlock(ui);
3339 	cs->cs_un = un;
3340 	mnum = MD_SID(un);
3341 
3342 	if (un->un_state & RUS_REGEN) {
3343 		raid_regen_parity(cs);
3344 		un = MD_UNIT(mnum);
3345 		cs->cs_un = un;
3346 	}
3347 
3348 	raid_write_io(un, cs);
3349 	return (0);
3350 werror:
3351 	/* aquire unit reader lock sinc raid_free_child always drops it */
3352 	raid_error_parent(ps, error);
3353 	raid_free_child(cs, 0);
3354 	/* decrement both pwfrags and frags */
3355 	raid_free_parent(ps, RFP_DECR_PWFRAGS | RFP_DECR_FRAGS | RFP_RLS_LOCK);
3356 	return (0);
3357 }
3358 
3359 
3360 /*
3361  * NAMES:	raid_stage
3362  * DESCRIPTION: post-processing routine for a RAID metadevice
3363  * PARAMETERS:	md_raidcs_t *cs - pointer to child structure
3364  */
3365 static void
3366 raid_stage(md_raidcs_t *cs)
3367 {
3368 	md_raidps_t	*ps = cs->cs_ps;
3369 	mr_unit_t	*un = cs->cs_un;
3370 	md_raidcbuf_t	*cbuf;
3371 	buf_t		*bp;
3372 	void		*private;
3373 	int		flag;
3374 
3375 	switch (cs->cs_stage) {
3376 	    case RAID_READ_DONE:
3377 		raid_free_child(cs, 1);
3378 		/* decrement readfrags */
3379 		raid_free_parent(ps, RFP_DECR_READFRAGS | RFP_RLS_LOCK);
3380 		return;
3381 
3382 	    case RAID_WRITE_DONE:
3383 	    case RAID_WRITE_PONLY_DONE:
3384 	    case RAID_WRITE_DONLY_DONE:
3385 		/*
3386 		 *  Completed writing real parity and/or data.
3387 		 */
3388 		ASSERT(cs->cs_flags & MD_RCS_PWDONE);
3389 		raid_free_child(cs, 1);
3390 		/* decrement frags but not pwfrags */
3391 		raid_free_parent(ps, RFP_DECR_FRAGS | RFP_RLS_LOCK);
3392 		return;
3393 
3394 	    case RAID_PREWRITE_DONE:
3395 		/*
3396 		 * completed writing data and parity to prewrite entries
3397 		 */
3398 		/*
3399 		 * WARNING: don't release unit reader lock here..
3400 		 * decrement pwfrags but not frags
3401 		 */
3402 		raid_free_parent(ps, RFP_DECR_PWFRAGS);
3403 		cs->cs_flags |= MD_RCS_PWDONE;
3404 		cs->cs_frags = 2;
3405 		cs->cs_stage = RAID_WRITE_DONE;
3406 		cs->cs_call = raid_stage;
3407 		cs->cs_error_call = raid_write_error;
3408 		cs->cs_retry_call = raid_write_no_retry;
3409 		if (WRITE_ALT(un, cs->cs_pcolumn)) {
3410 			cs->cs_frags++;
3411 			raidio(cs, RIO_ALT | RIO_EXTRA | RIO_PARITY |
3412 			    RIO_WRITE);
3413 		}
3414 		if (WRITE_ALT(un, cs->cs_dcolumn)) {
3415 			cs->cs_frags++;
3416 			raidio(cs, RIO_ALT | RIO_EXTRA | RIO_DATA | RIO_WRITE);
3417 		}
3418 		ASSERT(cs->cs_frags < 4);
3419 		raidio(cs, RIO_DATA | RIO_WRITE);
3420 		raidio(cs, RIO_PARITY | RIO_WRITE);
3421 		if (cs->cs_pw_inval_list) {
3422 			raid_free_pwinvalidate(cs);
3423 		}
3424 		return;
3425 
3426 	    case RAID_LINE_PWDONE:
3427 		ASSERT(cs->cs_frags == 0);
3428 		raid_free_parent(ps, RFP_DECR_PWFRAGS);
3429 		cs->cs_flags |= MD_RCS_PWDONE;
3430 		cs->cs_frags = un->un_origcolumncnt;
3431 		cs->cs_call = raid_stage;
3432 		cs->cs_error_call = raid_write_error;
3433 		cs->cs_retry_call = raid_write_no_retry;
3434 		cs->cs_stage = RAID_WRITE_DONE;
3435 		for (cbuf = cs->cs_buflist; cbuf; cbuf = cbuf->cbuf_next) {
3436 			/*
3437 			 * fill in buffer for write to prewrite area
3438 			 */
3439 			bp = &cbuf->cbuf_bp;
3440 			bp->b_back = bp;
3441 			bp->b_forw = bp;
3442 			bp->b_un.b_addr = cbuf->cbuf_buffer + DEV_BSIZE;
3443 			bp->b_bcount = cbuf->cbuf_bcount;
3444 			bp->b_bufsize = cbuf->cbuf_bcount;
3445 			bp->b_lblkno =
3446 			    un->un_column[cbuf->cbuf_column].un_devstart +
3447 			    cs->cs_blkno;
3448 			bp->b_flags &= ~(B_READ | B_WRITE | B_ERROR);
3449 			bp->b_flags &= ~nv_available;
3450 			bp->b_flags |= B_WRITE | B_BUSY;
3451 			bp->b_iodone = (int (*)())raid_done;
3452 			bp->b_edev = md_dev64_to_dev(
3453 				un->un_column[cbuf->cbuf_column].un_dev);
3454 			bp->b_chain = (struct buf *)cs;
3455 			private = cs->cs_strategy_private;
3456 			flag = cs->cs_strategy_flag;
3457 			md_call_strategy(bp, flag, private);
3458 		}
3459 		raidio(cs, RIO_DATA | RIO_WRITE);
3460 		raidio(cs, RIO_PARITY | RIO_WRITE);
3461 		if (cs->cs_pw_inval_list) {
3462 			raid_free_pwinvalidate(cs);
3463 		}
3464 		return;
3465 
3466 	    default:
3467 		ASSERT(0);
3468 		break;
3469 	}
3470 }
3471 /*
3472  * NAME:	md_raid_strategy
3473  * DESCRIPTION: RAID metadevice I/O oprations entry point.
3474  * PARAMETERS:	buf_t	  *pb - pointer to a user I/O buffer
3475  *		int	 flag - metadevice specific flag
3476  *		void *private - carry over flag ??
3477  *
3478  */
3479 
3480 void
3481 md_raid_strategy(buf_t *pb, int flag, void *private)
3482 {
3483 	md_raidps_t	*ps;
3484 	md_raidcs_t	*cs;
3485 	int		doing_writes;
3486 	int		err;
3487 	mr_unit_t	*un;
3488 	mdi_unit_t	*ui;
3489 	size_t		count;
3490 	diskaddr_t	blkno;
3491 	caddr_t		addr;
3492 	off_t		offset;
3493 	int		colcnt;
3494 	minor_t		mnum;
3495 	set_t		setno;
3496 
3497 	ui = MDI_UNIT(getminor(pb->b_edev));
3498 	md_kstat_waitq_enter(ui);
3499 	un = (mr_unit_t *)md_io_readerlock(ui);
3500 	setno = MD_MIN2SET(getminor(pb->b_edev));
3501 
3502 	if ((flag & MD_NOBLOCK) == 0) {
3503 		if (md_inc_iocount(setno) != 0) {
3504 			pb->b_flags |= B_ERROR;
3505 			pb->b_error = ENXIO;
3506 			pb->b_resid = pb->b_bcount;
3507 			md_io_readerexit(ui);
3508 			biodone(pb);
3509 			return;
3510 		}
3511 	} else {
3512 		md_inc_iocount_noblock(setno);
3513 	}
3514 
3515 	mnum = MD_SID(un);
3516 	colcnt = un->un_totalcolumncnt - 1;
3517 	count = pb->b_bcount;
3518 
3519 	STAT_CHECK(raid_512, count == 512);
3520 	STAT_CHECK(raid_1024, count == 1024);
3521 	STAT_CHECK(raid_1024_8192, count > 1024 && count < 8192);
3522 	STAT_CHECK(raid_8192, count == 8192);
3523 	STAT_CHECK(raid_8192_bigger, count > 8192);
3524 
3525 	(void *) md_unit_readerlock(ui);
3526 	if (!(flag & MD_STR_NOTTOP)) {
3527 		err = md_checkbuf(ui, (md_unit_t *)un, pb); /* check and map */
3528 		if (err != 0) {
3529 			md_kstat_waitq_exit(ui);
3530 			md_io_readerexit(ui);
3531 			return;
3532 		}
3533 	}
3534 	md_unit_readerexit(ui);
3535 
3536 	STAT_INC(raid_total_io);
3537 
3538 	/* allocate a parent structure for the user I/O */
3539 	ps = kmem_cache_alloc(raid_parent_cache, MD_ALLOCFLAGS);
3540 	raid_parent_init(ps);
3541 
3542 	/*
3543 	 * Save essential information from the original buffhdr
3544 	 * in the md_save structure.
3545 	 */
3546 	ps->ps_un = un;
3547 	ps->ps_ui = ui;
3548 	ps->ps_bp = pb;
3549 	ps->ps_addr = pb->b_un.b_addr;
3550 
3551 	if ((pb->b_flags & B_READ) == 0) {
3552 		ps->ps_flags |= MD_RPS_WRITE;
3553 		doing_writes = 1;
3554 		STAT_INC(raid_writes);
3555 	} else {
3556 		ps->ps_flags |= MD_RPS_READ;
3557 		doing_writes = 0;
3558 		STAT_INC(raid_reads);
3559 	}
3560 
3561 	count = lbtodb(pb->b_bcount);	/* transfer count (in blocks) */
3562 	blkno = pb->b_lblkno;		/* block number on device */
3563 	addr  = 0;
3564 	offset = 0;
3565 	ps->ps_pwfrags = 1;
3566 	ps->ps_frags = 1;
3567 	md_kstat_waitq_to_runq(ui);
3568 
3569 	do {
3570 		cs = kmem_cache_alloc(raid_child_cache, MD_ALLOCFLAGS);
3571 		raid_child_init(cs);
3572 		cs->cs_ps = ps;
3573 		cs->cs_un = un;
3574 		cs->cs_mdunit = mnum;
3575 		cs->cs_strategy_flag = flag;
3576 		cs->cs_strategy_private = private;
3577 		cs->cs_addr = addr;
3578 		cs->cs_offset = offset;
3579 		count = raid_iosetup(un, blkno, count, cs);
3580 		if (cs->cs_flags & MD_RCS_LINE) {
3581 			blkno += (cs->cs_blkcnt * colcnt);
3582 			offset += (cs->cs_bcount * colcnt);
3583 		} else {
3584 			blkno +=  cs->cs_blkcnt;
3585 			offset += cs->cs_bcount;
3586 		}
3587 		/* for each cs bump up the ps_pwfrags and ps_frags fields */
3588 		if (count) {
3589 			mutex_enter(&ps->ps_mx);
3590 			ps->ps_pwfrags++;
3591 			ps->ps_frags++;
3592 			mutex_exit(&ps->ps_mx);
3593 			if (doing_writes)
3594 				(void) raid_write(un, cs);
3595 			else
3596 				(void) raid_read(un, cs);
3597 		}
3598 	} while (count);
3599 	if (doing_writes) {
3600 		(void) raid_write(un, cs);
3601 	} else
3602 		(void) raid_read(un, cs);
3603 
3604 	if (! (flag & MD_STR_NOTTOP) && panicstr) {
3605 		while (! (ps->ps_flags & MD_RPS_DONE)) {
3606 			md_daemon(1, &md_done_daemon);
3607 			drv_usecwait(10);
3608 		}
3609 		kmem_cache_free(raid_parent_cache, ps);
3610 	}
3611 }
3612 
3613 /*
3614  * NAMES:	raid_snarf
3615  * DESCRIPTION: RAID metadevice SNARF entry point
3616  * PARAMETERS:	md_snarfcmd_t cmd,
3617  *		set_t setno
3618  * RETURNS:
3619  */
3620 static int
3621 raid_snarf(md_snarfcmd_t cmd, set_t setno)
3622 {
3623 	mr_unit_t	*un;
3624 	mddb_recid_t	recid;
3625 	int		gotsomething;
3626 	int		all_raid_gotten;
3627 	mddb_type_t	typ1;
3628 	uint_t		ncol;
3629 	mddb_de_ic_t	*dep;
3630 	mddb_rb32_t	*rbp;
3631 	size_t		newreqsize;
3632 	mr_unit_t	*big_un;
3633 	mr_unit32_od_t	*small_un;
3634 
3635 
3636 	if (cmd == MD_SNARF_CLEANUP)
3637 		return (0);
3638 
3639 	all_raid_gotten = 1;
3640 	gotsomething = 0;
3641 	typ1 = (mddb_type_t)md_getshared_key(setno,
3642 	    raid_md_ops.md_driver.md_drivername);
3643 	recid = mddb_makerecid(setno, 0);
3644 
3645 	while ((recid = mddb_getnextrec(recid, typ1, 0)) > 0) {
3646 		if (mddb_getrecprivate(recid) & MD_PRV_GOTIT) {
3647 			continue;
3648 		}
3649 
3650 		dep = mddb_getrecdep(recid);
3651 		dep->de_flags = MDDB_F_RAID;
3652 		rbp = dep->de_rb;
3653 		if ((rbp->rb_revision == MDDB_REV_RB) &&
3654 		    ((rbp->rb_private & MD_PRV_CONVD) == 0)) {
3655 			/*
3656 			 * This means, we have an old and small record
3657 			 * and this record hasn't already been converted.
3658 			 * Before we create an incore metadevice from this
3659 			 * we have to convert it to a big record.
3660 			 */
3661 			small_un = (mr_unit32_od_t *)mddb_getrecaddr(recid);
3662 			ncol = small_un->un_totalcolumncnt;
3663 			newreqsize = sizeof (mr_unit_t) +
3664 				((ncol - 1) * sizeof (mr_column_t));
3665 			big_un = (mr_unit_t *)kmem_zalloc(newreqsize, KM_SLEEP);
3666 			raid_convert((caddr_t)small_un, (caddr_t)big_un,
3667 				SMALL_2_BIG);
3668 			kmem_free(small_un, dep->de_reqsize);
3669 			dep->de_rb_userdata = big_un;
3670 			dep->de_reqsize = newreqsize;
3671 			un = big_un;
3672 			rbp->rb_private |= MD_PRV_CONVD;
3673 		} else {
3674 			/* Big device */
3675 			un = (mr_unit_t *)mddb_getrecaddr(recid);
3676 		}
3677 
3678 		/* Set revision and flag accordingly */
3679 		if (rbp->rb_revision == MDDB_REV_RB) {
3680 			un->c.un_revision = MD_32BIT_META_DEV;
3681 		} else {
3682 			un->c.un_revision = MD_64BIT_META_DEV;
3683 			un->c.un_flag |= MD_EFILABEL;
3684 		}
3685 
3686 		/*
3687 		 * Create minor device node for snarfed entry.
3688 		 */
3689 		(void) md_create_minor_node(MD_MIN2SET(MD_SID(un)), MD_SID(un));
3690 
3691 		if (MD_UNIT(MD_SID(un)) != NULL) {
3692 			mddb_setrecprivate(recid, MD_PRV_PENDDEL);
3693 			continue;
3694 		}
3695 		all_raid_gotten = 0;
3696 		if (raid_build_incore((void *)un, 1) == 0) {
3697 			mddb_setrecprivate(recid, MD_PRV_GOTIT);
3698 			md_create_unit_incore(MD_SID(un), &raid_md_ops,
3699 			    1);
3700 			gotsomething = 1;
3701 		} else if (un->mr_ic) {
3702 			kmem_free(un->un_column_ic, sizeof (mr_column_ic_t) *
3703 				un->un_totalcolumncnt);
3704 			kmem_free(un->mr_ic, sizeof (*un->mr_ic));
3705 		}
3706 	}
3707 
3708 	if (!all_raid_gotten) {
3709 		return (gotsomething);
3710 	}
3711 
3712 	recid = mddb_makerecid(setno, 0);
3713 	while ((recid = mddb_getnextrec(recid, typ1, 0)) > 0)
3714 		if (!(mddb_getrecprivate(recid) & MD_PRV_GOTIT))
3715 			mddb_setrecprivate(recid, MD_PRV_PENDDEL);
3716 
3717 	return (0);
3718 }
3719 
3720 /*
3721  * NAMES:	raid_halt
3722  * DESCRIPTION: RAID metadevice HALT entry point
3723  * PARAMETERS:	md_haltcmd_t cmd -
3724  *		set_t	setno -
3725  * RETURNS:
3726  */
3727 static int
3728 raid_halt(md_haltcmd_t cmd, set_t setno)
3729 {
3730 	set_t		i;
3731 	mdi_unit_t	*ui;
3732 	minor_t		mnum;
3733 
3734 	if (cmd == MD_HALT_CLOSE)
3735 		return (0);
3736 
3737 	if (cmd == MD_HALT_OPEN)
3738 		return (0);
3739 
3740 	if (cmd == MD_HALT_UNLOAD)
3741 		return (0);
3742 
3743 	if (cmd == MD_HALT_CHECK) {
3744 		for (i = 0; i < md_nunits; i++) {
3745 			mnum = MD_MKMIN(setno, i);
3746 			if ((ui = MDI_UNIT(mnum)) == NULL)
3747 				continue;
3748 			if (ui->ui_opsindex != raid_md_ops.md_selfindex)
3749 				continue;
3750 			if (md_unit_isopen(ui))
3751 				return (1);
3752 		}
3753 		return (0);
3754 	}
3755 
3756 	if (cmd != MD_HALT_DOIT)
3757 		return (1);
3758 
3759 	for (i = 0; i < md_nunits; i++) {
3760 		mnum = MD_MKMIN(setno, i);
3761 		if ((ui = MDI_UNIT(mnum)) == NULL)
3762 			continue;
3763 		if (ui->ui_opsindex != raid_md_ops.md_selfindex)
3764 			continue;
3765 		reset_raid((mr_unit_t *)MD_UNIT(mnum), mnum, 0);
3766 	}
3767 	return (0);
3768 }
3769 
3770 /*
3771  * NAMES:	raid_close_all_devs
3772  * DESCRIPTION: Close all the devices of the unit.
3773  * PARAMETERS:	mr_unit_t *un - pointer to unit structure
3774  * RETURNS:
3775  */
3776 void
3777 raid_close_all_devs(mr_unit_t *un, int init_pw, int md_cflags)
3778 {
3779 	int		i;
3780 	mr_column_t	*device;
3781 
3782 	for (i = 0; i < un->un_totalcolumncnt; i++) {
3783 		device = &un->un_column[i];
3784 		if (device->un_devflags & MD_RAID_DEV_ISOPEN) {
3785 			ASSERT((device->un_dev != (md_dev64_t)0) &&
3786 			    (device->un_dev != NODEV64));
3787 			if ((device->un_devstate & RCS_OKAY) && init_pw)
3788 				(void) init_pw_area(un, device->un_dev,
3789 							device->un_pwstart, i);
3790 			md_layered_close(device->un_dev, md_cflags);
3791 			device->un_devflags &= ~MD_RAID_DEV_ISOPEN;
3792 		}
3793 	}
3794 }
3795 
3796 /*
3797  * NAMES:	raid_open_all_devs
3798  * DESCRIPTION: Open all the components (columns) of the device unit.
3799  * PARAMETERS:	mr_unit_t *un - pointer to unit structure
3800  * RETURNS:
3801  */
3802 static int
3803 raid_open_all_devs(mr_unit_t *un, int md_oflags)
3804 {
3805 	minor_t		mnum = MD_SID(un);
3806 	int		i;
3807 	int		not_opened = 0;
3808 	int		commit = 0;
3809 	int		col = -1;
3810 	mr_column_t	*device;
3811 	set_t		setno = MD_MIN2SET(MD_SID(un));
3812 	side_t		side = mddb_getsidenum(setno);
3813 	mdkey_t		key;
3814 	mdi_unit_t	*ui = MDI_UNIT(mnum);
3815 
3816 	ui->ui_tstate &= ~MD_INACCESSIBLE;
3817 
3818 	for (i = 0; i < un->un_totalcolumncnt; i++) {
3819 		md_dev64_t tmpdev;
3820 
3821 		device = &un->un_column[i];
3822 
3823 		if (COLUMN_STATE(un, i) & RCS_ERRED) {
3824 			not_opened++;
3825 			continue;
3826 		}
3827 
3828 		if (device->un_devflags & MD_RAID_DEV_ISOPEN)
3829 			continue;
3830 
3831 		tmpdev = device->un_dev;
3832 		/*
3833 		 * Open by device id
3834 		 */
3835 		key = HOTSPARED(un, i) ?
3836 			device->un_hs_key : device->un_orig_key;
3837 		if ((md_getmajor(tmpdev) != md_major) &&
3838 			md_devid_found(setno, side, key) == 1) {
3839 			tmpdev = md_resolve_bydevid(mnum, tmpdev, key);
3840 		}
3841 		if (md_layered_open(mnum, &tmpdev, md_oflags)) {
3842 			device->un_dev = tmpdev;
3843 			not_opened++;
3844 			continue;
3845 		}
3846 		device->un_dev = tmpdev;
3847 		device->un_devflags |= MD_RAID_DEV_ISOPEN;
3848 	}
3849 
3850 	/* if open errors and errored devices are 1 then device can run */
3851 	if (not_opened > 1) {
3852 		cmn_err(CE_WARN,
3853 		"md: %s failed to open. open error on %s\n",
3854 			md_shortname(MD_SID(un)),
3855 			md_devname(MD_UN2SET(un), device->un_orig_dev,
3856 					NULL, 0));
3857 
3858 		ui->ui_tstate |= MD_INACCESSIBLE;
3859 
3860 		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_OPEN_FAIL, SVM_TAG_METADEVICE,
3861 		    MD_UN2SET(un), MD_SID(un));
3862 
3863 		return (not_opened > 1);
3864 	}
3865 
3866 	for (i = 0; i < un->un_totalcolumncnt; i++) {
3867 		device = &un->un_column[i];
3868 		if (device->un_devflags & MD_RAID_DEV_ISOPEN) {
3869 			if (device->un_devstate & RCS_LAST_ERRED) {
3870 			/*
3871 			 * At this point in time there is a possibility
3872 			 * that errors were the result of a controller
3873 			 * failure with more than a single column on it
3874 			 * so clear out last errored columns and let errors
3875 			 * re-occur is necessary.
3876 			 */
3877 				raid_set_state(un, i, RCS_OKAY, 0);
3878 				commit++;
3879 			}
3880 			continue;
3881 		}
3882 		ASSERT(col == -1);
3883 		col = i;
3884 	}
3885 
3886 	if (col != -1) {
3887 		raid_set_state(un, col, RCS_ERRED, 0);
3888 		commit++;
3889 	}
3890 
3891 	if (commit)
3892 		raid_commit(un, NULL);
3893 
3894 	if (col != -1) {
3895 		if (COLUMN_STATE(un, col) & RCS_ERRED) {
3896 			SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED,
3897 			    SVM_TAG_METADEVICE, MD_UN2SET(un), MD_SID(un));
3898 		} else if (COLUMN_STATE(un, col) & RCS_LAST_ERRED) {
3899 			SE_NOTIFY(EC_SVM_STATE, ESC_SVM_LASTERRED,
3900 			    SVM_TAG_METADEVICE, MD_UN2SET(un), MD_SID(un));
3901 		}
3902 	}
3903 
3904 	return (0);
3905 }
3906 
3907 /*
3908  * NAMES:	raid_internal_open
3909  * DESCRIPTION: Do the actual RAID open
3910  * PARAMETERS:	minor_t mnum - minor number of the RAID device
3911  *		int flag -
3912  *		int otyp -
3913  *		int md_oflags - RAID open flags
3914  * RETURNS:	0 if successful, nonzero otherwise
3915  */
3916 int
3917 raid_internal_open(minor_t mnum, int flag, int otyp, int md_oflags)
3918 {
3919 	mr_unit_t	*un;
3920 	mdi_unit_t	*ui;
3921 	int		err = 0;
3922 	int		replay_error = 0;
3923 
3924 	ui = MDI_UNIT(mnum);
3925 	ASSERT(ui != NULL);
3926 
3927 	un = (mr_unit_t *)md_unit_openclose_enter(ui);
3928 	/*
3929 	 * this MUST be checked before md_unit_isopen is checked.
3930 	 * raid_init_columns sets md_unit_isopen to block reset, halt.
3931 	 */
3932 	if ((UNIT_STATE(un) & (RUS_INIT | RUS_DOI)) &&
3933 			!(md_oflags & MD_OFLG_ISINIT)) {
3934 		md_unit_openclose_exit(ui);
3935 		return (EAGAIN);
3936 	}
3937 
3938 	if ((md_oflags & MD_OFLG_ISINIT) || md_unit_isopen(ui)) {
3939 		err = md_unit_incopen(mnum, flag, otyp);
3940 		goto out;
3941 	}
3942 
3943 	md_unit_readerexit(ui);
3944 
3945 	un = (mr_unit_t *)md_unit_writerlock(ui);
3946 	if (raid_open_all_devs(un, md_oflags) == 0) {
3947 		if ((err = md_unit_incopen(mnum, flag, otyp)) != 0) {
3948 			md_unit_writerexit(ui);
3949 			un = (mr_unit_t *)md_unit_readerlock(ui);
3950 			raid_close_all_devs(un, 0, md_oflags);
3951 			goto out;
3952 		}
3953 	} else {
3954 		/*
3955 		 * if this unit contains more than two errored components
3956 		 * should return error and close all opened devices
3957 		 */
3958 
3959 		md_unit_writerexit(ui);
3960 		un = (mr_unit_t *)md_unit_readerlock(ui);
3961 		raid_close_all_devs(un, 0, md_oflags);
3962 		md_unit_openclose_exit(ui);
3963 		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_OPEN_FAIL, SVM_TAG_METADEVICE,
3964 		    MD_UN2SET(un), MD_SID(un));
3965 		return (ENXIO);
3966 	}
3967 
3968 	if (!(MD_STATUS(un) & MD_UN_REPLAYED)) {
3969 		replay_error = raid_replay(un);
3970 		MD_STATUS(un) |= MD_UN_REPLAYED;
3971 	}
3972 
3973 	md_unit_writerexit(ui);
3974 	un = (mr_unit_t *)md_unit_readerlock(ui);
3975 
3976 	if ((replay_error == RAID_RPLY_READONLY) &&
3977 	    ((flag & (FREAD | FWRITE)) == FREAD)) {
3978 		md_unit_openclose_exit(ui);
3979 		return (0);
3980 	}
3981 
3982 	/* allocate hotspare if possible */
3983 	(void) raid_hotspares();
3984 
3985 
3986 out:
3987 	md_unit_openclose_exit(ui);
3988 	return (err);
3989 }
3990 /*
3991  * NAMES:	raid_open
3992  * DESCRIPTION: RAID metadevice OPEN entry point
3993  * PARAMETERS:	dev_t dev -
3994  *		int flag -
3995  *		int otyp -
3996  *		cred_t * cred_p -
3997  *		int md_oflags -
3998  * RETURNS:
3999  */
4000 /*ARGSUSED1*/
4001 static int
4002 raid_open(dev_t *dev, int flag, int otyp, cred_t *cred_p, int md_oflags)
4003 {
4004 	int		error = 0;
4005 
4006 	if (error = raid_internal_open(getminor(*dev), flag, otyp, md_oflags)) {
4007 		return (error);
4008 	}
4009 	return (0);
4010 }
4011 
4012 /*
4013  * NAMES:	raid_internal_close
4014  * DESCRIPTION: RAID metadevice CLOSE actual implementation
4015  * PARAMETERS:	minor_t - minor number of the RAID device
4016  *		int otyp -
4017  *		int init_pw -
4018  *		int md_cflags - RAID close flags
4019  * RETURNS:	0 if successful, nonzero otherwise
4020  */
4021 /*ARGSUSED*/
4022 int
4023 raid_internal_close(minor_t mnum, int otyp, int init_pw, int md_cflags)
4024 {
4025 	mdi_unit_t	*ui = MDI_UNIT(mnum);
4026 	mr_unit_t	*un;
4027 	int		err = 0;
4028 
4029 	/* single thread */
4030 	un = (mr_unit_t *)md_unit_openclose_enter(ui);
4031 
4032 	/* count closed */
4033 	if ((err = md_unit_decopen(mnum, otyp)) != 0)
4034 		goto out;
4035 	/* close devices, if necessary */
4036 	if (! md_unit_isopen(ui) || (md_cflags & MD_OFLG_PROBEDEV)) {
4037 		raid_close_all_devs(un, init_pw, md_cflags);
4038 	}
4039 
4040 	/* unlock, return success */
4041 out:
4042 	md_unit_openclose_exit(ui);
4043 	return (err);
4044 }
4045 
4046 /*
4047  * NAMES:	raid_close
4048  * DESCRIPTION: RAID metadevice close entry point
4049  * PARAMETERS:	dev_t dev -
4050  *		int flag -
4051  *		int otyp -
4052  *		cred_t * cred_p -
4053  *		int md_oflags -
4054  * RETURNS:
4055  */
4056 /*ARGSUSED1*/
4057 static int
4058 raid_close(dev_t dev, int flag, int otyp, cred_t *cred_p, int md_cflags)
4059 {
4060 	int retval;
4061 
4062 	(void) md_io_writerlock(MDI_UNIT(getminor(dev)));
4063 	retval = raid_internal_close(getminor(dev), otyp, 1, md_cflags);
4064 	(void) md_io_writerexit(MDI_UNIT(getminor(dev)));
4065 	return (retval);
4066 }
4067 
4068 /*
4069  * raid_probe_close_all_devs
4070  */
4071 void
4072 raid_probe_close_all_devs(mr_unit_t *un)
4073 {
4074 	int		i;
4075 	mr_column_t	*device;
4076 
4077 	for (i = 0; i < un->un_totalcolumncnt; i++) {
4078 		device = &un->un_column[i];
4079 
4080 		if (device->un_devflags & MD_RAID_DEV_PROBEOPEN) {
4081 			md_layered_close(device->un_dev,
4082 				MD_OFLG_PROBEDEV);
4083 			device->un_devflags &= ~MD_RAID_DEV_PROBEOPEN;
4084 		}
4085 	}
4086 }
4087 /*
4088  * Raid_probe_dev:
4089  *
4090  * On entry the unit writerlock is held
4091  */
4092 static int
4093 raid_probe_dev(mdi_unit_t *ui, minor_t mnum)
4094 {
4095 	mr_unit_t	*un;
4096 	int		i;
4097 	int		not_opened = 0;
4098 	int		commit = 0;
4099 	int		col = -1;
4100 	mr_column_t	*device;
4101 	int		md_devopen = 0;
4102 
4103 	if (md_unit_isopen(ui))
4104 		md_devopen++;
4105 
4106 	un = MD_UNIT(mnum);
4107 	/*
4108 	 * If the state has been set to LAST_ERRED because
4109 	 * of an error when the raid device was open at some
4110 	 * point in the past, don't probe. We really don't want
4111 	 * to reset the state in this case.
4112 	 */
4113 	if (UNIT_STATE(un) == RUS_LAST_ERRED)
4114 		return (0);
4115 
4116 	ui->ui_tstate &= ~MD_INACCESSIBLE;
4117 
4118 	for (i = 0; i < un->un_totalcolumncnt; i++) {
4119 		md_dev64_t tmpdev;
4120 
4121 		device = &un->un_column[i];
4122 		if (COLUMN_STATE(un, i) & RCS_ERRED) {
4123 			not_opened++;
4124 			continue;
4125 		}
4126 
4127 		tmpdev = device->un_dev;
4128 		/*
4129 		 * Currently the flags passed are not needed since
4130 		 * there cannot be an underlying metadevice. However
4131 		 * they are kept here for consistency.
4132 		 *
4133 		 * Open by device id
4134 		 */
4135 		tmpdev = md_resolve_bydevid(mnum, tmpdev, HOTSPARED(un, i)?
4136 			device->un_hs_key : device->un_orig_key);
4137 		if (md_layered_open(mnum, &tmpdev,
4138 				MD_OFLG_CONT_ERRS | MD_OFLG_PROBEDEV)) {
4139 			device->un_dev = tmpdev;
4140 			not_opened++;
4141 			continue;
4142 		}
4143 		device->un_dev = tmpdev;
4144 
4145 		device->un_devflags |= MD_RAID_DEV_PROBEOPEN;
4146 	}
4147 
4148 	/*
4149 	 * The code below is careful on setting the LAST_ERRED state.
4150 	 *
4151 	 * If open errors and exactly one device has failed we can run.
4152 	 * If more then one device fails we have to figure out when to set
4153 	 * LAST_ERRED state.  The rationale is to avoid unnecessary resyncs
4154 	 * since they are painful and time consuming.
4155 	 *
4156 	 * When more than one component/column fails there are 2 scenerios.
4157 	 *
4158 	 * 1. Metadevice has NOT been opened: In this case, the behavior
4159 	 *    mimics the open symantics. ie. Only the first failed device
4160 	 *    is ERRED and LAST_ERRED is not set.
4161 	 *
4162 	 * 2. Metadevice has been opened: Here the read/write sematics are
4163 	 *    followed. The first failed devicce is ERRED and on the next
4164 	 *    failed device LAST_ERRED is set.
4165 	 */
4166 
4167 	if (not_opened > 1 && !md_devopen) {
4168 		cmn_err(CE_WARN,
4169 			"md: %s failed to open. open error on %s\n",
4170 				md_shortname(MD_SID(un)),
4171 				md_devname(MD_UN2SET(un), device->un_orig_dev,
4172 						NULL, 0));
4173 		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_OPEN_FAIL, SVM_TAG_METADEVICE,
4174 		    MD_UN2SET(un), MD_SID(un));
4175 		raid_probe_close_all_devs(un);
4176 		ui->ui_tstate |= MD_INACCESSIBLE;
4177 		return (not_opened > 1);
4178 	}
4179 
4180 	if (!md_devopen) {
4181 		for (i = 0; i < un->un_totalcolumncnt; i++) {
4182 			device = &un->un_column[i];
4183 			if (device->un_devflags & MD_RAID_DEV_PROBEOPEN) {
4184 				if (device->un_devstate & RCS_LAST_ERRED) {
4185 					/*
4186 					 * At this point in time there is a
4187 					 * possibility that errors were the
4188 					 * result of a controller failure with
4189 					 * more than a single column on it so
4190 					 * clear out last errored columns and
4191 					 * let errors re-occur is necessary.
4192 					 */
4193 					raid_set_state(un, i, RCS_OKAY, 0);
4194 					commit++;
4195 					}
4196 				continue;
4197 			}
4198 			ASSERT(col == -1);
4199 			/*
4200 			 * note if multiple devices are failing then only
4201 			 * the last one is marked as error
4202 			 */
4203 			col = i;
4204 		}
4205 
4206 		if (col != -1) {
4207 			raid_set_state(un, col, RCS_ERRED, 0);
4208 			commit++;
4209 		}
4210 
4211 	} else {
4212 		for (i = 0; i < un->un_totalcolumncnt; i++) {
4213 			device = &un->un_column[i];
4214 
4215 			/* if we have LAST_ERRED go ahead and commit. */
4216 			if (un->un_state & RUS_LAST_ERRED)
4217 				break;
4218 			/*
4219 			 * could not open the component
4220 			 */
4221 
4222 			if (!(device->un_devflags & MD_RAID_DEV_PROBEOPEN)) {
4223 				col = i;
4224 				raid_set_state(un, col, RCS_ERRED, 0);
4225 				commit++;
4226 			}
4227 		}
4228 	}
4229 
4230 	if (commit)
4231 		raid_commit(un, NULL);
4232 
4233 	if (col != -1) {
4234 		if (COLUMN_STATE(un, col) & RCS_ERRED) {
4235 			SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED,
4236 			    SVM_TAG_METADEVICE, MD_UN2SET(un), MD_SID(un));
4237 		} else if (COLUMN_STATE(un, col) & RCS_LAST_ERRED) {
4238 			SE_NOTIFY(EC_SVM_STATE, ESC_SVM_LASTERRED,
4239 			    SVM_TAG_METADEVICE, MD_UN2SET(un), MD_SID(un));
4240 		}
4241 	}
4242 
4243 	raid_probe_close_all_devs(un);
4244 	return (0);
4245 }
4246 
4247 static int
4248 raid_imp_set(
4249 	set_t	setno
4250 )
4251 {
4252 	mddb_recid_t    recid;
4253 	int		i, gotsomething;
4254 	mddb_type_t	typ1;
4255 	mddb_de_ic_t	*dep;
4256 	mddb_rb32_t	*rbp;
4257 	mr_unit_t	*un64;
4258 	mr_unit32_od_t	*un32;
4259 	minor_t		*self_id;	/* minor needs to be updated */
4260 	md_parent_t	*parent_id;	/* parent needs to be updated */
4261 	mddb_recid_t	*record_id;	 /* record id needs to be updated */
4262 	hsp_t		*hsp_id;
4263 
4264 	gotsomething = 0;
4265 
4266 	typ1 = (mddb_type_t)md_getshared_key(setno,
4267 	    raid_md_ops.md_driver.md_drivername);
4268 	recid = mddb_makerecid(setno, 0);
4269 
4270 	while ((recid = mddb_getnextrec(recid, typ1, 0)) > 0) {
4271 		if (mddb_getrecprivate(recid) & MD_PRV_GOTIT)
4272 			continue;
4273 
4274 		dep = mddb_getrecdep(recid);
4275 		rbp = dep->de_rb;
4276 
4277 		if (rbp->rb_revision == MDDB_REV_RB) {
4278 			/*
4279 			 * Small device
4280 			 */
4281 			un32 = (mr_unit32_od_t *)mddb_getrecaddr(recid);
4282 			self_id = &(un32->c.un_self_id);
4283 			parent_id = &(un32->c.un_parent);
4284 			record_id = &(un32->c.un_record_id);
4285 			hsp_id = &(un32->un_hsp_id);
4286 
4287 			for (i = 0; i < un32->un_totalcolumncnt; i++) {
4288 			    mr_column32_od_t *device;
4289 
4290 			    device = &un32->un_column[i];
4291 			    if (!md_update_minor(setno, mddb_getsidenum
4292 				(setno), device->un_orig_key))
4293 				goto out;
4294 
4295 			    if (device->un_hs_id != 0)
4296 				device->un_hs_id = MAKERECID(
4297 				setno, device->un_hs_id);
4298 			}
4299 		} else {
4300 			un64 = (mr_unit_t *)mddb_getrecaddr(recid);
4301 			self_id = &(un64->c.un_self_id);
4302 			parent_id = &(un64->c.un_parent);
4303 			record_id = &(un64->c.un_record_id);
4304 			hsp_id = &(un64->un_hsp_id);
4305 
4306 			for (i = 0; i < un64->un_totalcolumncnt; i++) {
4307 			    mr_column_t	*device;
4308 
4309 			    device = &un64->un_column[i];
4310 			    if (!md_update_minor(setno, mddb_getsidenum
4311 				(setno), device->un_orig_key))
4312 				goto out;
4313 
4314 			    if (device->un_hs_id != 0)
4315 				device->un_hs_id = MAKERECID(
4316 				setno, device->un_hs_id);
4317 			}
4318 		}
4319 
4320 		/*
4321 		 * Update unit with the imported setno
4322 		 */
4323 		mddb_setrecprivate(recid, MD_PRV_GOTIT);
4324 
4325 		*self_id = MD_MKMIN(setno, MD_MIN2UNIT(*self_id));
4326 
4327 		if (*hsp_id != -1)
4328 			*hsp_id = MAKERECID(setno, DBID(*hsp_id));
4329 
4330 		if (*parent_id != MD_NO_PARENT)
4331 			*parent_id = MD_MKMIN(setno, MD_MIN2UNIT(*parent_id));
4332 		*record_id = MAKERECID(setno, DBID(*record_id));
4333 		gotsomething = 1;
4334 	}
4335 
4336 out:
4337 	return (gotsomething);
4338 }
4339 
4340 static md_named_services_t raid_named_services[] = {
4341 	{raid_hotspares,			"poke hotspares"	},
4342 	{raid_rename_check,			MDRNM_CHECK		},
4343 	{raid_rename_lock,			MDRNM_LOCK		},
4344 	{(intptr_t (*)()) raid_rename_unlock,	MDRNM_UNLOCK		},
4345 	{(intptr_t (*)()) raid_probe_dev,	"probe open test"	},
4346 	{NULL,					0			}
4347 };
4348 
4349 md_ops_t raid_md_ops = {
4350 	raid_open,		/* open */
4351 	raid_close,		/* close */
4352 	md_raid_strategy,	/* strategy */
4353 	NULL,			/* print */
4354 	NULL,			/* dump */
4355 	NULL,			/* read */
4356 	NULL,			/* write */
4357 	md_raid_ioctl,		/* ioctl, */
4358 	raid_snarf,		/* raid_snarf */
4359 	raid_halt,		/* raid_halt */
4360 	NULL,			/* aread */
4361 	NULL,			/* awrite */
4362 	raid_imp_set,		/* import set */
4363 	raid_named_services
4364 };
4365 
4366 static void
4367 init_init()
4368 {
4369 	/* default to a second */
4370 	if (md_wr_wait == 0)
4371 		md_wr_wait = md_hz >> 1;
4372 
4373 	raid_parent_cache = kmem_cache_create("md_raid_parent",
4374 	    sizeof (md_raidps_t), 0, raid_parent_constructor,
4375 	    raid_parent_destructor, raid_run_queue, NULL, NULL, 0);
4376 	raid_child_cache = kmem_cache_create("md_raid_child",
4377 	    sizeof (md_raidcs_t) - sizeof (buf_t) + biosize(), 0,
4378 	    raid_child_constructor, raid_child_destructor,
4379 	    raid_run_queue, NULL, NULL, 0);
4380 	raid_cbuf_cache = kmem_cache_create("md_raid_cbufs",
4381 	    sizeof (md_raidcbuf_t), 0, raid_cbuf_constructor,
4382 	    raid_cbuf_destructor, raid_run_queue, NULL, NULL, 0);
4383 }
4384 
4385 static void
4386 fini_uninit()
4387 {
4388 	kmem_cache_destroy(raid_parent_cache);
4389 	kmem_cache_destroy(raid_child_cache);
4390 	kmem_cache_destroy(raid_cbuf_cache);
4391 	raid_parent_cache = raid_child_cache = raid_cbuf_cache = NULL;
4392 }
4393 
4394 /* define the module linkage */
4395 MD_PLUGIN_MISC_MODULE("raid module %I%", init_init(), fini_uninit())
4396