xref: /titanic_41/usr/src/uts/common/io/lvm/raid/raid.c (revision fb3fb4f3d76d55b64440afd0af72775dfad3bd1d)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 /*
29  * NAME:	raid.c
30  *
31  * DESCRIPTION: Main RAID driver source file containing open, close and I/O
32  *		operations.
33  *
34  * ROUTINES PROVIDED FOR EXTERNAL USE:
35  *  raid_open()			- open the RAID metadevice for access.
36  *  raid_internal_open()	- internal open routine of RAID metdevice.
37  *  md_raid_strategy()		- perform normal I/O operations,
38  *				    such as read and write.
39  *  raid_close()		- close the RAID metadevice.
40  *  raid_internal_close()	- internal close routine of RAID metadevice.
41  *  raid_snarf()		- initialize and clean up MDD records.
42  *  raid_halt()			- reset the RAID metadevice
43  *  raid_line()			- return the line # of this segment
44  *  raid_dcolumn()		- return the data column # of this segment
45  *  raid_pcolumn()		- return the parity column # of this segment
46  */
47 
48 #include <sys/param.h>
49 #include <sys/systm.h>
50 #include <sys/conf.h>
51 #include <sys/file.h>
52 #include <sys/user.h>
53 #include <sys/uio.h>
54 #include <sys/t_lock.h>
55 #include <sys/buf.h>
56 #include <sys/dkio.h>
57 #include <sys/vtoc.h>
58 #include <sys/kmem.h>
59 #include <vm/page.h>
60 #include <sys/cmn_err.h>
61 #include <sys/sysmacros.h>
62 #include <sys/types.h>
63 #include <sys/mkdev.h>
64 #include <sys/stat.h>
65 #include <sys/open.h>
66 #include <sys/modctl.h>
67 #include <sys/ddi.h>
68 #include <sys/sunddi.h>
69 #include <sys/debug.h>
70 #include <sys/lvm/md_raid.h>
71 #include <sys/lvm/mdvar.h>
72 #include <sys/lvm/md_convert.h>
73 
74 #include <sys/sysevent/eventdefs.h>
75 #include <sys/sysevent/svm.h>
76 
77 md_ops_t		raid_md_ops;
78 #ifndef lint
79 char			_depends_on[] = "drv/md";
80 md_ops_t		*md_interface_ops = &raid_md_ops;
81 #endif	/* lint */
82 
83 extern unit_t		md_nunits;
84 extern unit_t		md_nsets;
85 extern md_set_t		md_set[];
86 extern int		md_status;
87 extern major_t		md_major;
88 extern mdq_anchor_t	md_done_daemon;
89 extern mdq_anchor_t	md_mstr_daemon;
90 extern int		md_sleep_for_test;
91 extern clock_t		md_hz;
92 
93 extern md_event_queue_t	*md_event_queue;
94 
95 
96 int pchunks		= 16;
97 int phigh		= 1024;
98 int plow		= 128;
99 int cchunks		= 64;
100 int chigh		= 1024;
101 int clow		= 512;
102 int bchunks		= 32;
103 int bhigh		= 256;
104 int blow		= 128;
105 
106 int raid_total_io		= 0;
107 int raid_reads			= 0;
108 int raid_writes			= 0;
109 int raid_no_bpmaps		= 0;
110 int raid_512			= 0;
111 int raid_1024			= 0;
112 int raid_1024_8192		= 0;
113 int raid_8192			= 0;
114 int raid_8192_bigger		= 0;
115 int raid_line_lock_wait	= 0;
116 
117 int data_buffer_waits		= 0;
118 int parity_buffer_waits	= 0;
119 
120 /* writer line locks */
121 int raid_writer_locks		= 0; /* total writer locks */
122 int raid_write_waits		= 0; /* total writer locks that waited */
123 int raid_full_line_writes	= 0; /* total full line writes */
124 int raid_write_queue_length	= 0; /* wait queue length */
125 int raid_max_write_q_length	= 0; /* maximum queue length */
126 int raid_write_locks_active	= 0; /* writer locks at any time */
127 int raid_max_write_locks	= 0; /* maximum writer locks active */
128 
129 /* read line locks */
130 int raid_reader_locks		= 0; /* total reader locks held */
131 int raid_reader_locks_active	= 0; /* reader locks held */
132 int raid_max_reader_locks	= 0; /* maximum reader locks held in run */
133 int raid_read_overlaps		= 0; /* number of times 2 reads hit same line */
134 int raid_read_waits		= 0; /* times a reader waited on writer */
135 
136 /* prewrite stats */
137 int raid_prewrite_waits		= 0; /* number of waits for a pw slot */
138 int raid_pw			= 0; /* number of pw slots in use */
139 int raid_prewrite_max		= 0; /* maximum number of pw slots in use */
140 int raid_pw_invalidates		= 0;
141 
142 static clock_t md_wr_wait	= 0;
143 
144 int nv_available	= 0; /* presence of nv-ram support in device */
145 int nv_prewrite		= 1; /* mark prewrites with nv_available */
146 int nv_parity		= 1; /* mark parity with nv_available */
147 
148 kmem_cache_t	*raid_parent_cache = NULL;
149 kmem_cache_t	*raid_child_cache = NULL;
150 kmem_cache_t	*raid_cbuf_cache = NULL;
151 
152 int			raid_internal_open(minor_t mnum, int flag, int otyp,
153 			    int md_oflags);
154 
155 static void		freebuffers(md_raidcs_t *cs);
156 static int		raid_read(mr_unit_t *un, md_raidcs_t *cs);
157 static void		raid_read_io(mr_unit_t *un, md_raidcs_t *cs);
158 static int		raid_write(mr_unit_t *un, md_raidcs_t *cs);
159 static void		raid_write_io(mr_unit_t *un, md_raidcs_t *cs);
160 static void		raid_stage(md_raidcs_t *cs);
161 static void		raid_enqueue(md_raidcs_t *cs);
162 static diskaddr_t	raid_line(diskaddr_t segment, mr_unit_t *un);
163 uint_t			raid_dcolumn(diskaddr_t segment, mr_unit_t *un);
164 static void		getpbuffer(md_raidcs_t *cs);
165 static void		getdbuffer(md_raidcs_t *cs);
166 static void		raid_done(buf_t *bp);
167 static void		raid_io_startup(mr_unit_t *un);
168 
169 static rus_state_t
170 raid_col2unit(rcs_state_t state, rus_state_t unitstate)
171 {
172 	switch (state) {
173 	case RCS_INIT:
174 		return (RUS_INIT);
175 	case RCS_OKAY:
176 		return (RUS_OKAY);
177 	case RCS_RESYNC:
178 		if (unitstate & RUS_LAST_ERRED)
179 			return (RUS_LAST_ERRED);
180 		else
181 			return (RUS_ERRED);
182 	case RCS_ERRED:
183 		return (RUS_ERRED);
184 	case RCS_LAST_ERRED:
185 		return (RUS_ERRED);
186 	default:
187 		break;
188 	}
189 	panic("raid_col2unit");
190 	/*NOTREACHED*/
191 }
192 
193 void
194 raid_set_state(mr_unit_t *un, int col, rcs_state_t newstate, int force)
195 {
196 
197 	rus_state_t	unitstate, origstate;
198 	rcs_state_t	colstate;
199 	rcs_state_t	orig_colstate;
200 	int		errcnt = 0,
201 			okaycnt = 0,
202 			resynccnt = 0;
203 	int		i;
204 	char		*devname;
205 
206 	ASSERT(un);
207 	ASSERT(col < un->un_totalcolumncnt);
208 	ASSERT(newstate &
209 	    (RCS_INIT | RCS_INIT_ERRED | RCS_OKAY | RCS_RESYNC | RCS_ERRED |
210 	    RCS_LAST_ERRED | RCS_REGEN));
211 	ASSERT((newstate &
212 	    ~(RCS_INIT | RCS_INIT_ERRED | RCS_OKAY | RCS_RESYNC | RCS_ERRED |
213 	    RCS_LAST_ERRED | RCS_REGEN))
214 	    == 0);
215 
216 	ASSERT(MDI_UNIT(MD_SID(un)) ? UNIT_WRITER_HELD(un) : 1);
217 
218 	unitstate = un->un_state;
219 	origstate = unitstate;
220 
221 	if (force) {
222 		un->un_column[col].un_devstate = newstate;
223 		un->un_state = raid_col2unit(newstate, unitstate);
224 		uniqtime32(&un->un_column[col].un_devtimestamp);
225 		uniqtime32(&un->un_timestamp);
226 		return;
227 	}
228 
229 	ASSERT(un->un_state &
230 	    (RUS_INIT | RUS_OKAY | RUS_ERRED | RUS_DOI | RUS_LAST_ERRED |
231 	    RUS_REGEN));
232 	ASSERT((un->un_state & ~(RUS_INIT |
233 	    RUS_OKAY | RUS_ERRED | RUS_DOI | RUS_LAST_ERRED | RUS_REGEN)) == 0);
234 
235 	if (un->un_column[col].un_devstate == newstate)
236 		return;
237 
238 	if (newstate == RCS_REGEN) {
239 		if (raid_state_cnt(un, RCS_OKAY) != un->un_totalcolumncnt)
240 			return;
241 		un->un_state = RUS_REGEN;
242 		return;
243 	}
244 
245 	orig_colstate = un->un_column[col].un_devstate;
246 
247 	/*
248 	 * if there is another column in the error state then this
249 	 * column should go to the last errored state
250 	 */
251 	for (i = 0; i < un->un_totalcolumncnt; i++) {
252 		if (i == col)
253 			colstate = newstate;
254 		else
255 			colstate = un->un_column[i].un_devstate;
256 		if (colstate & (RCS_ERRED | RCS_LAST_ERRED | RCS_INIT_ERRED))
257 			errcnt++;
258 		if (colstate & RCS_OKAY)
259 			okaycnt++;
260 		if (colstate & RCS_RESYNC)
261 			resynccnt++;
262 	}
263 	ASSERT(resynccnt < 2);
264 
265 	if (okaycnt == un->un_totalcolumncnt)
266 		unitstate = RUS_OKAY;
267 	else if (errcnt > 1) {
268 		unitstate = RUS_LAST_ERRED;
269 		if (newstate & RCS_ERRED)
270 			newstate = RCS_LAST_ERRED;
271 	} else if (errcnt == 1)
272 		if (!(unitstate & RUS_LAST_ERRED))
273 			unitstate = RUS_ERRED;
274 
275 	if (un->un_state == RUS_DOI)
276 		unitstate = RUS_DOI;
277 
278 	un->un_column[col].un_devstate = newstate;
279 	uniqtime32(&un->un_column[col].un_devtimestamp);
280 	/*
281 	 * if there are last errored column being brought back online
282 	 * by open or snarf, then be sure to clear the RUS_LAST_ERRED
283 	 * bit to allow writes.  If there is a real error then the
284 	 * column will go back into last erred.
285 	 */
286 	if ((raid_state_cnt(un, RCS_LAST_ERRED) == 0) &&
287 	    (raid_state_cnt(un, RCS_ERRED) == 1))
288 		unitstate = RUS_ERRED;
289 
290 	un->un_state = unitstate;
291 	uniqtime32(&un->un_timestamp);
292 
293 	if ((! (origstate & (RUS_ERRED|RUS_LAST_ERRED|RUS_DOI))) &&
294 	    (unitstate & (RUS_ERRED|RUS_LAST_ERRED|RUS_DOI))) {
295 		devname = md_devname(MD_UN2SET(un),
296 			un->un_column[col].un_dev, NULL, 0);
297 
298 		cmn_err(CE_WARN, "md: %s: %s needs maintenance",
299 		    md_shortname(MD_SID(un)), devname);
300 
301 		if (unitstate & RUS_LAST_ERRED) {
302 			cmn_err(CE_WARN, "md: %s: %s last erred",
303 			    md_shortname(MD_SID(un)), devname);
304 
305 		} else if (un->un_column[col].un_devflags &
306 		    MD_RAID_DEV_ISOPEN) {
307 			/*
308 			 * Close the broken device and clear the open flag on
309 			 * it.  We have to check that the device is open,
310 			 * otherwise the first open on it has resulted in the
311 			 * error that is being processed and the actual un_dev
312 			 * will be NODEV64.
313 			 */
314 			md_layered_close(un->un_column[col].un_dev,
315 			    MD_OFLG_NULL);
316 			un->un_column[col].un_devflags &= ~MD_RAID_DEV_ISOPEN;
317 		}
318 	} else if (orig_colstate == RCS_LAST_ERRED && newstate == RCS_ERRED &&
319 	    un->un_column[col].un_devflags & MD_RAID_DEV_ISOPEN) {
320 		/*
321 		 * Similar to logic above except no log messages since we
322 		 * are just transitioning from Last Erred to Erred.
323 		 */
324 		md_layered_close(un->un_column[col].un_dev, MD_OFLG_NULL);
325 		un->un_column[col].un_devflags &= ~MD_RAID_DEV_ISOPEN;
326 	}
327 
328 	/*
329 	 * If a resync has completed, see if there is a Last Erred
330 	 * component that we can change to the Erred state.
331 	 */
332 	if ((orig_colstate == RCS_RESYNC) && (newstate == RCS_OKAY)) {
333 		for (i = 0; i < un->un_totalcolumncnt; i++) {
334 			if (i != col &&
335 			    (un->un_column[i].un_devstate & RCS_LAST_ERRED)) {
336 				raid_set_state(un, i, RCS_ERRED, 0);
337 				break;
338 			}
339 		}
340 	}
341 }
342 
343 /*
344  * NAME:	erred_check_line
345  *
346  * DESCRIPTION: Return the type of write to perform on an erred column based
347  *		upon any resync activity.
348  *
349  *		if a column is being resynced and the write is above the
350  *		resync point may have to write to the target being resynced.
351  *
352  *		Column state may make it impossible to do the write
353  *		in which case RCL_EIO or RCL_ENXIO is returned.
354  *
355  *		If a column cannot be written directly, RCL_ERRED is
356  *		returned and processing should proceed accordingly.
357  *
358  * PARAMETERS:	minor_t		 mnum - minor number identity of metadevice
359  *		md_raidcs_t	 *cs - child save structure
360  *		mr_column_t	 *dcolumn - pointer to data column structure
361  *		mr_column_t	 *pcolumn - pointer to parity column structure
362  *
363  * RETURNS:	RCL_OKAY, RCL_ERRED
364  *
365  * LOCKS:	Expects Line Writer Lock and Unit Resource Lock to be held
366  *		across call.
367  */
368 
369 static int
370 erred_check_line(mr_unit_t *un, md_raidcs_t *cs, mr_column_t *column)
371 {
372 
373 	ASSERT(un != NULL);
374 	ASSERT(cs->cs_flags & MD_RCS_LLOCKD);
375 
376 	if (column->un_devstate & RCS_OKAY)
377 		return (RCL_OKAY);
378 
379 	if (column->un_devstate & RCS_ERRED)
380 		return (RCL_ERRED);  /* do not read from errored disk */
381 
382 	/*
383 	 * for the last errored case their are two considerations.
384 	 * When the last errored column is the only errored column then
385 	 * do treat it like a maintenance column, not doing I/O from
386 	 * it.   When it there are other failures then just attempt
387 	 * to use it.
388 	 */
389 	if (column->un_devstate & RCS_LAST_ERRED)
390 		return (RCL_ERRED);
391 
392 	ASSERT(column->un_devstate & RCS_RESYNC);
393 
394 	/*
395 	 * When a resync from a hotspare is being done (copy resync)
396 	 * then always treat it as an OKAY column, since no regen
397 	 * is required.
398 	 */
399 	if (column->un_devflags & MD_RAID_COPY_RESYNC) {
400 		return (RCL_OKAY);
401 	}
402 
403 	mutex_enter(&un->un_mx);
404 	if (cs->cs_line < un->un_resync_line_index) {
405 		mutex_exit(&un->un_mx);
406 		return (RCL_OKAY);
407 	}
408 	mutex_exit(&un->un_mx);
409 	return (RCL_ERRED);
410 
411 }
412 
413 /*
414  * NAMES:	raid_state_cnt
415  *
416  * DESCRIPTION: counts number of column in a specific state
417  *
418  * PARAMETERS:	md_raid_t *un
419  *		rcs_state state
420  */
421 int
422 raid_state_cnt(mr_unit_t *un, rcs_state_t state)
423 {
424 	int	i, retval = 0;
425 
426 	for (i = 0; i < un->un_totalcolumncnt; i++)
427 		if (un->un_column[i].un_devstate & state)
428 			retval++;
429 	return (retval);
430 }
431 
432 /*
433  * NAMES:	raid_io_overlaps
434  *
435  * DESCRIPTION: checkst for overlap of 2 child save structures
436  *
437  * PARAMETERS:	md_raidcs_t cs1
438  *		md_raidcs_t cs2
439  *
440  * RETURNS:	0 - no overlap
441  *		1 - overlap
442  */
443 int
444 raid_io_overlaps(md_raidcs_t *cs1, md_raidcs_t *cs2)
445 {
446 	if (cs1->cs_blkno > cs2->cs_lastblk)
447 		return (0);
448 	if (cs1->cs_lastblk < cs2->cs_blkno)
449 		return (0);
450 	return (1);
451 }
452 
453 /*
454  * NAMES:	raid_parent_constructor
455  * DESCRIPTION: parent structure constructor routine
456  * PARAMETERS:
457  */
458 /*ARGSUSED1*/
459 static int
460 raid_parent_constructor(void *p, void *d1, int d2)
461 {
462 	mutex_init(&((md_raidps_t *)p)->ps_mx,
463 	    NULL, MUTEX_DEFAULT, NULL);
464 	mutex_init(&((md_raidps_t *)p)->ps_mapin_mx,
465 	    NULL, MUTEX_DEFAULT, NULL);
466 	return (0);
467 }
468 
469 void
470 raid_parent_init(md_raidps_t *ps)
471 {
472 	bzero(ps, offsetof(md_raidps_t, ps_mx));
473 	((md_raidps_t *)ps)->ps_flags = MD_RPS_INUSE;
474 	((md_raidps_t *)ps)->ps_magic = RAID_PSMAGIC;
475 }
476 
477 /*ARGSUSED1*/
478 static void
479 raid_parent_destructor(void *p, void *d)
480 {
481 	mutex_destroy(&((md_raidps_t *)p)->ps_mx);
482 	mutex_destroy(&((md_raidps_t *)p)->ps_mapin_mx);
483 }
484 
485 /*
486  * NAMES:	raid_child_constructor
487  * DESCRIPTION: child structure constructor routine
488  * PARAMETERS:
489  */
490 /*ARGSUSED1*/
491 static int
492 raid_child_constructor(void *p, void *d1, int d2)
493 {
494 	md_raidcs_t	*cs = (md_raidcs_t *)p;
495 	mutex_init(&cs->cs_mx, NULL, MUTEX_DEFAULT, NULL);
496 	bioinit(&cs->cs_dbuf);
497 	bioinit(&cs->cs_pbuf);
498 	bioinit(&cs->cs_hbuf);
499 	return (0);
500 }
501 
502 void
503 raid_child_init(md_raidcs_t *cs)
504 {
505 	bzero(cs, offsetof(md_raidcs_t, cs_mx));
506 
507 	md_bioreset(&cs->cs_dbuf);
508 	md_bioreset(&cs->cs_pbuf);
509 	md_bioreset(&cs->cs_hbuf);
510 
511 	((md_raidcs_t *)cs)->cs_dbuf.b_chain =
512 	    ((md_raidcs_t *)cs)->cs_pbuf.b_chain =
513 	    ((md_raidcs_t *)cs)->cs_hbuf.b_chain =
514 	    (struct buf *)(cs);
515 
516 	cs->cs_magic = RAID_CSMAGIC;
517 	cs->cs_line = MD_DISKADDR_ERROR;
518 	cs->cs_dpwslot = -1;
519 	cs->cs_ppwslot = -1;
520 }
521 
522 /*ARGSUSED1*/
523 static void
524 raid_child_destructor(void *p, void *d)
525 {
526 	biofini(&((md_raidcs_t *)p)->cs_dbuf);
527 	biofini(&((md_raidcs_t *)p)->cs_hbuf);
528 	biofini(&((md_raidcs_t *)p)->cs_pbuf);
529 	mutex_destroy(&((md_raidcs_t *)p)->cs_mx);
530 }
531 
532 /*ARGSUSED1*/
533 static int
534 raid_cbuf_constructor(void *p, void *d1, int d2)
535 {
536 	bioinit(&((md_raidcbuf_t *)p)->cbuf_bp);
537 	return (0);
538 }
539 
540 static void
541 raid_cbuf_init(md_raidcbuf_t *cb)
542 {
543 	bzero(cb, offsetof(md_raidcbuf_t, cbuf_bp));
544 	md_bioreset(&cb->cbuf_bp);
545 	cb->cbuf_magic = RAID_BUFMAGIC;
546 	cb->cbuf_pwslot = -1;
547 	cb->cbuf_flags = CBUF_WRITE;
548 }
549 
550 /*ARGSUSED1*/
551 static void
552 raid_cbuf_destructor(void *p, void *d)
553 {
554 	biofini(&((md_raidcbuf_t *)p)->cbuf_bp);
555 }
556 
557 /*
558  * NAMES:	raid_run_queue
559  * DESCRIPTION: spawn a backend processing daemon for RAID metadevice.
560  * PARAMETERS:
561  */
562 /*ARGSUSED*/
563 static void
564 raid_run_queue(void *d)
565 {
566 	if (!(md_status & MD_GBL_DAEMONS_LIVE))
567 		md_daemon(1, &md_done_daemon);
568 }
569 
570 /*
571  * NAME:	raid_build_pwslot
572  * DESCRIPTION: builds mr_pw_reserve for the column
573  * PARAMETERS:	un is the pointer to the unit structure
574  *		colindex is the column to create the structure for
575  */
576 int
577 raid_build_pw_reservation(mr_unit_t *un, int colindex)
578 {
579 	mr_pw_reserve_t	*pw;
580 	mr_scoreboard_t	*sb;
581 	int		i;
582 
583 	pw = (mr_pw_reserve_t *) kmem_zalloc(sizeof (mr_pw_reserve_t) +
584 	    (sizeof (mr_scoreboard_t) * un->un_pwcnt), KM_SLEEP);
585 	pw->pw_magic = RAID_PWMAGIC;
586 	pw->pw_column = colindex;
587 	pw->pw_free = un->un_pwcnt;
588 	sb = &pw->pw_sb[0];
589 	for (i = 0; i < un->un_pwcnt; i++) {
590 		sb[i].sb_column = colindex;
591 		sb[i].sb_flags = SB_UNUSED;
592 		sb[i].sb_start_blk = 0;
593 		sb[i].sb_last_blk = 0;
594 		sb[i].sb_cs = NULL;
595 	}
596 	un->un_column_ic[colindex].un_pw_reserve = pw;
597 	return (0);
598 }
599 /*
600  * NAME:	raid_free_pw_reservation
601  * DESCRIPTION: RAID metadevice pre-write slot structure destroy routine
602  * PARAMETERS:	mr_unit_t *un - pointer to a unit structure
603  *		int colindex  - index of the column whose pre-write slot struct
604  *			is to be destroyed.
605  */
606 void
607 raid_free_pw_reservation(mr_unit_t *un, int colindex)
608 {
609 	mr_pw_reserve_t	*pw = un->un_column_ic[colindex].un_pw_reserve;
610 
611 	kmem_free(pw, sizeof (mr_pw_reserve_t) +
612 	    (sizeof (mr_scoreboard_t) * un->un_pwcnt));
613 }
614 
615 /*
616  * NAME:	raid_cancel_pwslot
617  * DESCRIPTION: RAID metadevice write routine
618  * PARAMETERS:	md_raidcs_t *cs - pointer to a child structure
619  */
620 static void
621 raid_cancel_pwslot(md_raidcs_t *cs)
622 {
623 	mr_unit_t		*un = cs->cs_un;
624 	mr_pw_reserve_t		*pw;
625 	mr_scoreboard_t		*sb;
626 	mr_column_ic_t		*col;
627 	md_raidcbuf_t		*cbuf;
628 	int			broadcast = 0;
629 
630 	if (cs->cs_ps->ps_flags & MD_RPS_READ)
631 		return;
632 	if (cs->cs_dpwslot != -1) {
633 		col = &un->un_column_ic[cs->cs_dcolumn];
634 		pw = col->un_pw_reserve;
635 		sb = &pw->pw_sb[cs->cs_dpwslot];
636 		sb->sb_flags = SB_AVAIL;
637 		if ((pw->pw_free++ == 0) || (un->un_rflags & MD_RFLAG_NEEDPW))
638 			broadcast++;
639 		sb->sb_cs = NULL;
640 	}
641 
642 	if (cs->cs_ppwslot != -1) {
643 		col = &un->un_column_ic[cs->cs_pcolumn];
644 		pw = col->un_pw_reserve;
645 		sb = &pw->pw_sb[cs->cs_ppwslot];
646 		sb->sb_flags = SB_AVAIL;
647 		if ((pw->pw_free++ == 0) || (un->un_rflags & MD_RFLAG_NEEDPW))
648 			broadcast++;
649 		sb->sb_cs = NULL;
650 	}
651 
652 	for (cbuf = cs->cs_buflist; cbuf; cbuf = cbuf->cbuf_next) {
653 		if (cbuf->cbuf_pwslot == -1)
654 			continue;
655 		col = &un->un_column_ic[cbuf->cbuf_column];
656 		pw = col->un_pw_reserve;
657 		sb = &pw->pw_sb[cbuf->cbuf_pwslot];
658 		sb->sb_flags = SB_AVAIL;
659 		if ((pw->pw_free++ == 0) || (un->un_rflags & MD_RFLAG_NEEDPW))
660 			broadcast++;
661 		sb->sb_cs = NULL;
662 	}
663 	if (broadcast) {
664 		cv_broadcast(&un->un_cv);
665 		return;
666 	}
667 	mutex_enter(&un->un_mx);
668 	if (un->un_rflags & MD_RFLAG_NEEDPW)
669 		cv_broadcast(&un->un_cv);
670 	mutex_exit(&un->un_mx);
671 }
672 
673 static void
674 raid_free_pwinvalidate(md_raidcs_t *cs)
675 {
676 	md_raidcbuf_t		*cbuf;
677 	md_raidcbuf_t		*cbuf_to_free;
678 	mr_unit_t		*un = cs->cs_un;
679 	mdi_unit_t		*ui = MDI_UNIT(MD_SID(un));
680 	mr_pw_reserve_t		*pw;
681 	mr_scoreboard_t		*sb;
682 	int			broadcast = 0;
683 
684 	cbuf = cs->cs_pw_inval_list;
685 	ASSERT(cbuf);
686 	mutex_enter(&un->un_linlck_mx);
687 	while (cbuf) {
688 		pw = un->un_column_ic[cbuf->cbuf_column].un_pw_reserve;
689 		sb = &pw->pw_sb[0];
690 		ASSERT(sb[cbuf->cbuf_pwslot].sb_flags & SB_INVAL_PEND);
691 		sb[cbuf->cbuf_pwslot].sb_flags = SB_UNUSED;
692 		sb[cbuf->cbuf_pwslot].sb_cs = NULL;
693 		if ((pw->pw_free++ == 0) || (un->un_rflags & MD_RFLAG_NEEDPW))
694 			broadcast++;
695 		cbuf_to_free = cbuf;
696 		cbuf = cbuf->cbuf_next;
697 		kmem_free(cbuf_to_free->cbuf_buffer, dbtob(un->un_iosize));
698 		kmem_cache_free(raid_cbuf_cache, cbuf_to_free);
699 	}
700 	cs->cs_pw_inval_list = (md_raidcbuf_t *)NULL;
701 	/*
702 	 * now that there is a free prewrite slot, check to see if there
703 	 * are any io operations waiting first wake up the raid_io_startup
704 	 * then signal the the processes waiting in raid_write.
705 	 */
706 	if (ui->ui_io_lock->io_list_front)
707 		raid_io_startup(un);
708 	mutex_exit(&un->un_linlck_mx);
709 	if (broadcast) {
710 		cv_broadcast(&un->un_cv);
711 		return;
712 	}
713 	mutex_enter(&un->un_mx);
714 	if (un->un_rflags & MD_RFLAG_NEEDPW)
715 		cv_broadcast(&un->un_cv);
716 	mutex_exit(&un->un_mx);
717 }
718 
719 
720 static int
721 raid_get_pwslot(md_raidcs_t *cs, int column)
722 {
723 	mr_scoreboard_t	*sb;
724 	mr_pw_reserve_t	*pw;
725 	mr_unit_t	*un = cs->cs_un;
726 	diskaddr_t	start_blk = cs->cs_blkno;
727 	diskaddr_t	last_blk = cs->cs_lastblk;
728 	int		i;
729 	int		pwcnt = un->un_pwcnt;
730 	int		avail = -1;
731 	int		use = -1;
732 	int		flags;
733 
734 
735 	/* start with the data column */
736 	pw = cs->cs_un->un_column_ic[column].un_pw_reserve;
737 	sb = &pw->pw_sb[0];
738 	ASSERT(pw->pw_free > 0);
739 	for (i = 0; i < pwcnt; i++) {
740 		flags = sb[i].sb_flags;
741 		if (flags & SB_INVAL_PEND)
742 			continue;
743 
744 		if ((avail == -1) && (flags & (SB_AVAIL | SB_UNUSED)))
745 			avail = i;
746 
747 		if ((start_blk > sb[i].sb_last_blk) ||
748 		    (last_blk < sb[i].sb_start_blk))
749 			continue;
750 
751 		/* OVERLAP */
752 		ASSERT(! (sb[i].sb_flags & SB_INUSE));
753 
754 		/*
755 		 * raid_invalidate_pwslot attempts to zero out prewrite entry
756 		 * in parallel with other disk reads/writes related to current
757 		 * transaction. however cs_frags accounting for this case is
758 		 * broken because raid_write_io resets cs_frags i.e. ignoring
759 		 * that it could have been been set to > 0 value by
760 		 * raid_invalidate_pwslot. While this can be fixed an
761 		 * additional problem is that we don't seem to handle
762 		 * correctly the case of getting a disk error for prewrite
763 		 * entry invalidation.
764 		 * It does not look like we really need
765 		 * to invalidate prewrite slots because raid_replay sorts
766 		 * prewrite id's in ascending order and during recovery the
767 		 * latest prewrite entry for the same block will be replay
768 		 * last. That's why i ifdef'd out the call to
769 		 * raid_invalidate_pwslot. --aguzovsk@east
770 		 */
771 
772 		if (use == -1) {
773 			use = i;
774 		}
775 	}
776 
777 	ASSERT(avail != -1);
778 	pw->pw_free--;
779 	if (use == -1)
780 		use = avail;
781 
782 	ASSERT(! (sb[use].sb_flags & SB_INUSE));
783 	sb[use].sb_flags = SB_INUSE;
784 	sb[use].sb_cs = cs;
785 	sb[use].sb_start_blk = start_blk;
786 	sb[use].sb_last_blk = last_blk;
787 	ASSERT((use >= 0) && (use < un->un_pwcnt));
788 	return (use);
789 }
790 
791 static int
792 raid_check_pw(md_raidcs_t *cs)
793 {
794 
795 	mr_unit_t	*un = cs->cs_un;
796 	int		i;
797 
798 	ASSERT(! (cs->cs_flags & MD_RCS_HAVE_PW_SLOTS));
799 	/*
800 	 * check to be sure there is a prewrite slot available
801 	 * if not just return.
802 	 */
803 	if (cs->cs_flags & MD_RCS_LINE) {
804 		for (i = 0; i < un->un_totalcolumncnt; i++)
805 			if (un->un_column_ic[i].un_pw_reserve->pw_free <= 0)
806 				return (1);
807 		return (0);
808 	}
809 
810 	if (un->un_column_ic[cs->cs_dcolumn].un_pw_reserve->pw_free <= 0)
811 		return (1);
812 	if (un->un_column_ic[cs->cs_pcolumn].un_pw_reserve->pw_free <= 0)
813 		return (1);
814 	return (0);
815 }
816 static int
817 raid_alloc_pwslot(md_raidcs_t *cs)
818 {
819 	mr_unit_t	*un = cs->cs_un;
820 	md_raidcbuf_t	*cbuf;
821 
822 	ASSERT(! (cs->cs_flags & MD_RCS_HAVE_PW_SLOTS));
823 	if (raid_check_pw(cs))
824 		return (1);
825 
826 	mutex_enter(&un->un_mx);
827 	un->un_pwid++;
828 	cs->cs_pwid = un->un_pwid;
829 	mutex_exit(&un->un_mx);
830 
831 	cs->cs_dpwslot = raid_get_pwslot(cs, cs->cs_dcolumn);
832 	for (cbuf = cs->cs_buflist; cbuf; cbuf = cbuf->cbuf_next) {
833 		cbuf->cbuf_pwslot = raid_get_pwslot(cs, cbuf->cbuf_column);
834 	}
835 	cs->cs_ppwslot = raid_get_pwslot(cs, cs->cs_pcolumn);
836 
837 	cs->cs_flags |= MD_RCS_HAVE_PW_SLOTS;
838 
839 	return (0);
840 }
841 
842 /*
843  * NAMES:	raid_build_incore
844  * DESCRIPTION: RAID metadevice incore structure building routine
845  * PARAMETERS:	void *p - pointer to a unit structure
846  *		int snarfing - a flag to indicate snarfing is required
847  */
848 int
849 raid_build_incore(void *p, int snarfing)
850 {
851 	mr_unit_t	*un = (mr_unit_t *)p;
852 	minor_t		mnum = MD_SID(un);
853 	mddb_recid_t	hs_recid = 0;
854 	int		i;
855 	int		preserve_flags;
856 	mr_column_t	*column;
857 	int		iosize;
858 	md_dev64_t	hs, dev;
859 	int		resync_cnt = 0,
860 			error_cnt = 0;
861 
862 	hs = NODEV64;
863 	dev = NODEV64;
864 
865 	/* clear out bogus pointer incase we return(1) prior to alloc */
866 	un->mr_ic = NULL;
867 
868 	if (MD_STATUS(un) & MD_UN_BEING_RESET) {
869 		mddb_setrecprivate(un->c.un_record_id, MD_PRV_PENDCLEAN);
870 		return (1);
871 	}
872 
873 	if (MD_UNIT(mnum) != NULL)
874 		return (0);
875 
876 	if (snarfing)
877 		MD_STATUS(un) = 0;
878 
879 	un->mr_ic = (mr_unit_ic_t *)kmem_zalloc(sizeof (*un->mr_ic),
880 	    KM_SLEEP);
881 
882 	un->un_column_ic = (mr_column_ic_t *)
883 	    kmem_zalloc(sizeof (mr_column_ic_t) *
884 		un->un_totalcolumncnt, KM_SLEEP);
885 
886 	for (i = 0; i < un->un_totalcolumncnt; i++) {
887 
888 		column	= &un->un_column[i];
889 		preserve_flags = column->un_devflags &
890 		    (MD_RAID_COPY_RESYNC | MD_RAID_REGEN_RESYNC);
891 		column->un_devflags &=
892 		    ~(MD_RAID_ALT_ISOPEN | MD_RAID_DEV_ISOPEN |
893 		    MD_RAID_WRITE_ALT);
894 		if (raid_build_pw_reservation(un, i) != 0) {
895 			/* could not build pwslot */
896 			return (1);
897 		}
898 
899 		if (snarfing) {
900 			set_t		setno = MD_MIN2SET(mnum);
901 			dev =  md_getdevnum(setno, mddb_getsidenum(setno),
902 			    column->un_orig_key, MD_NOTRUST_DEVT);
903 			/*
904 			 * Comment out instead of remove so we have history
905 			 * In the pre-SVM releases stored devt is used so
906 			 * as long as there is one snarf is always happy
907 			 * even the component is powered off.  This is not
908 			 * the case in current SVM implementation.  NODEV64
909 			 * can be returned and in this case since we resolve
910 			 * the devt at 'open' time (first use of metadevice)
911 			 * we will allow snarf continue.
912 			 *
913 			 * if (dev == NODEV64)
914 			 *	return (1);
915 			 */
916 
917 			/*
918 			 * Setup un_orig_dev from device id info if the device
919 			 * is valid (not NODEV64).
920 			 */
921 			if (dev != NODEV64)
922 				column->un_orig_dev = dev;
923 
924 			if (column->un_devstate & RCS_RESYNC)
925 				resync_cnt++;
926 			if (column->un_devstate & (RCS_ERRED | RCS_LAST_ERRED))
927 				error_cnt++;
928 
929 			if (HOTSPARED(un, i)) {
930 				(void) md_hot_spare_ifc(HS_MKDEV,
931 				    0, 0, 0, &column->un_hs_id, NULL,
932 				    &hs, NULL);
933 				/*
934 				 * Same here
935 				 *
936 				 * if (hs == NODEV64)
937 				 *	return (1);
938 				 */
939 			}
940 
941 			if (HOTSPARED(un, i)) {
942 				if (column->un_devstate &
943 				    (RCS_OKAY | RCS_LAST_ERRED)) {
944 					column->un_dev = hs;
945 					column->un_pwstart =
946 					    column->un_hs_pwstart;
947 					column->un_devstart =
948 					    column->un_hs_devstart;
949 					preserve_flags &=
950 					    ~(MD_RAID_COPY_RESYNC |
951 					    MD_RAID_REGEN_RESYNC);
952 				} else  if (column->un_devstate & RCS_RESYNC) {
953 					/*
954 					 * if previous system was 4.0 set
955 					 * the direction flags
956 					 */
957 					if ((preserve_flags &
958 					    (MD_RAID_COPY_RESYNC |
959 					    MD_RAID_REGEN_RESYNC)) == 0) {
960 					if (column->un_alt_dev != NODEV64)
961 						preserve_flags |=
962 						MD_RAID_COPY_RESYNC;
963 					else
964 					    preserve_flags |=
965 						MD_RAID_REGEN_RESYNC;
966 					}
967 				}
968 			} else { /* no hot spares */
969 				column->un_dev = dev;
970 				column->un_pwstart = column->un_orig_pwstart;
971 				column->un_devstart = column->un_orig_devstart;
972 				if (column->un_devstate & RCS_RESYNC) {
973 					preserve_flags |= MD_RAID_REGEN_RESYNC;
974 					preserve_flags &= ~MD_RAID_COPY_RESYNC;
975 				}
976 			}
977 			if (! (column->un_devstate & RCS_RESYNC)) {
978 				preserve_flags &=
979 				    ~(MD_RAID_REGEN_RESYNC |
980 				    MD_RAID_COPY_RESYNC);
981 			}
982 
983 			column->un_devflags = preserve_flags;
984 			column->un_alt_dev = NODEV64;
985 			column->un_alt_pwstart = 0;
986 			column->un_alt_devstart = 0;
987 			un->un_resync_line_index = 0;
988 			un->un_resync_index = 0;
989 			un->un_percent_done = 0;
990 		}
991 	}
992 
993 	if (resync_cnt && error_cnt) {
994 		for (i = 0; i < un->un_totalcolumncnt; i++) {
995 			column  = &un->un_column[i];
996 			if (HOTSPARED(un, i) &&
997 			    (column->un_devstate & RCS_RESYNC) &&
998 			    (column->un_devflags & MD_RAID_COPY_RESYNC))
999 				/* hotspare has data */
1000 				continue;
1001 
1002 			if (HOTSPARED(un, i) &&
1003 			    (column->un_devstate & RCS_RESYNC)) {
1004 				/* hotspare does not have data */
1005 				raid_hs_release(HS_FREE, un, &hs_recid, i);
1006 				column->un_dev = column->un_orig_dev;
1007 				column->un_pwstart = column->un_orig_pwstart;
1008 				column->un_devstart = column->un_orig_devstart;
1009 				mddb_setrecprivate(hs_recid, MD_PRV_PENDCOM);
1010 			}
1011 
1012 			if (column->un_devstate & RCS_ERRED)
1013 				column->un_devstate = RCS_LAST_ERRED;
1014 
1015 			if (column->un_devstate & RCS_RESYNC)
1016 				column->un_devstate = RCS_ERRED;
1017 		}
1018 	}
1019 	mddb_setrecprivate(un->c.un_record_id, MD_PRV_PENDCOM);
1020 
1021 	un->un_pwid = 1; /* or some other possible value */
1022 	un->un_magic = RAID_UNMAGIC;
1023 	iosize = un->un_iosize;
1024 	un->un_pbuffer = kmem_alloc(dbtob(iosize), KM_SLEEP);
1025 	un->un_dbuffer = kmem_alloc(dbtob(iosize), KM_SLEEP);
1026 	mutex_init(&un->un_linlck_mx, NULL, MUTEX_DEFAULT, NULL);
1027 	cv_init(&un->un_linlck_cv, NULL, CV_DEFAULT, NULL);
1028 	un->un_linlck_chn = NULL;
1029 	MD_UNIT(mnum) = un;
1030 
1031 
1032 	return (0);
1033 }
1034 
1035 /*
1036  * NAMES:	reset_raid
1037  * DESCRIPTION: RAID metadevice reset routine
1038  * PARAMETERS:	mr_unit_t *un - pointer to a unit structure
1039  *		minor_t mnum - RAID metadevice minor number
1040  *		int removing - a flag to imply removing device name from
1041  *			MDDB database.
1042  */
1043 void
1044 reset_raid(mr_unit_t *un, minor_t mnum, int removing)
1045 {
1046 	int		i, n = 0;
1047 	sv_dev_t	*sv;
1048 	mr_column_t	*column;
1049 	int		column_cnt = un->un_totalcolumncnt;
1050 	mddb_recid_t	*recids, vtoc_id;
1051 	int		hserr;
1052 
1053 	ASSERT((MDI_UNIT(mnum)->ui_io_lock->io_list_front == NULL) &&
1054 	    (MDI_UNIT(mnum)->ui_io_lock->io_list_back == NULL));
1055 
1056 	md_destroy_unit_incore(mnum, &raid_md_ops);
1057 
1058 	MD_UNIT(mnum) = NULL;
1059 
1060 	if (un->un_pbuffer) {
1061 		kmem_free(un->un_pbuffer, dbtob(un->un_iosize));
1062 		un->un_pbuffer = NULL;
1063 	}
1064 	if (un->un_dbuffer) {
1065 		kmem_free(un->un_dbuffer, dbtob(un->un_iosize));
1066 		un->un_dbuffer = NULL;
1067 	}
1068 
1069 	/* free all pre-write slots created during build incore */
1070 	for (i = 0; i < un->un_totalcolumncnt; i++)
1071 		raid_free_pw_reservation(un, i);
1072 
1073 	kmem_free(un->un_column_ic, sizeof (mr_column_ic_t) *
1074 		un->un_totalcolumncnt);
1075 
1076 	kmem_free(un->mr_ic, sizeof (*un->mr_ic));
1077 
1078 	if (!removing)
1079 		return;
1080 
1081 	sv = (sv_dev_t *)kmem_zalloc((column_cnt + 1) * sizeof (sv_dev_t),
1082 	    KM_SLEEP);
1083 
1084 	recids = (mddb_recid_t *)
1085 	    kmem_zalloc((column_cnt + 2) * sizeof (mddb_recid_t), KM_SLEEP);
1086 
1087 	for (i = 0; i < column_cnt; i++) {
1088 		md_unit_t	*comp_un;
1089 		md_dev64_t	comp_dev;
1090 
1091 		column = &un->un_column[i];
1092 		sv[i].setno = MD_MIN2SET(mnum);
1093 		sv[i].key = column->un_orig_key;
1094 		if (HOTSPARED(un, i)) {
1095 			if (column->un_devstate & (RCS_ERRED | RCS_LAST_ERRED))
1096 				hserr = HS_BAD;
1097 			else
1098 				hserr = HS_FREE;
1099 			raid_hs_release(hserr, un, &recids[n++], i);
1100 		}
1101 		/*
1102 		 * deparent any metadevices.
1103 		 * NOTE: currently soft partitions are the only metadevices
1104 		 * allowed in RAID metadevices.
1105 		 */
1106 		comp_dev = column->un_dev;
1107 		if (md_getmajor(comp_dev) == md_major) {
1108 			comp_un = MD_UNIT(md_getminor(comp_dev));
1109 			recids[n++] = MD_RECID(comp_un);
1110 			md_reset_parent(comp_dev);
1111 		}
1112 	}
1113 	/* decrement the reference count of the old hsp */
1114 	if (un->un_hsp_id != -1)
1115 		(void) md_hot_spare_ifc(HSP_DECREF, un->un_hsp_id, 0, 0,
1116 		    &recids[n++], NULL, NULL, NULL);
1117 	recids[n] = 0;
1118 	MD_STATUS(un) |= MD_UN_BEING_RESET;
1119 	vtoc_id = un->c.un_vtoc_id;
1120 
1121 	raid_commit(un, recids);
1122 
1123 
1124 	/* Remove the unit structure */
1125 	mddb_deleterec_wrapper(un->c.un_record_id);
1126 
1127 	/* Remove the vtoc, if present */
1128 	if (vtoc_id)
1129 		mddb_deleterec_wrapper(vtoc_id);
1130 	md_rem_names(sv, column_cnt);
1131 	kmem_free(sv, (column_cnt + 1) * sizeof (sv_dev_t));
1132 	kmem_free(recids, (column_cnt + 2) * sizeof (mddb_recid_t));
1133 
1134 	SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_DELETE, SVM_TAG_METADEVICE,
1135 	    MD_MIN2SET(mnum), mnum);
1136 }
1137 
1138 /*
1139  * NAMES:	raid_error_parent
1140  * DESCRIPTION: mark a parent structure in error
1141  * PARAMETERS:	md_raidcs_t *cs - pointer to child structure
1142  *		int	error - error value to set
1143  * NOTE:	(TBR) - this routine currently is not in use.
1144  */
1145 static void
1146 raid_error_parent(md_raidps_t *ps, int error)
1147 {
1148 	mutex_enter(&ps->ps_mx);
1149 	ps->ps_flags |= MD_RPS_ERROR;
1150 	ps->ps_error = error;
1151 	mutex_exit(&ps->ps_mx);
1152 }
1153 
1154 /*
1155  * The following defines tell raid_free_parent
1156  *	RFP_RLS_LOCK		release the unit reader lock when done.
1157  *	RFP_DECR_PWFRAGS	decrement ps_pwfrags
1158  *	RFP_DECR_FRAGS		decrement ps_frags
1159  *	RFP_DECR_READFRAGS	read keeps FRAGS and PWFRAGS in lockstep
1160  */
1161 #define	RFP_RLS_LOCK		0x00001
1162 #define	RFP_DECR_PWFRAGS	0x00002
1163 #define	RFP_DECR_FRAGS		0x00004
1164 #define	RFP_DECR_READFRAGS	(RFP_DECR_PWFRAGS | RFP_DECR_FRAGS)
1165 
1166 /*
1167  * NAMES:	raid_free_parent
1168  * DESCRIPTION: free a parent structure
1169  * PARAMETERS:	md_raidcs_t *cs - pointer to child structure
1170  *		int	todo - indicates what needs to be done
1171  */
1172 static void
1173 raid_free_parent(md_raidps_t *ps, int todo)
1174 {
1175 	mdi_unit_t	*ui = ps->ps_ui;
1176 
1177 	ASSERT(ps->ps_magic == RAID_PSMAGIC);
1178 	ASSERT(ps->ps_flags & MD_RPS_INUSE);
1179 	mutex_enter(&ps->ps_mx);
1180 	if (todo & RFP_DECR_PWFRAGS) {
1181 		ASSERT(ps->ps_pwfrags);
1182 		ps->ps_pwfrags--;
1183 		if (ps->ps_pwfrags == 0 && (! (ps->ps_flags & MD_RPS_IODONE))) {
1184 			if (ps->ps_flags & MD_RPS_ERROR) {
1185 				ps->ps_bp->b_flags |= B_ERROR;
1186 				ps->ps_bp->b_error = ps->ps_error;
1187 			}
1188 			md_kstat_done(ui, ps->ps_bp, 0);
1189 			biodone(ps->ps_bp);
1190 			ps->ps_flags |= MD_RPS_IODONE;
1191 		}
1192 	}
1193 
1194 	if (todo & RFP_DECR_FRAGS) {
1195 		ASSERT(ps->ps_frags);
1196 		ps->ps_frags--;
1197 	}
1198 
1199 	if (ps->ps_frags != 0) {
1200 		mutex_exit(&ps->ps_mx);
1201 		return;
1202 	}
1203 
1204 	ASSERT((ps->ps_frags == 0) && (ps->ps_pwfrags == 0));
1205 	mutex_exit(&ps->ps_mx);
1206 
1207 	if (todo & RFP_RLS_LOCK)
1208 		md_io_readerexit(ui);
1209 
1210 	if (panicstr) {
1211 		ps->ps_flags |= MD_RPS_DONE;
1212 		return;
1213 	}
1214 
1215 	if (ps->ps_flags & MD_RPS_HSREQ)
1216 		(void) raid_hotspares();
1217 
1218 	ASSERT(todo & RFP_RLS_LOCK);
1219 	ps->ps_flags &= ~MD_RPS_INUSE;
1220 
1221 	md_dec_iocount(MD_MIN2SET(ps->ps_un->c.un_self_id));
1222 
1223 	kmem_cache_free(raid_parent_cache, ps);
1224 }
1225 
1226 /*
1227  * NAMES:	raid_free_child
1228  * DESCRIPTION: free a parent structure
1229  * PARAMETERS:	md_raidcs_t *cs - pointer to child structure
1230  *		int drop_locks	- 0 for no locks held
1231  * NOTE:	(TBR) - this routine currently is not in use.
1232  */
1233 static void
1234 raid_free_child(md_raidcs_t *cs, int drop_locks)
1235 {
1236 	mr_unit_t	*un = cs->cs_un;
1237 	md_raidcbuf_t	*cbuf, *cbuf1;
1238 
1239 	if (cs->cs_pw_inval_list)
1240 		raid_free_pwinvalidate(cs);
1241 
1242 	if (drop_locks) {
1243 		ASSERT(cs->cs_flags & MD_RCS_LLOCKD &&
1244 		    (cs->cs_flags & (MD_RCS_READER | MD_RCS_WRITER)));
1245 		md_unit_readerexit(MDI_UNIT(MD_SID(un)));
1246 		raid_line_exit(cs);
1247 	} else {
1248 		ASSERT(!(cs->cs_flags & MD_RCS_LLOCKD));
1249 	}
1250 
1251 	freebuffers(cs);
1252 	cbuf = cs->cs_buflist;
1253 	while (cbuf) {
1254 		cbuf1 = cbuf->cbuf_next;
1255 		kmem_cache_free(raid_cbuf_cache, cbuf);
1256 		cbuf = cbuf1;
1257 	}
1258 	if (cs->cs_dbuf.b_flags & B_REMAPPED)
1259 		bp_mapout(&cs->cs_dbuf);
1260 	kmem_cache_free(raid_child_cache, cs);
1261 }
1262 
1263 /*
1264  * NAME:	raid_regen_parity
1265  *
1266  * DESCRIPTION:	This routine is used to regenerate the parity blocks
1267  *		for the entire raid device.  It is called from
1268  *		both the regen thread and the IO path.
1269  *
1270  *		On error the entire device is marked as in error by
1271  *		placing the erroring device in error and all other
1272  *		devices in last_errored.
1273  *
1274  * PARAMETERS:	md_raidcs_t	*cs
1275  */
1276 void
1277 raid_regen_parity(md_raidcs_t *cs)
1278 {
1279 	mr_unit_t	*un = cs->cs_un;
1280 	mdi_unit_t	*ui = MDI_UNIT(un->c.un_self_id);
1281 	caddr_t		buffer;
1282 	caddr_t		parity_buffer;
1283 	buf_t		*bp;
1284 	uint_t		*dbuf, *pbuf;
1285 	uint_t		colcnt = un->un_totalcolumncnt;
1286 	int		column;
1287 	int		parity_column = cs->cs_pcolumn;
1288 	size_t		bcount;
1289 	int		j;
1290 
1291 	/*
1292 	 * This routine uses the data and parity buffers allocated to a
1293 	 * write.  In the case of a read the buffers are allocated and
1294 	 * freed at the end.
1295 	 */
1296 
1297 	ASSERT(IO_READER_HELD(un));
1298 	ASSERT(cs->cs_flags & MD_RCS_LLOCKD);
1299 	ASSERT(UNIT_READER_HELD(un));
1300 
1301 	if (raid_state_cnt(un, RCS_OKAY) != colcnt)
1302 		return;
1303 
1304 	if (cs->cs_flags & MD_RCS_READER) {
1305 		getpbuffer(cs);
1306 		getdbuffer(cs);
1307 	}
1308 	ASSERT(cs->cs_dbuffer && cs->cs_pbuffer);
1309 	bcount = cs->cs_bcount;
1310 	buffer = cs->cs_dbuffer;
1311 	parity_buffer = cs->cs_pbuffer;
1312 	bzero(parity_buffer, bcount);
1313 	bp = &cs->cs_dbuf;
1314 	for (column = 0; column < colcnt; column++) {
1315 		if (column == parity_column)
1316 			continue;
1317 		reset_buf(bp, B_READ | B_BUSY, bcount);
1318 		bp->b_un.b_addr = buffer;
1319 		bp->b_edev = md_dev64_to_dev(un->un_column[column].un_dev);
1320 		bp->b_lblkno = cs->cs_blkno + un->un_column[column].un_devstart;
1321 		bp->b_bcount = bcount;
1322 		bp->b_bufsize = bcount;
1323 		(void) md_call_strategy(bp, MD_STR_NOTTOP, NULL);
1324 		if (biowait(bp))
1325 			goto bail;
1326 		pbuf = (uint_t *)(void *)parity_buffer;
1327 		dbuf = (uint_t *)(void *)buffer;
1328 		for (j = 0; j < (bcount / (sizeof (uint_t))); j++) {
1329 			*pbuf = *pbuf ^ *dbuf;
1330 			pbuf++;
1331 			dbuf++;
1332 		}
1333 	}
1334 
1335 	reset_buf(bp, B_WRITE | B_BUSY, cs->cs_bcount);
1336 	bp->b_un.b_addr = parity_buffer;
1337 	bp->b_edev = md_dev64_to_dev(un->un_column[parity_column].un_dev);
1338 	bp->b_lblkno = cs->cs_blkno + un->un_column[parity_column].un_devstart;
1339 	bp->b_bcount = bcount;
1340 	bp->b_bufsize = bcount;
1341 	(void) md_call_strategy(bp, MD_STR_NOTTOP, NULL);
1342 	if (biowait(bp))
1343 		goto bail;
1344 
1345 	if (cs->cs_flags & MD_RCS_READER) {
1346 		freebuffers(cs);
1347 		cs->cs_pbuffer = NULL;
1348 		cs->cs_dbuffer = NULL;
1349 	}
1350 	bp->b_chain = (struct buf *)cs;
1351 	return;
1352 bail:
1353 	if (cs->cs_flags & MD_RCS_READER) {
1354 		freebuffers(cs);
1355 		cs->cs_pbuffer = NULL;
1356 		cs->cs_dbuffer = NULL;
1357 	}
1358 	md_unit_readerexit(ui);
1359 	un = md_unit_writerlock(ui);
1360 	raid_set_state(un, column, RCS_ERRED, 0);
1361 	for (column = 0; column < colcnt; column++)
1362 		raid_set_state(un, column, RCS_ERRED, 0);
1363 	raid_commit(un, NULL);
1364 	md_unit_writerexit(ui);
1365 	un = md_unit_readerlock(ui);
1366 	bp->b_chain = (struct buf *)cs;
1367 }
1368 
1369 /*
1370  * NAMES:	raid_error_state
1371  * DESCRIPTION: check unit and column states' impact on I/O error
1372  *		NOTE:	the state now may not be the state when the
1373  *			I/O completed due to race conditions.
1374  * PARAMETERS:	mr_unit_t *un - pointer to raid unit structure
1375  *		md_raidcs_t *cs - pointer to child structure
1376  *		buf_t	  *bp - pointer to buffer structure
1377  */
1378 static int
1379 raid_error_state(mr_unit_t *un, buf_t *bp)
1380 {
1381 	int		column;
1382 	int		i;
1383 
1384 	ASSERT(IO_READER_HELD(un));
1385 	ASSERT(UNIT_WRITER_HELD(un));
1386 
1387 	column = -1;
1388 	for (i = 0; i < un->un_totalcolumncnt; i++) {
1389 		if (un->un_column[i].un_dev == md_expldev(bp->b_edev)) {
1390 			column = i;
1391 			break;
1392 		}
1393 		if (un->un_column[i].un_alt_dev == md_expldev(bp->b_edev)) {
1394 			column = i;
1395 			break;
1396 		}
1397 	}
1398 
1399 	/* in case a replace snuck in while waiting on unit writer lock */
1400 
1401 	if (column == -1) {
1402 		return (0);
1403 	}
1404 
1405 	(void) raid_set_state(un, column, RCS_ERRED, 0);
1406 	ASSERT(un->un_state & (RUS_ERRED | RUS_LAST_ERRED));
1407 
1408 	raid_commit(un, NULL);
1409 	if (un->un_state & RUS_ERRED) {
1410 		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED, SVM_TAG_METADEVICE,
1411 		    MD_UN2SET(un), MD_SID(un));
1412 	} else if (un->un_state & RUS_LAST_ERRED) {
1413 		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_LASTERRED, SVM_TAG_METADEVICE,
1414 		    MD_UN2SET(un), MD_SID(un));
1415 	}
1416 
1417 	return (EIO);
1418 }
1419 
1420 /*
1421  * NAME:	raid_mapin_buf
1422  * DESCRIPTION:	wait for the input buffer header to be maped in
1423  * PARAMETERS:	md_raidps_t *ps
1424  */
1425 static void
1426 raid_mapin_buf(md_raidcs_t *cs)
1427 {
1428 	md_raidps_t	*ps = cs->cs_ps;
1429 
1430 	/*
1431 	 * check to see if the buffer is maped.  If all is ok return the
1432 	 * offset of the data and return.  Since it is expensive to grab
1433 	 * a mutex this is only done if the mapin is not complete.
1434 	 * Once the mutex is aquired it is possible that the mapin was
1435 	 * not done so recheck and if necessary do the mapin.
1436 	 */
1437 	if (ps->ps_mapin > 0) {
1438 		cs->cs_addr = ps->ps_addr + cs->cs_offset;
1439 		return;
1440 	}
1441 	mutex_enter(&ps->ps_mapin_mx);
1442 	if (ps->ps_mapin > 0) {
1443 		cs->cs_addr = ps->ps_addr + cs->cs_offset;
1444 		mutex_exit(&ps->ps_mapin_mx);
1445 		return;
1446 	}
1447 	bp_mapin(ps->ps_bp);
1448 	/*
1449 	 * get the new b_addr out of the parent since bp_mapin just changed it
1450 	 */
1451 	ps->ps_addr = ps->ps_bp->b_un.b_addr;
1452 	cs->cs_addr = ps->ps_addr + cs->cs_offset;
1453 	ps->ps_mapin++;
1454 	mutex_exit(&ps->ps_mapin_mx);
1455 }
1456 
1457 /*
1458  * NAMES:	raid_read_no_retry
1459  * DESCRIPTION: I/O retry routine for a RAID metadevice read
1460  *		read failed attempting to regenerate the data,
1461  *		no retry possible, error occured in raid_raidregenloop().
1462  * PARAMETERS:	mr_unit_t   *un - pointer to raid unit structure
1463  *		md_raidcs_t *cs - pointer to child structure
1464  */
1465 /*ARGSUSED*/
1466 static void
1467 raid_read_no_retry(mr_unit_t *un, md_raidcs_t *cs)
1468 {
1469 	md_raidps_t	*ps = cs->cs_ps;
1470 
1471 	raid_error_parent(ps, EIO);
1472 	raid_free_child(cs, 1);
1473 
1474 	/* decrement readfrags */
1475 	raid_free_parent(ps, RFP_DECR_READFRAGS | RFP_RLS_LOCK);
1476 }
1477 
1478 /*
1479  * NAMES:	raid_read_retry
1480  * DESCRIPTION: I/O retry routine for a RAID metadevice read
1481  * PARAMETERS:	md_raidcs_t *cs - pointer to child structure
1482  */
1483 static void
1484 raid_read_retry(mr_unit_t *un, md_raidcs_t *cs)
1485 {
1486 	/* re-initialize the buf_t structure for raid_read() */
1487 	cs->cs_dbuf.b_chain = (struct buf *)cs;
1488 	cs->cs_dbuf.b_back = &cs->cs_dbuf;
1489 	cs->cs_dbuf.b_forw = &cs->cs_dbuf;
1490 	cs->cs_dbuf.b_flags = B_BUSY;	/* initialize flags */
1491 	cs->cs_dbuf.b_error = 0;	/* initialize error */
1492 	cs->cs_dbuf.b_offset = -1;
1493 	/* Initialize semaphores */
1494 	sema_init(&cs->cs_dbuf.b_io, 0, NULL,
1495 	    SEMA_DEFAULT, NULL);
1496 	sema_init(&cs->cs_dbuf.b_sem, 0, NULL,
1497 	    SEMA_DEFAULT, NULL);
1498 
1499 	cs->cs_pbuf.b_chain = (struct buf *)cs;
1500 	cs->cs_pbuf.b_back = &cs->cs_pbuf;
1501 	cs->cs_pbuf.b_forw = &cs->cs_pbuf;
1502 	cs->cs_pbuf.b_flags = B_BUSY;	/* initialize flags */
1503 	cs->cs_pbuf.b_error = 0;	/* initialize error */
1504 	cs->cs_pbuf.b_offset = -1;
1505 	sema_init(&cs->cs_pbuf.b_io, 0, NULL,
1506 	    SEMA_DEFAULT, NULL);
1507 	sema_init(&cs->cs_pbuf.b_sem, 0, NULL,
1508 	    SEMA_DEFAULT, NULL);
1509 
1510 	cs->cs_flags &= ~MD_RCS_ERROR;	/* reset child error flag */
1511 	cs->cs_flags |= MD_RCS_RECOVERY;  /* set RECOVERY flag */
1512 
1513 	/*
1514 	 * re-scheduling I/O with raid_read_io() is simpler. basically,
1515 	 * raid_read_io() is invoked again with same child structure.
1516 	 * (NOTE: we aren`t supposed to do any error recovery when an I/O
1517 	 * error occured in raid_raidregenloop().
1518 	 */
1519 	raid_mapin_buf(cs);
1520 	raid_read_io(un, cs);
1521 }
1522 
1523 /*
1524  * NAMES:	raid_rderr
1525  * DESCRIPTION: I/O error handling routine for a RAID metadevice read
1526  * PARAMETERS:	md_raidcs_t *cs - pointer to child structure
1527  * LOCKS:	must obtain unit writer lock while calling raid_error_state
1528  *		since a unit or column state transition may take place.
1529  *		must obtain unit reader lock to retry I/O.
1530  */
1531 /*ARGSUSED*/
1532 static void
1533 raid_rderr(md_raidcs_t *cs)
1534 {
1535 	md_raidps_t	*ps;
1536 	mdi_unit_t	*ui;
1537 	mr_unit_t	*un;
1538 	int		error = 0;
1539 
1540 	ps = cs->cs_ps;
1541 	ui = ps->ps_ui;
1542 	un = (mr_unit_t *)md_unit_writerlock(ui);
1543 	ASSERT(un != 0);
1544 
1545 	if (cs->cs_dbuf.b_flags & B_ERROR)
1546 		error = raid_error_state(un, &cs->cs_dbuf);
1547 	if (cs->cs_pbuf.b_flags & B_ERROR)
1548 		error |= raid_error_state(un, &cs->cs_pbuf);
1549 
1550 	md_unit_writerexit(ui);
1551 
1552 	ps->ps_flags |= MD_RPS_HSREQ;
1553 
1554 	un = (mr_unit_t *)md_unit_readerlock(ui);
1555 	ASSERT(un != 0);
1556 	/* now attempt the appropriate retry routine */
1557 	(*(cs->cs_retry_call))(un, cs);
1558 }
1559 
1560 
1561 /*
1562  * NAMES:	raid_read_error
1563  * DESCRIPTION: I/O error handling routine for a RAID metadevice read
1564  * PARAMETERS:	md_raidcs_t *cs - pointer to child structure
1565  */
1566 /*ARGSUSED*/
1567 static void
1568 raid_read_error(md_raidcs_t *cs)
1569 {
1570 	md_raidps_t	*ps;
1571 	mdi_unit_t	*ui;
1572 	mr_unit_t	*un;
1573 	set_t		setno;
1574 
1575 	ps = cs->cs_ps;
1576 	ui = ps->ps_ui;
1577 	un = cs->cs_un;
1578 
1579 	setno = MD_UN2SET(un);
1580 
1581 	if ((cs->cs_dbuf.b_flags & B_ERROR) &&
1582 	    (COLUMN_STATE(un, cs->cs_dcolumn) != RCS_ERRED) &&
1583 	    (COLUMN_STATE(un, cs->cs_dcolumn) != RCS_LAST_ERRED))
1584 		cmn_err(CE_WARN, "md %s: read error on %s",
1585 		    md_shortname(MD_SID(un)),
1586 		    md_devname(setno, md_expldev(cs->cs_dbuf.b_edev), NULL, 0));
1587 
1588 	if ((cs->cs_pbuf.b_flags & B_ERROR) &&
1589 	    (COLUMN_STATE(un, cs->cs_pcolumn) != RCS_ERRED) &&
1590 	    (COLUMN_STATE(un, cs->cs_pcolumn) != RCS_LAST_ERRED))
1591 		cmn_err(CE_WARN, "md %s: read error on %s",
1592 		    md_shortname(MD_SID(un)),
1593 		    md_devname(setno, md_expldev(cs->cs_pbuf.b_edev), NULL, 0));
1594 
1595 	md_unit_readerexit(ui);
1596 
1597 	ASSERT(cs->cs_frags == 0);
1598 
1599 	/* now schedule processing for possible state change */
1600 	daemon_request(&md_mstr_daemon, raid_rderr,
1601 		(daemon_queue_t *)cs, REQ_OLD);
1602 
1603 }
1604 
1605 /*
1606  * NAMES:	getdbuffer
1607  * DESCRIPTION: data buffer allocation for a child structure
1608  * PARAMETERS:	md_raidcs_t *cs - pointer to child structure
1609  *
1610  * NOTE: always get dbuffer before pbuffer
1611  *	 and get both buffers before pwslot
1612  *	 otherwise a deadlock could be introduced.
1613  */
1614 static void
1615 getdbuffer(md_raidcs_t *cs)
1616 {
1617 	mr_unit_t	*un;
1618 
1619 	cs->cs_dbuffer = kmem_alloc(cs->cs_bcount + DEV_BSIZE, KM_NOSLEEP);
1620 	if (cs->cs_dbuffer != NULL)
1621 		return;
1622 	un = cs->cs_ps->ps_un;
1623 	mutex_enter(&un->un_mx);
1624 	while (un->un_dbuffer == NULL) {
1625 		STAT_INC(data_buffer_waits);
1626 		un->un_rflags |= MD_RFLAG_NEEDBUF;
1627 		cv_wait(&un->un_cv, &un->un_mx);
1628 	}
1629 	cs->cs_dbuffer = un->un_dbuffer;
1630 	cs->cs_flags |= MD_RCS_UNDBUF;
1631 	un->un_dbuffer = NULL;
1632 	mutex_exit(&un->un_mx);
1633 }
1634 
1635 /*
1636  * NAMES:	getpbuffer
1637  * DESCRIPTION: parity buffer allocation for a child structure
1638  * PARAMETERS:	md_raidcs_t *cs - pointer to child structure
1639  *
1640  * NOTE: always get dbuffer before pbuffer
1641  *	 and get both buffers before pwslot
1642  *	 otherwise a deadlock could be introduced.
1643  */
1644 static void
1645 getpbuffer(md_raidcs_t *cs)
1646 {
1647 	mr_unit_t *un;
1648 
1649 	cs->cs_pbuffer = kmem_alloc(cs->cs_bcount + DEV_BSIZE, KM_NOSLEEP);
1650 	if (cs->cs_pbuffer != NULL)
1651 		return;
1652 	un = cs->cs_ps->ps_un;
1653 	mutex_enter(&un->un_mx);
1654 	while (un->un_pbuffer == NULL) {
1655 		STAT_INC(parity_buffer_waits);
1656 		un->un_rflags |= MD_RFLAG_NEEDBUF;
1657 		cv_wait(&un->un_cv, &un->un_mx);
1658 	}
1659 	cs->cs_pbuffer = un->un_pbuffer;
1660 	cs->cs_flags |= MD_RCS_UNPBUF;
1661 	un->un_pbuffer = NULL;
1662 	mutex_exit(&un->un_mx);
1663 }
1664 static void
1665 getresources(md_raidcs_t *cs)
1666 {
1667 	md_raidcbuf_t	*cbuf;
1668 	/*
1669 	 * NOTE: always get dbuffer before pbuffer
1670 	 *	 and get both buffers before pwslot
1671 	 *	 otherwise a deadlock could be introduced.
1672 	 */
1673 	getdbuffer(cs);
1674 	getpbuffer(cs);
1675 	for (cbuf = cs->cs_buflist; cbuf; cbuf = cbuf->cbuf_next)
1676 		cbuf->cbuf_buffer =
1677 		    kmem_alloc(cs->cs_bcount + DEV_BSIZE, KM_SLEEP);
1678 }
1679 /*
1680  * NAMES:	freebuffers
1681  * DESCRIPTION: child structure buffer freeing routine
1682  * PARAMETERS:	md_raidcs_t *cs - pointer to child structure
1683  */
1684 static void
1685 freebuffers(md_raidcs_t *cs)
1686 {
1687 	mr_unit_t	*un;
1688 	md_raidcbuf_t	*cbuf;
1689 
1690 	/* free buffers used for full line write */
1691 	for (cbuf = cs->cs_buflist; cbuf; cbuf = cbuf->cbuf_next) {
1692 		if (cbuf->cbuf_buffer == NULL)
1693 			continue;
1694 		kmem_free(cbuf->cbuf_buffer, cbuf->cbuf_bcount + DEV_BSIZE);
1695 		cbuf->cbuf_buffer = NULL;
1696 		cbuf->cbuf_bcount = 0;
1697 	}
1698 
1699 	if (cs->cs_flags & (MD_RCS_UNDBUF | MD_RCS_UNPBUF)) {
1700 		un = cs->cs_un;
1701 		mutex_enter(&un->un_mx);
1702 	}
1703 	if (cs->cs_dbuffer) {
1704 		if (cs->cs_flags & MD_RCS_UNDBUF)
1705 			un->un_dbuffer = cs->cs_dbuffer;
1706 		else
1707 			kmem_free(cs->cs_dbuffer, cs->cs_bcount + DEV_BSIZE);
1708 	}
1709 	if (cs->cs_pbuffer) {
1710 		if (cs->cs_flags & MD_RCS_UNPBUF)
1711 			un->un_pbuffer = cs->cs_pbuffer;
1712 		else
1713 			kmem_free(cs->cs_pbuffer, cs->cs_bcount + DEV_BSIZE);
1714 	}
1715 	if (cs->cs_flags & (MD_RCS_UNDBUF | MD_RCS_UNPBUF)) {
1716 		un->un_rflags &= ~MD_RFLAG_NEEDBUF;
1717 		cv_broadcast(&un->un_cv);
1718 		mutex_exit(&un->un_mx);
1719 	}
1720 }
1721 
1722 /*
1723  * NAMES:	raid_line_reader_lock, raid_line_writer_lock
1724  * DESCRIPTION: RAID metadevice line reader and writer lock routines
1725  *		data column # and parity column #.
1726  * PARAMETERS:	md_raidcs_t *cs - pointer to child structure
1727  */
1728 
1729 void
1730 raid_line_reader_lock(md_raidcs_t *cs, int resync_thread)
1731 {
1732 	mr_unit_t	*un;
1733 	md_raidcs_t	*cs1;
1734 
1735 	ASSERT(cs->cs_line != MD_DISKADDR_ERROR);
1736 	un = cs->cs_un;
1737 	cs->cs_flags |= MD_RCS_READER;
1738 	STAT_CHECK(raid_line_lock_wait, MUTEX_HELD(&un->un_linlck_mx));
1739 	if (!panicstr)
1740 		mutex_enter(&un->un_linlck_mx);
1741 	cs1 = un->un_linlck_chn;
1742 	while (cs1 != NULL) {
1743 		for (cs1 = un->un_linlck_chn; cs1; cs1 = cs1->cs_linlck_next)
1744 			if (raid_io_overlaps(cs, cs1) == 1)
1745 				if (cs1->cs_flags & MD_RCS_WRITER)
1746 					break;
1747 
1748 		if (cs1 != NULL) {
1749 			if (panicstr)
1750 				panic("md; raid line write lock held");
1751 			un->un_linlck_flg = 1;
1752 			cv_wait(&un->un_linlck_cv, &un->un_linlck_mx);
1753 			STAT_INC(raid_read_waits);
1754 		}
1755 	}
1756 	STAT_MAX(raid_max_reader_locks, raid_reader_locks_active);
1757 	STAT_INC(raid_reader_locks);
1758 	cs1 = un->un_linlck_chn;
1759 	if (cs1 != NULL)
1760 		cs1->cs_linlck_prev = cs;
1761 	cs->cs_linlck_next = cs1;
1762 	cs->cs_linlck_prev = NULL;
1763 	un->un_linlck_chn = cs;
1764 	cs->cs_flags |= MD_RCS_LLOCKD;
1765 	if (resync_thread) {
1766 		diskaddr_t lastblk = cs->cs_blkno + cs->cs_blkcnt - 1;
1767 		diskaddr_t line = (lastblk + 1) / un->un_segsize;
1768 		ASSERT(raid_state_cnt(un, RCS_RESYNC));
1769 		mutex_enter(&un->un_mx);
1770 		un->un_resync_line_index = line;
1771 		mutex_exit(&un->un_mx);
1772 	}
1773 	if (!panicstr)
1774 		mutex_exit(&un->un_linlck_mx);
1775 }
1776 
1777 int
1778 raid_line_writer_lock(md_raidcs_t *cs, int lock)
1779 {
1780 	mr_unit_t	*un;
1781 	md_raidcs_t	*cs1;
1782 
1783 	ASSERT(cs->cs_line != MD_DISKADDR_ERROR);
1784 	cs->cs_flags |= MD_RCS_WRITER;
1785 	un = cs->cs_ps->ps_un;
1786 
1787 	STAT_CHECK(raid_line_lock_wait, MUTEX_HELD(&un->un_linlck_mx));
1788 	if (lock && !panicstr)
1789 		mutex_enter(&un->un_linlck_mx);
1790 	ASSERT(MUTEX_HELD(&un->un_linlck_mx));
1791 
1792 	cs1 = un->un_linlck_chn;
1793 	for (cs1 = un->un_linlck_chn; cs1; cs1 = cs1->cs_linlck_next)
1794 		if (raid_io_overlaps(cs, cs1))
1795 			break;
1796 
1797 	if (cs1 != NULL) {
1798 		if (panicstr)
1799 			panic("md: line writer lock inaccessible");
1800 		goto no_lock_exit;
1801 	}
1802 
1803 	if (raid_alloc_pwslot(cs)) {
1804 		if (panicstr)
1805 			panic("md: no prewrite slots");
1806 		STAT_INC(raid_prewrite_waits);
1807 		goto no_lock_exit;
1808 	}
1809 
1810 	cs1 = un->un_linlck_chn;
1811 	if (cs1 != NULL)
1812 		cs1->cs_linlck_prev = cs;
1813 	cs->cs_linlck_next = cs1;
1814 	cs->cs_linlck_prev = NULL;
1815 	un->un_linlck_chn = cs;
1816 	cs->cs_flags |= MD_RCS_LLOCKD;
1817 	cs->cs_flags &= ~MD_RCS_WAITING;
1818 	STAT_INC(raid_writer_locks);
1819 	STAT_MAX(raid_max_write_locks, raid_write_locks_active);
1820 	if (lock && !panicstr)
1821 		mutex_exit(&un->un_linlck_mx);
1822 	return (0);
1823 
1824 no_lock_exit:
1825 	/* if this is already queued then do not requeue it */
1826 	ASSERT(! (cs->cs_flags & MD_RCS_LLOCKD));
1827 	if (!lock || (cs->cs_flags & MD_RCS_WAITING))
1828 		return (1);
1829 	cs->cs_flags |= MD_RCS_WAITING;
1830 	cs->cs_un = un;
1831 	raid_enqueue(cs);
1832 	if (lock && !panicstr)
1833 		mutex_exit(&un->un_linlck_mx);
1834 	return (1);
1835 }
1836 
1837 static void
1838 raid_startio(md_raidcs_t *cs)
1839 {
1840 	mdi_unit_t	*ui = cs->cs_ps->ps_ui;
1841 	mr_unit_t	*un = cs->cs_un;
1842 
1843 	un = md_unit_readerlock(ui);
1844 	raid_write_io(un, cs);
1845 }
1846 
1847 void
1848 raid_io_startup(mr_unit_t *un)
1849 {
1850 	md_raidcs_t	*waiting_list, *cs1;
1851 	md_raidcs_t	*previous = NULL, *next = NULL;
1852 	mdi_unit_t	*ui =  MDI_UNIT(un->c.un_self_id);
1853 	kmutex_t	*io_list_mutex = &ui->ui_io_lock->io_list_mutex;
1854 
1855 	ASSERT(MUTEX_HELD(&un->un_linlck_mx));
1856 	mutex_enter(io_list_mutex);
1857 
1858 	/*
1859 	 * check to be sure there are no reader locks outstanding.  If
1860 	 * there are not then pass on the writer lock.
1861 	 */
1862 	waiting_list = ui->ui_io_lock->io_list_front;
1863 	while (waiting_list) {
1864 		ASSERT(waiting_list->cs_flags & MD_RCS_WAITING);
1865 		ASSERT(! (waiting_list->cs_flags & MD_RCS_LLOCKD));
1866 		for (cs1 = un->un_linlck_chn; cs1; cs1 = cs1->cs_linlck_next)
1867 			if (raid_io_overlaps(waiting_list, cs1) == 1)
1868 				break;
1869 		/*
1870 		 * there was an IOs that overlaps this io so go onto
1871 		 * the next io in the waiting list
1872 		 */
1873 		if (cs1) {
1874 			previous = waiting_list;
1875 			waiting_list = waiting_list->cs_linlck_next;
1876 			continue;
1877 		}
1878 
1879 		/*
1880 		 * There are no IOs that overlap this, so remove it from
1881 		 * the waiting queue, and start it
1882 		 */
1883 
1884 		if (raid_check_pw(waiting_list)) {
1885 			ASSERT(waiting_list->cs_flags & MD_RCS_WAITING);
1886 			previous = waiting_list;
1887 			waiting_list = waiting_list->cs_linlck_next;
1888 			continue;
1889 		}
1890 		ASSERT(waiting_list->cs_flags & MD_RCS_WAITING);
1891 
1892 		next = waiting_list->cs_linlck_next;
1893 		if (previous)
1894 			previous->cs_linlck_next = next;
1895 		else
1896 			ui->ui_io_lock->io_list_front = next;
1897 
1898 		if (ui->ui_io_lock->io_list_front == NULL)
1899 			ui->ui_io_lock->io_list_back = NULL;
1900 
1901 		if (ui->ui_io_lock->io_list_back == waiting_list)
1902 			ui->ui_io_lock->io_list_back = previous;
1903 
1904 		waiting_list->cs_linlck_next = NULL;
1905 		waiting_list->cs_flags &= ~MD_RCS_WAITING;
1906 		STAT_DEC(raid_write_queue_length);
1907 		if (raid_line_writer_lock(waiting_list, 0))
1908 			panic("region locking corrupted");
1909 
1910 		ASSERT(waiting_list->cs_flags & MD_RCS_LLOCKD);
1911 		daemon_request(&md_mstr_daemon, raid_startio,
1912 		    (daemon_queue_t *)waiting_list, REQ_OLD);
1913 		waiting_list = next;
1914 
1915 	}
1916 	mutex_exit(io_list_mutex);
1917 }
1918 
1919 void
1920 raid_line_exit(md_raidcs_t *cs)
1921 {
1922 	mr_unit_t	*un;
1923 
1924 	un = cs->cs_ps->ps_un;
1925 	STAT_CHECK(raid_line_lock_wait, MUTEX_HELD(&un->un_linlck_mx));
1926 	mutex_enter(&un->un_linlck_mx);
1927 	if (cs->cs_flags & MD_RCS_READER)
1928 		STAT_DEC(raid_reader_locks_active);
1929 	else
1930 		STAT_DEC(raid_write_locks_active);
1931 
1932 	if (cs->cs_linlck_prev)
1933 		cs->cs_linlck_prev->cs_linlck_next = cs->cs_linlck_next;
1934 	else
1935 		un->un_linlck_chn = cs->cs_linlck_next;
1936 	if (cs->cs_linlck_next)
1937 		cs->cs_linlck_next->cs_linlck_prev = cs->cs_linlck_prev;
1938 
1939 	cs->cs_flags &= ~MD_RCS_LLOCKD;
1940 
1941 	if (un->un_linlck_flg)
1942 		cv_broadcast(&un->un_linlck_cv);
1943 
1944 	un->un_linlck_flg = 0;
1945 	cs->cs_line = MD_DISKADDR_ERROR;
1946 
1947 	raid_cancel_pwslot(cs);
1948 	/*
1949 	 * now that the lock is droped go ahead and see if there are any
1950 	 * other writes that can be started up
1951 	 */
1952 	raid_io_startup(un);
1953 
1954 	mutex_exit(&un->un_linlck_mx);
1955 }
1956 
1957 /*
1958  * NAMES:	raid_line, raid_pcolumn, raid_dcolumn
1959  * DESCRIPTION: RAID metadevice APIs for mapping segment # to line #,
1960  *		data column # and parity column #.
1961  * PARAMETERS:	int segment - segment number
1962  *		mr_unit_t *un - pointer to an unit structure
1963  * RETURNS:	raid_line returns line #
1964  *		raid_dcolumn returns data column #
1965  *		raid_pcolumn returns parity column #
1966  */
1967 static diskaddr_t
1968 raid_line(diskaddr_t segment, mr_unit_t *un)
1969 {
1970 	diskaddr_t	adj_seg;
1971 	diskaddr_t	line;
1972 	diskaddr_t	max_orig_segment;
1973 
1974 	max_orig_segment = (un->un_origcolumncnt - 1) * un->un_segsincolumn;
1975 	if (segment >= max_orig_segment) {
1976 		adj_seg = segment - max_orig_segment;
1977 		line = adj_seg % un->un_segsincolumn;
1978 	} else {
1979 		line = segment / (un->un_origcolumncnt - 1);
1980 	}
1981 	return (line);
1982 }
1983 
1984 uint_t
1985 raid_dcolumn(diskaddr_t segment, mr_unit_t *un)
1986 {
1987 	diskaddr_t	adj_seg;
1988 	diskaddr_t	line;
1989 	diskaddr_t	max_orig_segment;
1990 	uint_t		column;
1991 
1992 	max_orig_segment = (un->un_origcolumncnt - 1) * un->un_segsincolumn;
1993 	if (segment >= max_orig_segment) {
1994 		adj_seg = segment - max_orig_segment;
1995 		column = un->un_origcolumncnt  +
1996 			(uint_t)(adj_seg / un->un_segsincolumn);
1997 	} else {
1998 		line = segment / (un->un_origcolumncnt - 1);
1999 		column = (uint_t)((segment % (un->un_origcolumncnt - 1) + line)
2000 		    % un->un_origcolumncnt);
2001 	}
2002 	return (column);
2003 }
2004 
2005 uint_t
2006 raid_pcolumn(diskaddr_t segment, mr_unit_t *un)
2007 {
2008 	diskaddr_t	adj_seg;
2009 	diskaddr_t	line;
2010 	diskaddr_t	max_orig_segment;
2011 	uint_t		column;
2012 
2013 	max_orig_segment = (un->un_origcolumncnt - 1) * un->un_segsincolumn;
2014 	if (segment >= max_orig_segment) {
2015 		adj_seg = segment - max_orig_segment;
2016 		line = adj_seg % un->un_segsincolumn;
2017 	} else {
2018 		line = segment / (un->un_origcolumncnt - 1);
2019 	}
2020 	column = (uint_t)((line + (un->un_origcolumncnt - 1))
2021 				% un->un_origcolumncnt);
2022 	return (column);
2023 }
2024 
2025 
2026 /*
2027  * Is called in raid_iosetup to probe each column to insure
2028  * that all the columns are in 'okay' state and meet the
2029  * 'full line' requirement.  If any column is in error,
2030  * we don't want to enable the 'full line' flag.  Previously,
2031  * we would do so and disable it only when a error is
2032  * detected after the first 'full line' io which is too late
2033  * and leads to the potential data corruption.
2034  */
2035 static int
2036 raid_check_cols(mr_unit_t *un)
2037 {
2038 	buf_t		bp;
2039 	char		*buf;
2040 	mr_column_t	*colptr;
2041 	minor_t		mnum = MD_SID(un);
2042 	int		i;
2043 	int		err = 0;
2044 
2045 	buf = kmem_zalloc((uint_t)DEV_BSIZE, KM_SLEEP);
2046 
2047 	for (i = 0; i < un->un_totalcolumncnt; i++) {
2048 		md_dev64_t tmpdev;
2049 
2050 		colptr = &un->un_column[i];
2051 
2052 		tmpdev = colptr->un_dev;
2053 		/*
2054 		 * Open by device id
2055 		 * If this device is hotspared
2056 		 * use the hotspare key
2057 		 */
2058 		tmpdev = md_resolve_bydevid(mnum, tmpdev, HOTSPARED(un, i) ?
2059 			colptr->un_hs_key : colptr->un_orig_key);
2060 
2061 		if (tmpdev == NODEV64) {
2062 			err = 1;
2063 			break;
2064 		}
2065 
2066 		colptr->un_dev = tmpdev;
2067 
2068 		bzero((caddr_t)&bp, sizeof (buf_t));
2069 		bp.b_back = &bp;
2070 		bp.b_forw = &bp;
2071 		bp.b_flags = (B_READ | B_BUSY);
2072 		sema_init(&bp.b_io, 0, NULL,
2073 		    SEMA_DEFAULT, NULL);
2074 		sema_init(&bp.b_sem, 0, NULL,
2075 		    SEMA_DEFAULT, NULL);
2076 		bp.b_edev = md_dev64_to_dev(colptr->un_dev);
2077 		bp.b_lblkno = colptr->un_pwstart;
2078 		bp.b_bcount = DEV_BSIZE;
2079 		bp.b_bufsize = DEV_BSIZE;
2080 		bp.b_un.b_addr = (caddr_t)buf;
2081 		(void) md_call_strategy(&bp, 0, NULL);
2082 		if (biowait(&bp)) {
2083 			err = 1;
2084 			break;
2085 		}
2086 	}
2087 
2088 	kmem_free(buf, DEV_BSIZE);
2089 	return (err);
2090 }
2091 
2092 /*
2093  * NAME:	raid_iosetup
2094  * DESCRIPTION: RAID metadevice specific I/O set up routine which does
2095  *		all the necessary calculations to determine the location
2096  *		of the segement for the I/O.
2097  * PARAMETERS:	mr_unit_t *un - unit number of RAID metadevice
2098  *		diskaddr_t	blkno - block number of the I/O attempt
2099  *		size_t		blkcnt - block count for this I/O
2100  *		md_raidcs_t *cs - child structure for each segmented I/O
2101  *
2102  * NOTE:	The following is an example of a raid disk layer out:
2103  *
2104  *		Total Column = 5
2105  *		Original Column = 4
2106  *		Segment Per Column = 10
2107  *
2108  *			Col#0	Col#1	Col#2	Col#3	Col#4	Col#5	Col#6
2109  *		-------------------------------------------------------------
2110  *		line#0	Seg#0	Seg#1	Seg#2	Parity	Seg#30	Seg#40
2111  *		line#1	Parity	Seg#3	Seg#4	Seg#5	Seg#31
2112  *		line#2	Seg#8	Parity	Seg#6	Seg#7	Seg#32
2113  *		line#3	Seg#10	Seg#11	Parity	Seg#9	Seg#33
2114  *		line#4	Seg#12	Seg#13	Seg#14	Parity	Seg#34
2115  *		line#5	Parity	Seg#15	Seg#16	Seg#17	Seg#35
2116  *		line#6	Seg#20	Parity	Seg#18	Seg#19	Seg#36
2117  *		line#7	Seg#22	Seg#23	Parity	Seg#21	Seg#37
2118  *		line#8	Seg#24	Seg#25	Seg#26	Parity	Seg#38
2119  *		line#9	Parity	Seg#27	Seg#28	Seg#29	Seg#39
2120  */
2121 static size_t
2122 raid_iosetup(
2123 	mr_unit_t	*un,
2124 	diskaddr_t	blkno,
2125 	size_t		blkcnt,
2126 	md_raidcs_t	*cs
2127 )
2128 {
2129 	diskaddr_t	segment;
2130 	diskaddr_t	segstart;
2131 	diskaddr_t	segoff;
2132 	size_t		leftover;
2133 	diskaddr_t	line;
2134 	uint_t		iosize;
2135 	uint_t		colcnt;
2136 
2137 	/* caculate the segment# and offset for the block */
2138 	segment = blkno / un->un_segsize;
2139 	segstart = segment * un->un_segsize;
2140 	segoff = blkno - segstart;
2141 	iosize = un->un_iosize - 1;
2142 	colcnt = un->un_totalcolumncnt - 1;
2143 	line = raid_line(segment, un);
2144 	cs->cs_dcolumn = raid_dcolumn(segment, un);
2145 	cs->cs_pcolumn = raid_pcolumn(segment, un);
2146 	cs->cs_dflags = un->un_column[cs->cs_dcolumn].un_devflags;
2147 	cs->cs_pflags = un->un_column[cs->cs_pcolumn].un_devflags;
2148 	cs->cs_line = line;
2149 
2150 	if ((cs->cs_ps->ps_flags & MD_RPS_WRITE) &&
2151 	    (UNIT_STATE(un) & RCS_OKAY) &&
2152 	    (segoff == 0) &&
2153 	    (un->un_totalcolumncnt == un->un_origcolumncnt) &&
2154 	    (un->un_segsize < un->un_iosize) &&
2155 	    (un->un_iosize <= un->un_maxio) &&
2156 	    (blkno == line * un->un_segsize * colcnt) &&
2157 	    (blkcnt >= ((un->un_totalcolumncnt -1) * un->un_segsize)) &&
2158 	    (raid_state_cnt(un, RCS_OKAY) == un->un_origcolumncnt) &&
2159 	    (raid_check_cols(un) == 0)) {
2160 
2161 		md_raidcbuf_t	**cbufp;
2162 		md_raidcbuf_t	*cbuf;
2163 		int		i, j;
2164 
2165 		STAT_INC(raid_full_line_writes);
2166 		leftover = blkcnt - (un->un_segsize * colcnt);
2167 		ASSERT(blkcnt >= (un->un_segsize * colcnt));
2168 		cs->cs_blkno = line * un->un_segsize;
2169 		cs->cs_blkcnt = un->un_segsize;
2170 		cs->cs_lastblk = cs->cs_blkno + cs->cs_blkcnt - 1;
2171 		cs->cs_bcount = dbtob(cs->cs_blkcnt);
2172 		cs->cs_flags |= MD_RCS_LINE;
2173 
2174 		cbufp = &cs->cs_buflist;
2175 		for (i = 0; i < un->un_totalcolumncnt; i++) {
2176 			j = cs->cs_dcolumn + i;
2177 			j = j % un->un_totalcolumncnt;
2178 
2179 			if ((j == cs->cs_dcolumn) || (j == cs->cs_pcolumn))
2180 				continue;
2181 			cbuf = kmem_cache_alloc(raid_cbuf_cache,
2182 			    MD_ALLOCFLAGS);
2183 			raid_cbuf_init(cbuf);
2184 			cbuf->cbuf_un = cs->cs_un;
2185 			cbuf->cbuf_ps = cs->cs_ps;
2186 			cbuf->cbuf_column = j;
2187 			cbuf->cbuf_bcount = dbtob(un->un_segsize);
2188 			*cbufp = cbuf;
2189 			cbufp = &cbuf->cbuf_next;
2190 		}
2191 		return (leftover);
2192 	}
2193 
2194 	leftover = blkcnt - (un->un_segsize - segoff);
2195 	if (blkcnt > (un->un_segsize - segoff))
2196 		blkcnt -= leftover;
2197 	else
2198 		leftover = 0;
2199 
2200 	if (blkcnt > (size_t)iosize) {
2201 		leftover += (blkcnt - iosize);
2202 		blkcnt = iosize;
2203 	}
2204 
2205 	/* calculate the line# and column# for the segment */
2206 	cs->cs_flags &= ~MD_RCS_LINE;
2207 	cs->cs_blkno = line * un->un_segsize + segoff;
2208 	cs->cs_blkcnt = (uint_t)blkcnt;
2209 	cs->cs_lastblk = cs->cs_blkno + cs->cs_blkcnt - 1;
2210 	cs->cs_bcount = dbtob((uint_t)blkcnt);
2211 	return (leftover);
2212 }
2213 
2214 /*
2215  * NAME:	raid_done
2216  * DESCRIPTION: RAID metadevice I/O done interrupt routine
2217  * PARAMETERS:	struct buf *bp - pointer to a buffer structure
2218  */
2219 static void
2220 raid_done(struct buf *bp)
2221 {
2222 	md_raidcs_t	*cs;
2223 	int		flags, frags;
2224 
2225 	sema_v(&bp->b_io);
2226 	cs = (md_raidcs_t *)bp->b_chain;
2227 
2228 	ASSERT(cs != NULL);
2229 
2230 	mutex_enter(&cs->cs_mx);
2231 	if (bp->b_flags & B_ERROR) {
2232 		cs->cs_flags |= MD_RCS_ERROR;
2233 		cs->cs_flags &= ~(MD_RCS_ISCALL);
2234 	}
2235 
2236 	flags = cs->cs_flags;
2237 	frags = --cs->cs_frags;
2238 	mutex_exit(&cs->cs_mx);
2239 	if (frags != 0) {
2240 		return;
2241 	}
2242 
2243 	if (flags & MD_RCS_ERROR) {
2244 		if (cs->cs_error_call) {
2245 			daemon_request(&md_done_daemon, cs->cs_error_call,
2246 				(daemon_queue_t *)cs, REQ_OLD);
2247 		}
2248 		return;
2249 	}
2250 
2251 	if (flags & MD_RCS_ISCALL) {
2252 		cs->cs_flags &= ~(MD_RCS_ISCALL);
2253 		(*(cs->cs_call))(cs);
2254 		return;
2255 	}
2256 	daemon_request(&md_done_daemon, cs->cs_call,
2257 					(daemon_queue_t *)cs, REQ_OLD);
2258 }
2259 /*
2260  * the flag RIO_EXTRA is used when dealing with a column in the process
2261  * of being resynced. During the resync, writes may have to take place
2262  * on both the original component and a hotspare component.
2263  */
2264 #define	RIO_DATA	0x00100		/* use data buffer & data column */
2265 #define	RIO_PARITY	0x00200		/* use parity buffer & parity column */
2266 #define	RIO_WRITE	0x00400		/* issue a write */
2267 #define	RIO_READ	0x00800		/* issue a read */
2268 #define	RIO_PWIO	0x01000		/* do the I/O to the prewrite entry */
2269 #define	RIO_ALT		0x02000		/* do write to alternate device */
2270 #define	RIO_EXTRA	0x04000		/* use extra buffer */
2271 
2272 #define	RIO_COLMASK	0x000ff
2273 
2274 #define	RIO_PREWRITE	RIO_WRITE | RIO_PWIO
2275 
2276 /*
2277  * NAME:	raidio
2278  * DESCRIPTION: RAID metadevice write routine
2279  * PARAMETERS:	md_raidcs_t *cs - pointer to a child structure
2280  */
2281 static void
2282 raidio(md_raidcs_t *cs, int flags)
2283 {
2284 	buf_t		*bp;
2285 	int		column;
2286 	int		flag;
2287 	void		*private;
2288 	mr_unit_t	*un;
2289 	int		iosize;
2290 	diskaddr_t	pwstart;
2291 	diskaddr_t	devstart;
2292 	md_dev64_t	dev;
2293 
2294 	un = cs->cs_un;
2295 
2296 	ASSERT(IO_READER_HELD(un));
2297 	ASSERT(UNIT_READER_HELD(un));
2298 
2299 	if (flags & RIO_DATA) {
2300 		if (flags & RIO_EXTRA)
2301 			bp = &cs->cs_hbuf;
2302 		else
2303 			bp = &cs->cs_dbuf;
2304 		bp->b_un.b_addr = cs->cs_dbuffer;
2305 		column = cs->cs_dcolumn;
2306 	} else {
2307 		if (flags & RIO_EXTRA)
2308 			bp = &cs->cs_hbuf;
2309 		else
2310 			bp = &cs->cs_pbuf;
2311 		bp->b_un.b_addr = cs->cs_pbuffer;
2312 		column = cs->cs_pcolumn;
2313 	}
2314 	if (flags & RIO_COLMASK)
2315 		column = (flags & RIO_COLMASK) - 1;
2316 
2317 	bp->b_bcount = cs->cs_bcount;
2318 	bp->b_bufsize = cs->cs_bcount;
2319 	iosize = un->un_iosize;
2320 
2321 	/* check if the hotspared device will be used */
2322 	if (flags & RIO_ALT && (flags & RIO_WRITE)) {
2323 		pwstart = un->un_column[column].un_alt_pwstart;
2324 		devstart = un->un_column[column].un_alt_devstart;
2325 		dev = un->un_column[column].un_alt_dev;
2326 	} else {
2327 		pwstart = un->un_column[column].un_pwstart;
2328 		devstart = un->un_column[column].un_devstart;
2329 		dev = un->un_column[column].un_dev;
2330 	}
2331 
2332 	/* if not writing to log skip log header */
2333 	if ((flags & RIO_PWIO) == 0) {
2334 		bp->b_lblkno = devstart + cs->cs_blkno;
2335 		bp->b_un.b_addr += DEV_BSIZE;
2336 	} else {
2337 		bp->b_bcount += DEV_BSIZE;
2338 		bp->b_bufsize = bp->b_bcount;
2339 		if (flags & RIO_DATA) {
2340 			bp->b_lblkno = cs->cs_dpwslot * iosize + pwstart;
2341 		} else { /* not DATA -> PARITY */
2342 			bp->b_lblkno = cs->cs_ppwslot * iosize + pwstart;
2343 		}
2344 	}
2345 
2346 	bp->b_flags &= ~(B_READ | B_WRITE | B_ERROR | nv_available);
2347 	bp->b_flags |= B_BUSY;
2348 	if (flags & RIO_READ) {
2349 		bp->b_flags |= B_READ;
2350 	} else {
2351 		bp->b_flags |= B_WRITE;
2352 		if ((nv_available && nv_parity && (flags & RIO_PARITY)) ||
2353 		    (nv_available && nv_prewrite && (flags & RIO_PWIO)))
2354 			bp->b_flags |= nv_available;
2355 	}
2356 	bp->b_iodone = (int (*)())raid_done;
2357 	bp->b_edev = md_dev64_to_dev(dev);
2358 
2359 	ASSERT((bp->b_edev != 0) && (bp->b_edev != NODEV));
2360 
2361 	private = cs->cs_strategy_private;
2362 	flag = cs->cs_strategy_flag;
2363 
2364 	md_call_strategy(bp, flag, private);
2365 }
2366 
2367 /*
2368  * NAME:	genstandardparity
2369  * DESCRIPTION: This routine
2370  * PARAMETERS:	md_raidcs_t *cs - pointer to a child structure
2371  */
2372 static void
2373 genstandardparity(md_raidcs_t *cs)
2374 {
2375 	uint_t		*dbuf, *pbuf;
2376 	size_t		wordcnt;
2377 	uint_t		dsum = 0;
2378 	uint_t		psum = 0;
2379 
2380 	ASSERT((cs->cs_bcount & 0x3) == 0);
2381 
2382 	wordcnt = cs->cs_bcount / sizeof (uint_t);
2383 
2384 	dbuf = (uint_t *)(void *)(cs->cs_dbuffer + DEV_BSIZE);
2385 	pbuf = (uint_t *)(void *)(cs->cs_pbuffer + DEV_BSIZE);
2386 
2387 	/* Word aligned */
2388 	if (((uintptr_t)cs->cs_addr & 0x3) == 0) {
2389 		uint_t	*uwbuf = (uint_t *)(void *)(cs->cs_addr);
2390 		uint_t	uval;
2391 
2392 		while (wordcnt--) {
2393 			uval = *uwbuf++;
2394 			psum ^= (*pbuf = ((*pbuf ^ *dbuf) ^ uval));
2395 			++pbuf;
2396 			*dbuf = uval;
2397 			dsum ^= uval;
2398 			++dbuf;
2399 		}
2400 	} else {
2401 		uchar_t	*ubbuf = (uchar_t *)(cs->cs_addr);
2402 		union {
2403 			uint_t	wb;
2404 			uchar_t	bb[4];
2405 		} cb;
2406 
2407 		while (wordcnt--) {
2408 			cb.bb[0] = *ubbuf++;
2409 			cb.bb[1] = *ubbuf++;
2410 			cb.bb[2] = *ubbuf++;
2411 			cb.bb[3] = *ubbuf++;
2412 			psum ^= (*pbuf = ((*pbuf ^ *dbuf) ^ cb.wb));
2413 			++pbuf;
2414 			*dbuf = cb.wb;
2415 			dsum ^= cb.wb;
2416 			++dbuf;
2417 		}
2418 	}
2419 
2420 	RAID_FILLIN_RPW(cs->cs_dbuffer, cs->cs_un, dsum, cs->cs_pcolumn,
2421 			cs->cs_blkno, cs->cs_blkcnt, cs->cs_pwid,
2422 			2, cs->cs_dcolumn, RAID_PWMAGIC);
2423 
2424 	RAID_FILLIN_RPW(cs->cs_pbuffer, cs->cs_un, psum, cs->cs_dcolumn,
2425 			cs->cs_blkno, cs->cs_blkcnt, cs->cs_pwid,
2426 			2, cs->cs_pcolumn, RAID_PWMAGIC);
2427 }
2428 
2429 static void
2430 genlineparity(md_raidcs_t *cs)
2431 {
2432 
2433 	mr_unit_t	*un = cs->cs_un;
2434 	md_raidcbuf_t	*cbuf;
2435 	uint_t		*pbuf, *dbuf;
2436 	uint_t		*uwbuf;
2437 	uchar_t		*ubbuf;
2438 	size_t		wordcnt;
2439 	uint_t		psum = 0, dsum = 0;
2440 	size_t		count = un->un_segsize * DEV_BSIZE;
2441 	uint_t		col;
2442 	buf_t		*bp;
2443 
2444 	ASSERT((cs->cs_bcount & 0x3) == 0);
2445 
2446 	pbuf = (uint_t *)(void *)(cs->cs_pbuffer + DEV_BSIZE);
2447 	dbuf = (uint_t *)(void *)(cs->cs_dbuffer + DEV_BSIZE);
2448 	uwbuf = (uint_t *)(void *)(cs->cs_addr);
2449 	ubbuf = (uchar_t *)(void *)(cs->cs_addr);
2450 
2451 	wordcnt = count / sizeof (uint_t);
2452 
2453 	/* Word aligned */
2454 	if (((uintptr_t)cs->cs_addr & 0x3) == 0) {
2455 		uint_t	 uval;
2456 
2457 		while (wordcnt--) {
2458 			uval = *uwbuf++;
2459 			*dbuf = uval;
2460 			*pbuf = uval;
2461 			dsum ^= uval;
2462 			++pbuf;
2463 			++dbuf;
2464 		}
2465 	} else {
2466 		union {
2467 			uint_t	wb;
2468 			uchar_t	bb[4];
2469 		} cb;
2470 
2471 		while (wordcnt--) {
2472 			cb.bb[0] = *ubbuf++;
2473 			cb.bb[1] = *ubbuf++;
2474 			cb.bb[2] = *ubbuf++;
2475 			cb.bb[3] = *ubbuf++;
2476 			*dbuf = cb.wb;
2477 			*pbuf = cb.wb;
2478 			dsum ^= cb.wb;
2479 			++pbuf;
2480 			++dbuf;
2481 		}
2482 	}
2483 
2484 	RAID_FILLIN_RPW(cs->cs_dbuffer, un, dsum, cs->cs_pcolumn,
2485 			cs->cs_blkno, cs->cs_blkcnt, cs->cs_pwid,
2486 			un->un_totalcolumncnt, cs->cs_dcolumn, RAID_PWMAGIC);
2487 
2488 	raidio(cs, RIO_PREWRITE | RIO_DATA);
2489 
2490 	for (cbuf = cs->cs_buflist; cbuf; cbuf = cbuf->cbuf_next) {
2491 
2492 		dsum = 0;
2493 		pbuf = (uint_t *)(void *)(cs->cs_pbuffer + DEV_BSIZE);
2494 		dbuf = (uint_t *)(void *)(cbuf->cbuf_buffer + DEV_BSIZE);
2495 
2496 		wordcnt = count / sizeof (uint_t);
2497 
2498 		col = cbuf->cbuf_column;
2499 
2500 		/* Word aligned */
2501 		if (((uintptr_t)cs->cs_addr & 0x3) == 0) {
2502 			uint_t	uval;
2503 
2504 			/*
2505 			 * Only calculate psum when working on the last
2506 			 * data buffer.
2507 			 */
2508 			if (cbuf->cbuf_next == NULL) {
2509 				psum = 0;
2510 				while (wordcnt--) {
2511 					uval = *uwbuf++;
2512 					*dbuf = uval;
2513 					psum ^= (*pbuf ^= uval);
2514 					dsum ^= uval;
2515 					++dbuf;
2516 					++pbuf;
2517 				}
2518 			} else {
2519 				while (wordcnt--) {
2520 					uval = *uwbuf++;
2521 					*dbuf = uval;
2522 					*pbuf ^= uval;
2523 					dsum ^= uval;
2524 					++dbuf;
2525 					++pbuf;
2526 				}
2527 			}
2528 		} else {
2529 			union {
2530 				uint_t	wb;
2531 				uchar_t	bb[4];
2532 			} cb;
2533 
2534 			/*
2535 			 * Only calculate psum when working on the last
2536 			 * data buffer.
2537 			 */
2538 			if (cbuf->cbuf_next == NULL) {
2539 				psum = 0;
2540 				while (wordcnt--) {
2541 					cb.bb[0] = *ubbuf++;
2542 					cb.bb[1] = *ubbuf++;
2543 					cb.bb[2] = *ubbuf++;
2544 					cb.bb[3] = *ubbuf++;
2545 					*dbuf = cb.wb;
2546 					psum ^= (*pbuf ^= cb.wb);
2547 					dsum ^= cb.wb;
2548 					++dbuf;
2549 					++pbuf;
2550 				}
2551 			} else {
2552 				while (wordcnt--) {
2553 					cb.bb[0] = *ubbuf++;
2554 					cb.bb[1] = *ubbuf++;
2555 					cb.bb[2] = *ubbuf++;
2556 					cb.bb[3] = *ubbuf++;
2557 					*dbuf = cb.wb;
2558 					*pbuf ^= cb.wb;
2559 					dsum ^= cb.wb;
2560 					++dbuf;
2561 					++pbuf;
2562 				}
2563 			}
2564 		}
2565 		RAID_FILLIN_RPW(cbuf->cbuf_buffer, un, dsum, cs->cs_pcolumn,
2566 				cs->cs_blkno, cs->cs_blkcnt, cs->cs_pwid,
2567 				un->un_totalcolumncnt, col, RAID_PWMAGIC);
2568 
2569 		/*
2570 		 * fill in buffer for write to prewrite area
2571 		 */
2572 		bp = &cbuf->cbuf_bp;
2573 		bp->b_un.b_addr = cbuf->cbuf_buffer;
2574 		bp->b_bcount = cbuf->cbuf_bcount + DEV_BSIZE;
2575 		bp->b_bufsize = bp->b_bcount;
2576 		bp->b_lblkno = (cbuf->cbuf_pwslot * un->un_iosize) +
2577 		    un->un_column[col].un_pwstart;
2578 		bp->b_flags = B_WRITE | B_BUSY;
2579 		if (nv_available && nv_prewrite)
2580 			bp->b_flags |= nv_available;
2581 		bp->b_iodone = (int (*)())raid_done;
2582 		bp->b_edev = md_dev64_to_dev(un->un_column[col].un_dev);
2583 		bp->b_chain = (struct buf *)cs;
2584 		md_call_strategy(bp,
2585 			cs->cs_strategy_flag, cs->cs_strategy_private);
2586 	}
2587 
2588 	RAID_FILLIN_RPW(cs->cs_pbuffer, un, psum, cs->cs_dcolumn,
2589 			cs->cs_blkno, cs->cs_blkcnt, cs->cs_pwid,
2590 			un->un_totalcolumncnt, cs->cs_pcolumn, RAID_PWMAGIC);
2591 
2592 	raidio(cs, RIO_PREWRITE | RIO_PARITY);
2593 }
2594 
2595 /*
2596  * NAME:	raid_readregenloop
2597  * DESCRIPTION: RAID metadevice write routine
2598  * PARAMETERS:	md_raidcs_t *cs - pointer to a child structure
2599  */
2600 static void
2601 raid_readregenloop(md_raidcs_t *cs)
2602 {
2603 	mr_unit_t	*un;
2604 	md_raidps_t	*ps;
2605 	uint_t		*dbuf;
2606 	uint_t		*pbuf;
2607 	size_t		wordcnt;
2608 
2609 	un = cs->cs_un;
2610 
2611 	/*
2612 	 * XOR the parity with data bytes, must skip the
2613 	 * pre-write entry header in all data/parity buffers
2614 	 */
2615 	wordcnt = cs->cs_bcount / sizeof (uint_t);
2616 	dbuf = (uint_t *)(void *)(cs->cs_dbuffer + DEV_BSIZE);
2617 	pbuf = (uint_t *)(void *)(cs->cs_pbuffer + DEV_BSIZE);
2618 	while (wordcnt--)
2619 		*dbuf++ ^= *pbuf++;
2620 
2621 	/* bump up the loop count */
2622 	cs->cs_loop++;
2623 
2624 	/* skip the errored component */
2625 	if (cs->cs_loop == cs->cs_dcolumn)
2626 		cs->cs_loop++;
2627 
2628 	if (cs->cs_loop != un->un_totalcolumncnt) {
2629 		cs->cs_frags = 1;
2630 		raidio(cs, RIO_PARITY | RIO_READ | (cs->cs_loop + 1));
2631 		return;
2632 	}
2633 	/* reaching the end sof loop */
2634 	ps = cs->cs_ps;
2635 	bcopy(cs->cs_dbuffer + DEV_BSIZE, cs->cs_addr, cs->cs_bcount);
2636 	raid_free_child(cs, 1);
2637 
2638 	/* decrement readfrags */
2639 	raid_free_parent(ps, RFP_DECR_READFRAGS | RFP_RLS_LOCK);
2640 }
2641 
2642 /*
2643  * NAME:	raid_read_io
2644  * DESCRIPTION: RAID metadevice read I/O routine
2645  * PARAMETERS:	mr_unit_t *un - pointer to a unit structure
2646  *		md_raidcs_t *cs - pointer to a child structure
2647  */
2648 static void
2649 raid_read_io(mr_unit_t *un, md_raidcs_t *cs)
2650 {
2651 	int	flag;
2652 	void	*private;
2653 	buf_t	*bp;
2654 	buf_t	*pb = cs->cs_ps->ps_bp;
2655 	mr_column_t	*column;
2656 
2657 	flag = cs->cs_strategy_flag;
2658 	private = cs->cs_strategy_private;
2659 	column = &un->un_column[cs->cs_dcolumn];
2660 
2661 	/*
2662 	 * The component to be read is good, simply set up bp structure
2663 	 * and call low level md routine doing the read.
2664 	 */
2665 
2666 	if (COLUMN_ISOKAY(un, cs->cs_dcolumn) ||
2667 	    (COLUMN_ISLASTERR(un, cs->cs_dcolumn) &&
2668 		    (cs->cs_flags & MD_RCS_RECOVERY) == 0)) {
2669 		dev_t ddi_dev; /* needed for bioclone, so not md_dev64_t */
2670 		ddi_dev = md_dev64_to_dev(column->un_dev);
2671 
2672 		bp = &cs->cs_dbuf;
2673 		bp = md_bioclone(pb, cs->cs_offset, cs->cs_bcount, ddi_dev,
2674 				column->un_devstart + cs->cs_blkno,
2675 				(int (*)())raid_done, bp, KM_NOSLEEP);
2676 
2677 		bp->b_chain = (buf_t *)cs;
2678 
2679 		cs->cs_frags = 1;
2680 		cs->cs_error_call = raid_read_error;
2681 		cs->cs_retry_call = raid_read_retry;
2682 		cs->cs_flags |= MD_RCS_ISCALL;
2683 		cs->cs_stage = RAID_READ_DONE;
2684 		cs->cs_call = raid_stage;
2685 
2686 		ASSERT(bp->b_edev != 0);
2687 
2688 		md_call_strategy(bp, flag, private);
2689 		return;
2690 	}
2691 
2692 	/*
2693 	 * The component to be read is bad, have to go through
2694 	 * raid specific method to read data from other members.
2695 	 */
2696 	cs->cs_loop = 0;
2697 	/*
2698 	 * NOTE: always get dbuffer before pbuffer
2699 	 *	 and get both buffers before pwslot
2700 	 *	 otherwise a deadlock could be introduced.
2701 	 */
2702 	raid_mapin_buf(cs);
2703 	getdbuffer(cs);
2704 	getpbuffer(cs);
2705 	if (cs->cs_loop == cs->cs_dcolumn)
2706 		cs->cs_loop++;
2707 
2708 	/* zero out data buffer for use as a data sink */
2709 	bzero(cs->cs_dbuffer + DEV_BSIZE, cs->cs_bcount);
2710 	cs->cs_stage = RAID_NONE;
2711 	cs->cs_call = raid_readregenloop;
2712 	cs->cs_error_call = raid_read_error;
2713 	cs->cs_retry_call = raid_read_no_retry;
2714 	cs->cs_frags = 1;
2715 
2716 	/* use parity buffer to read other columns */
2717 	raidio(cs, RIO_PARITY | RIO_READ | (cs->cs_loop + 1));
2718 }
2719 
2720 /*
2721  * NAME:	raid_read
2722  * DESCRIPTION: RAID metadevice write routine
2723  * PARAMETERS:	mr_unit_t *un - pointer to a unit structure
2724  *		md_raidcs_t *cs - pointer to a child structure
2725  */
2726 static int
2727 raid_read(mr_unit_t *un, md_raidcs_t *cs)
2728 {
2729 	int		error = 0;
2730 	md_raidps_t	*ps;
2731 	mdi_unit_t	*ui;
2732 	minor_t		mnum;
2733 
2734 	ASSERT(IO_READER_HELD(un));
2735 	ps = cs->cs_ps;
2736 	ui = ps->ps_ui;
2737 	raid_line_reader_lock(cs, 0);
2738 	un = (mr_unit_t *)md_unit_readerlock(ui);
2739 	ASSERT(UNIT_STATE(un) != RUS_INIT);
2740 	mnum = MD_SID(un);
2741 	cs->cs_un = un;
2742 
2743 	/* make sure the read doesn't go beyond the end of the column */
2744 	if (cs->cs_blkno + cs->cs_blkcnt >
2745 	    un->un_segsize * un->un_segsincolumn) {
2746 		error = ENXIO;
2747 	}
2748 	if (error)
2749 		goto rerror;
2750 
2751 	if (un->un_state & RUS_REGEN) {
2752 		raid_regen_parity(cs);
2753 		un = MD_UNIT(mnum);
2754 		cs->cs_un = un;
2755 	}
2756 
2757 	raid_read_io(un, cs);
2758 	return (0);
2759 
2760 rerror:
2761 	raid_error_parent(ps, error);
2762 	raid_free_child(cs, 1);
2763 	/* decrement readfrags */
2764 	raid_free_parent(ps, RFP_DECR_READFRAGS | RFP_RLS_LOCK);
2765 	return (0);
2766 }
2767 
2768 /*
2769  * NAME:	raid_write_err_retry
2770  * DESCRIPTION: RAID metadevice write retry routine
2771  *		write was for parity or data only;
2772  *		complete write with error, no recovery possible
2773  * PARAMETERS:	mr_unit_t *un - pointer to a unit structure
2774  *		md_raidcs_t *cs - pointer to a child structure
2775  */
2776 /*ARGSUSED*/
2777 static void
2778 raid_write_err_retry(mr_unit_t *un, md_raidcs_t *cs)
2779 {
2780 	md_raidps_t	*ps = cs->cs_ps;
2781 	int		flags = RFP_DECR_FRAGS | RFP_RLS_LOCK;
2782 
2783 	/* decrement pwfrags if needed, and frags */
2784 	if (!(cs->cs_flags & MD_RCS_PWDONE))
2785 		flags |= RFP_DECR_PWFRAGS;
2786 	raid_error_parent(ps, EIO);
2787 	raid_free_child(cs, 1);
2788 	raid_free_parent(ps, flags);
2789 }
2790 
2791 /*
2792  * NAME:	raid_write_err_retry
2793  * DESCRIPTION: RAID metadevice write retry routine
2794  *		 write is too far along to retry and parent
2795  *		 has already been signaled with iodone.
2796  * PARAMETERS:	mr_unit_t *un - pointer to a unit structure
2797  *		md_raidcs_t *cs - pointer to a child structure
2798  */
2799 /*ARGSUSED*/
2800 static void
2801 raid_write_no_retry(mr_unit_t *un, md_raidcs_t *cs)
2802 {
2803 	md_raidps_t	*ps = cs->cs_ps;
2804 	int		flags = RFP_DECR_FRAGS | RFP_RLS_LOCK;
2805 
2806 	/* decrement pwfrags if needed, and frags */
2807 	if (!(cs->cs_flags & MD_RCS_PWDONE))
2808 		flags |= RFP_DECR_PWFRAGS;
2809 	raid_free_child(cs, 1);
2810 	raid_free_parent(ps, flags);
2811 }
2812 
2813 /*
2814  * NAME:	raid_write_retry
2815  * DESCRIPTION: RAID metadevice write retry routine
2816  * PARAMETERS:	mr_unit_t *un - pointer to a unit structure
2817  *		md_raidcs_t *cs - pointer to a child structure
2818  */
2819 static void
2820 raid_write_retry(mr_unit_t *un, md_raidcs_t *cs)
2821 {
2822 	md_raidps_t	*ps;
2823 
2824 	ps = cs->cs_ps;
2825 
2826 	/* re-initialize the buf_t structure for raid_write() */
2827 	cs->cs_dbuf.b_chain = (struct buf *)cs;
2828 	cs->cs_dbuf.b_back = &cs->cs_dbuf;
2829 	cs->cs_dbuf.b_forw = &cs->cs_dbuf;
2830 	cs->cs_dbuf.b_flags = B_BUSY;	/* initialize flags */
2831 	cs->cs_dbuf.b_error = 0;	/* initialize error */
2832 	cs->cs_dbuf.b_offset = -1;
2833 	/* Initialize semaphores */
2834 	sema_init(&cs->cs_dbuf.b_io, 0, NULL,
2835 	    SEMA_DEFAULT, NULL);
2836 	sema_init(&cs->cs_dbuf.b_sem, 0, NULL,
2837 	    SEMA_DEFAULT, NULL);
2838 
2839 	cs->cs_pbuf.b_chain = (struct buf *)cs;
2840 	cs->cs_pbuf.b_back = &cs->cs_pbuf;
2841 	cs->cs_pbuf.b_forw = &cs->cs_pbuf;
2842 	cs->cs_pbuf.b_flags = B_BUSY;	/* initialize flags */
2843 	cs->cs_pbuf.b_error = 0;	/* initialize error */
2844 	cs->cs_pbuf.b_offset = -1;
2845 	sema_init(&cs->cs_pbuf.b_io, 0, NULL,
2846 	    SEMA_DEFAULT, NULL);
2847 	sema_init(&cs->cs_pbuf.b_sem, 0, NULL,
2848 	    SEMA_DEFAULT, NULL);
2849 
2850 	cs->cs_hbuf.b_chain = (struct buf *)cs;
2851 	cs->cs_hbuf.b_back = &cs->cs_hbuf;
2852 	cs->cs_hbuf.b_forw = &cs->cs_hbuf;
2853 	cs->cs_hbuf.b_flags = B_BUSY;	/* initialize flags */
2854 	cs->cs_hbuf.b_error = 0;	/* initialize error */
2855 	cs->cs_hbuf.b_offset = -1;
2856 	sema_init(&cs->cs_hbuf.b_io, 0, NULL,
2857 	    SEMA_DEFAULT, NULL);
2858 	sema_init(&cs->cs_hbuf.b_sem, 0, NULL,
2859 	    SEMA_DEFAULT, NULL);
2860 
2861 	cs->cs_flags &= ~(MD_RCS_ERROR);
2862 	/*
2863 	 * If we have already done'ed the i/o but have done prewrite
2864 	 * on this child, then reset PWDONE flag and bump pwfrags before
2865 	 * restarting i/o.
2866 	 * If pwfrags is zero, we have already 'iodone'd the i/o so
2867 	 * leave things alone.  We don't want to re-'done' it.
2868 	 */
2869 	mutex_enter(&ps->ps_mx);
2870 	if (cs->cs_flags & MD_RCS_PWDONE) {
2871 		cs->cs_flags &= ~MD_RCS_PWDONE;
2872 		ps->ps_pwfrags++;
2873 	}
2874 	mutex_exit(&ps->ps_mx);
2875 	raid_write_io(un, cs);
2876 }
2877 
2878 /*
2879  * NAME:	raid_wrerr
2880  * DESCRIPTION: RAID metadevice write routine
2881  * PARAMETERS:	md_raidcs_t *cs - pointer to a child structure
2882  * LOCKS:	must obtain unit writer lock while calling raid_error_state
2883  *		since a unit or column state transition may take place.
2884  *		must obtain unit reader lock to retry I/O.
2885  */
2886 static void
2887 raid_wrerr(md_raidcs_t *cs)
2888 {
2889 	md_raidps_t	*ps;
2890 	mdi_unit_t	*ui;
2891 	mr_unit_t	*un;
2892 	md_raidcbuf_t	*cbuf;
2893 
2894 	ps = cs->cs_ps;
2895 	ui = ps->ps_ui;
2896 
2897 	un = (mr_unit_t *)md_unit_writerlock(ui);
2898 	ASSERT(un != 0);
2899 
2900 	if (cs->cs_dbuf.b_flags & B_ERROR)
2901 		(void) raid_error_state(un, &cs->cs_dbuf);
2902 	if (cs->cs_pbuf.b_flags & B_ERROR)
2903 		(void) raid_error_state(un, &cs->cs_pbuf);
2904 	if (cs->cs_hbuf.b_flags & B_ERROR)
2905 		(void) raid_error_state(un, &cs->cs_hbuf);
2906 	for (cbuf = cs->cs_buflist; cbuf; cbuf = cbuf->cbuf_next)
2907 		if (cbuf->cbuf_bp.b_flags & B_ERROR)
2908 			(void) raid_error_state(un, &cbuf->cbuf_bp);
2909 
2910 	md_unit_writerexit(ui);
2911 
2912 	ps->ps_flags |= MD_RPS_HSREQ;
2913 
2914 	un = (mr_unit_t *)md_unit_readerlock(ui);
2915 
2916 	/* now attempt the appropriate retry routine */
2917 	(*(cs->cs_retry_call))(un, cs);
2918 }
2919 /*
2920  * NAMES:	raid_write_error
2921  * DESCRIPTION: I/O error handling routine for a RAID metadevice write
2922  * PARAMETERS:	md_raidcs_t *cs - pointer to child structure
2923  */
2924 /*ARGSUSED*/
2925 static void
2926 raid_write_error(md_raidcs_t *cs)
2927 {
2928 	md_raidps_t	*ps;
2929 	mdi_unit_t	*ui;
2930 	mr_unit_t	*un;
2931 	md_raidcbuf_t	*cbuf;
2932 	set_t		setno;
2933 
2934 	ps = cs->cs_ps;
2935 	ui = ps->ps_ui;
2936 	un = cs->cs_un;
2937 
2938 	setno = MD_UN2SET(un);
2939 
2940 	/*
2941 	 * locate each buf that is in error on this io and then
2942 	 * output an error message
2943 	 */
2944 	if ((cs->cs_dbuf.b_flags & B_ERROR) &&
2945 	    (COLUMN_STATE(un, cs->cs_dcolumn) != RCS_ERRED) &&
2946 	    (COLUMN_STATE(un, cs->cs_dcolumn) != RCS_LAST_ERRED))
2947 		cmn_err(CE_WARN, "md %s: write error on %s",
2948 		    md_shortname(MD_SID(un)),
2949 		    md_devname(setno, md_expldev(cs->cs_dbuf.b_edev), NULL, 0));
2950 
2951 	if ((cs->cs_pbuf.b_flags & B_ERROR) &&
2952 	    (COLUMN_STATE(un, cs->cs_pcolumn) != RCS_ERRED) &&
2953 	    (COLUMN_STATE(un, cs->cs_pcolumn) != RCS_LAST_ERRED))
2954 		cmn_err(CE_WARN, "md %s: write error on %s",
2955 		    md_shortname(MD_SID(un)),
2956 		    md_devname(setno, md_expldev(cs->cs_pbuf.b_edev), NULL, 0));
2957 
2958 	for (cbuf = cs->cs_buflist; cbuf; cbuf = cbuf->cbuf_next)
2959 		if ((cbuf->cbuf_bp.b_flags & B_ERROR) &&
2960 		    (COLUMN_STATE(un, cbuf->cbuf_column) != RCS_ERRED) &&
2961 		    (COLUMN_STATE(un, cbuf->cbuf_column) != RCS_LAST_ERRED))
2962 			cmn_err(CE_WARN, "md %s: write error on %s",
2963 			    md_shortname(MD_SID(un)),
2964 			    md_devname(setno, md_expldev(cbuf->cbuf_bp.b_edev),
2965 					NULL, 0));
2966 
2967 	md_unit_readerexit(ui);
2968 
2969 	ASSERT(cs->cs_frags == 0);
2970 
2971 	/* now schedule processing for possible state change */
2972 	daemon_request(&md_mstr_daemon, raid_wrerr,
2973 		(daemon_queue_t *)cs, REQ_OLD);
2974 
2975 }
2976 
2977 /*
2978  * NAME:	raid_write_ponly
2979  * DESCRIPTION: RAID metadevice write routine
2980  *		in the case where only the parity column can be written
2981  * PARAMETERS:	md_raidcs_t *cs - pointer to a child structure
2982  */
2983 static void
2984 raid_write_ponly(md_raidcs_t *cs)
2985 {
2986 	md_raidps_t	*ps;
2987 	mr_unit_t	*un = cs->cs_un;
2988 
2989 	ps = cs->cs_ps;
2990 	/* decrement pwfrags if needed, but not frags */
2991 	ASSERT(!(cs->cs_flags & MD_RCS_PWDONE));
2992 	raid_free_parent(ps, RFP_DECR_PWFRAGS);
2993 	cs->cs_flags |= MD_RCS_PWDONE;
2994 	cs->cs_frags = 1;
2995 	cs->cs_stage = RAID_WRITE_PONLY_DONE;
2996 	cs->cs_call = raid_stage;
2997 	cs->cs_error_call = raid_write_error;
2998 	cs->cs_retry_call = raid_write_no_retry;
2999 	if (WRITE_ALT(un, cs->cs_pcolumn)) {
3000 		cs->cs_frags++;
3001 		raidio(cs, RIO_ALT | RIO_EXTRA | RIO_PARITY | RIO_WRITE);
3002 	}
3003 	raidio(cs, RIO_PARITY | RIO_WRITE);
3004 }
3005 
3006 /*
3007  * NAME:	raid_write_ploop
3008  * DESCRIPTION: RAID metadevice write routine, constructs parity from
3009  *		data in other columns.
3010  * PARAMETERS:	md_raidcs_t *cs - pointer to a child structure
3011  */
3012 static void
3013 raid_write_ploop(md_raidcs_t *cs)
3014 {
3015 	mr_unit_t *un = cs->cs_un;
3016 	uint_t *dbuf;
3017 	uint_t *pbuf;
3018 	size_t wordcnt;
3019 	uint_t psum = 0;
3020 
3021 	wordcnt = cs->cs_bcount / sizeof (uint_t);
3022 	dbuf = (uint_t *)(void *)(cs->cs_dbuffer + DEV_BSIZE);
3023 	pbuf = (uint_t *)(void *)(cs->cs_pbuffer + DEV_BSIZE);
3024 	while (wordcnt--)
3025 		*pbuf++ ^= *dbuf++;
3026 	cs->cs_loop++;
3027 
3028 	/*
3029 	 * build parity from scratch using new data,
3030 	 * skip reading the data and parity columns.
3031 	 */
3032 	while (cs->cs_loop == cs->cs_dcolumn || cs->cs_loop == cs->cs_pcolumn)
3033 		cs->cs_loop++;
3034 
3035 	if (cs->cs_loop != un->un_totalcolumncnt) {
3036 		cs->cs_frags = 1;
3037 		raidio(cs, RIO_DATA | RIO_READ | (cs->cs_loop + 1));
3038 		return;
3039 	}
3040 
3041 	/* construct checksum for parity buffer */
3042 	wordcnt = cs->cs_bcount / sizeof (uint_t);
3043 	pbuf = (uint_t *)(void *)(cs->cs_pbuffer + DEV_BSIZE);
3044 	while (wordcnt--) {
3045 		psum ^= *pbuf;
3046 		pbuf++;
3047 	}
3048 	RAID_FILLIN_RPW(cs->cs_pbuffer, un, psum, -1,
3049 			cs->cs_blkno, cs->cs_blkcnt, cs->cs_pwid,
3050 			1, cs->cs_pcolumn, RAID_PWMAGIC);
3051 
3052 	cs->cs_stage = RAID_NONE;
3053 	cs->cs_call = raid_write_ponly;
3054 	cs->cs_error_call = raid_write_error;
3055 	cs->cs_retry_call = raid_write_err_retry;
3056 	cs->cs_frags = 1;
3057 	if (WRITE_ALT(un, cs->cs_pcolumn)) {
3058 		cs->cs_frags++;
3059 		raidio(cs, RIO_ALT | RIO_EXTRA | RIO_PARITY | RIO_PREWRITE);
3060 	}
3061 	raidio(cs, RIO_PARITY | RIO_PREWRITE);
3062 }
3063 
3064 /*
3065  * NAME:	raid_write_donly
3066  * DESCRIPTION: RAID metadevice write routine
3067  *		Completed writing data to prewrite entry
3068  *		in the case where only the data column can be written
3069  * PARAMETERS:	md_raidcs_t *cs - pointer to a child structure
3070  */
3071 static void
3072 raid_write_donly(md_raidcs_t *cs)
3073 {
3074 	md_raidps_t	*ps;
3075 	mr_unit_t	*un = cs->cs_un;
3076 
3077 	ps = cs->cs_ps;
3078 	/* WARNING: don't release unit reader lock here... */
3079 	/* decrement pwfrags if needed, but not frags */
3080 	ASSERT(!(cs->cs_flags & MD_RCS_PWDONE));
3081 	raid_free_parent(ps, RFP_DECR_PWFRAGS);
3082 	cs->cs_flags |= MD_RCS_PWDONE;
3083 	cs->cs_frags = 1;
3084 	cs->cs_stage = RAID_WRITE_DONLY_DONE;
3085 	cs->cs_call = raid_stage;
3086 	cs->cs_error_call = raid_write_error;
3087 	cs->cs_retry_call = raid_write_err_retry;
3088 	if (WRITE_ALT(un, cs->cs_dcolumn)) {
3089 		cs->cs_frags++;
3090 		raidio(cs, RIO_ALT | RIO_EXTRA | RIO_DATA | RIO_WRITE);
3091 	}
3092 	raidio(cs, RIO_DATA | RIO_WRITE);
3093 }
3094 
3095 /*
3096  * NAME:	raid_write_got_old
3097  * DESCRIPTION: RAID metadevice write routine
3098  *		completed read of old data and old parity
3099  * PARAMETERS:	md_raidcs_t *cs - pointer to a child structure
3100  */
3101 static void
3102 raid_write_got_old(md_raidcs_t *cs)
3103 {
3104 	mr_unit_t *un = cs->cs_un;
3105 
3106 	ASSERT(IO_READER_HELD(cs->cs_un));
3107 	ASSERT(UNIT_READER_HELD(cs->cs_un));
3108 
3109 	raid_mapin_buf(cs);
3110 	genstandardparity(cs);
3111 	cs->cs_frags = 2;
3112 	cs->cs_call = raid_stage;
3113 	cs->cs_stage = RAID_PREWRITE_DONE;
3114 	cs->cs_error_call = raid_write_error;
3115 	cs->cs_retry_call = raid_write_retry;
3116 
3117 	if (WRITE_ALT(un, cs->cs_dcolumn)) {
3118 		cs->cs_frags++;
3119 		raidio(cs, RIO_ALT | RIO_EXTRA | RIO_DATA | RIO_PREWRITE);
3120 	}
3121 
3122 	if (WRITE_ALT(un, cs->cs_pcolumn)) {
3123 		cs->cs_frags++;
3124 		raidio(cs, RIO_ALT | RIO_EXTRA | RIO_PARITY | RIO_PREWRITE);
3125 	}
3126 	ASSERT(cs->cs_frags < 4);
3127 	raidio(cs,  RIO_DATA | RIO_PREWRITE);
3128 	raidio(cs,  RIO_PARITY | RIO_PREWRITE);
3129 }
3130 
3131 /*
3132  * NAME:	raid_write_io
3133  * DESCRIPTION: RAID metadevice write I/O routine
3134  * PARAMETERS:	mr_unit_t *un -  pointer to a unit structure
3135  *		md_raidcs_t *cs - pointer to a child structure
3136  */
3137 
3138 /*ARGSUSED*/
3139 static void
3140 raid_write_io(mr_unit_t *un, md_raidcs_t *cs)
3141 {
3142 	md_raidps_t	*ps = cs->cs_ps;
3143 	uint_t		*dbuf;
3144 	uint_t		*ubuf;
3145 	size_t		wordcnt;
3146 	uint_t		dsum = 0;
3147 	int		pcheck;
3148 	int		dcheck;
3149 
3150 	ASSERT((un->un_column[cs->cs_pcolumn].un_devstate &
3151 	    RCS_INIT) == 0);
3152 	ASSERT((un->un_column[cs->cs_dcolumn].un_devstate &
3153 	    RCS_INIT) == 0);
3154 	ASSERT(IO_READER_HELD(un));
3155 	ASSERT(UNIT_READER_HELD(un));
3156 	ASSERT(cs->cs_flags & MD_RCS_HAVE_PW_SLOTS);
3157 	if (cs->cs_flags & MD_RCS_LINE) {
3158 
3159 		mr_unit_t	*un = cs->cs_un;
3160 
3161 		ASSERT(un->un_origcolumncnt == un->un_totalcolumncnt);
3162 		raid_mapin_buf(cs);
3163 		cs->cs_frags = un->un_origcolumncnt;
3164 		cs->cs_call = raid_stage;
3165 		cs->cs_error_call = raid_write_error;
3166 		cs->cs_retry_call = raid_write_no_retry;
3167 		cs->cs_stage = RAID_LINE_PWDONE;
3168 		genlineparity(cs);
3169 		return;
3170 	}
3171 
3172 	pcheck = erred_check_line(un, cs, &un->un_column[cs->cs_pcolumn]);
3173 	dcheck = erred_check_line(un, cs, &un->un_column[cs->cs_dcolumn]);
3174 	cs->cs_resync_check = pcheck << RCL_PARITY_OFFSET || dcheck;
3175 
3176 	if (pcheck == RCL_ERRED && dcheck == RCL_ERRED) {
3177 		int err = EIO;
3178 
3179 		if ((un->un_column[cs->cs_pcolumn].un_devstate ==
3180 		    RCS_LAST_ERRED) ||
3181 		    (un->un_column[cs->cs_dcolumn].un_devstate ==
3182 		    RCS_LAST_ERRED))
3183 			err = ENXIO;
3184 		raid_error_parent(ps, err);
3185 		ASSERT(!(cs->cs_flags & MD_RCS_PWDONE));
3186 		raid_free_child(cs, 1);
3187 		raid_free_parent(ps,  RFP_DECR_FRAGS
3188 		    | RFP_RLS_LOCK | RFP_DECR_PWFRAGS);
3189 		return;
3190 	}
3191 
3192 	if (pcheck & RCL_ERRED) {
3193 		/*
3194 		 * handle case of only having data drive
3195 		 */
3196 		raid_mapin_buf(cs);
3197 		wordcnt = cs->cs_bcount / sizeof (uint_t);
3198 
3199 		dbuf = (uint_t *)(void *)(cs->cs_dbuffer + DEV_BSIZE);
3200 		ubuf = (uint_t *)(void *)(cs->cs_addr);
3201 
3202 		while (wordcnt--) {
3203 			*dbuf = *ubuf;
3204 			dsum ^= *ubuf;
3205 			dbuf++;
3206 			ubuf++;
3207 		}
3208 		RAID_FILLIN_RPW(cs->cs_dbuffer, un, dsum, -1,
3209 				cs->cs_blkno, cs->cs_blkcnt, cs->cs_pwid,
3210 				1, cs->cs_dcolumn, RAID_PWMAGIC);
3211 		cs->cs_frags = 1;
3212 		cs->cs_stage = RAID_NONE;
3213 		cs->cs_call = raid_write_donly;
3214 		cs->cs_error_call = raid_write_error;
3215 		cs->cs_retry_call = raid_write_err_retry;
3216 		if (WRITE_ALT(un, cs->cs_dcolumn)) {
3217 			cs->cs_frags++;
3218 			raidio(cs, RIO_DATA | RIO_ALT | RIO_EXTRA |
3219 			    RIO_PREWRITE);
3220 		}
3221 		raidio(cs, RIO_DATA | RIO_PREWRITE);
3222 		return;
3223 	}
3224 
3225 	if (dcheck & RCL_ERRED) {
3226 		/*
3227 		 * handle case of only having parity drive
3228 		 * build parity from scratch using new data,
3229 		 * skip reading the data and parity columns.
3230 		 */
3231 		raid_mapin_buf(cs);
3232 		cs->cs_loop = 0;
3233 		while (cs->cs_loop == cs->cs_dcolumn ||
3234 		    cs->cs_loop == cs->cs_pcolumn)
3235 			cs->cs_loop++;
3236 
3237 		/* copy new data in to begin building parity */
3238 		bcopy(cs->cs_addr, cs->cs_pbuffer + DEV_BSIZE, cs->cs_bcount);
3239 		cs->cs_stage = RAID_NONE;
3240 		cs->cs_call = raid_write_ploop;
3241 		cs->cs_error_call = raid_write_error;
3242 		cs->cs_retry_call = raid_write_err_retry;
3243 		cs->cs_frags = 1;
3244 		raidio(cs, RIO_DATA | RIO_READ | (cs->cs_loop + 1));
3245 		return;
3246 	}
3247 	/*
3248 	 * handle normal cases
3249 	 * read old data and old parity
3250 	 */
3251 	cs->cs_frags = 2;
3252 	cs->cs_stage = RAID_NONE;
3253 	cs->cs_call = raid_write_got_old;
3254 	cs->cs_error_call = raid_write_error;
3255 	cs->cs_retry_call = raid_write_retry;
3256 	ASSERT(ps->ps_magic == RAID_PSMAGIC);
3257 	raidio(cs, RIO_DATA | RIO_READ);
3258 	raidio(cs, RIO_PARITY | RIO_READ);
3259 }
3260 
3261 static void
3262 raid_enqueue(md_raidcs_t *cs)
3263 {
3264 	mdi_unit_t	*ui = cs->cs_ps->ps_ui;
3265 	kmutex_t	*io_list_mutex = &ui->ui_io_lock->io_list_mutex;
3266 	md_raidcs_t	*cs1;
3267 
3268 	mutex_enter(io_list_mutex);
3269 	ASSERT(! (cs->cs_flags & MD_RCS_LLOCKD));
3270 	if (ui->ui_io_lock->io_list_front == NULL) {
3271 		ui->ui_io_lock->io_list_front = cs;
3272 		ui->ui_io_lock->io_list_back = cs;
3273 	} else {
3274 		cs1 = ui->ui_io_lock->io_list_back;
3275 		cs1->cs_linlck_next = cs;
3276 		ui->ui_io_lock->io_list_back = cs;
3277 	}
3278 	STAT_INC(raid_write_waits);
3279 	STAT_MAX(raid_max_write_q_length, raid_write_queue_length);
3280 	cs->cs_linlck_next = NULL;
3281 	mutex_exit(io_list_mutex);
3282 }
3283 
3284 /*
3285  * NAME:	raid_write
3286  * DESCRIPTION: RAID metadevice write routine
3287  * PARAMETERS:	mr_unit_t *un -  pointer to a unit structure
3288  *		md_raidcs_t *cs - pointer to a child structure
3289  */
3290 
3291 /*ARGSUSED*/
3292 static int
3293 raid_write(mr_unit_t *un, md_raidcs_t *cs)
3294 {
3295 	int		error = 0;
3296 	md_raidps_t	*ps;
3297 	mdi_unit_t	*ui;
3298 	minor_t		mnum;
3299 	clock_t		timeout;
3300 
3301 	ASSERT(IO_READER_HELD(un));
3302 	ps = cs->cs_ps;
3303 	ui = ps->ps_ui;
3304 
3305 	ASSERT(UNIT_STATE(un) != RUS_INIT);
3306 	if (UNIT_STATE(un) == RUS_LAST_ERRED)
3307 		error = EIO;
3308 
3309 	/* make sure the write doesn't go beyond the column */
3310 	if (cs->cs_blkno + cs->cs_blkcnt > un->un_segsize * un->un_segsincolumn)
3311 		error = ENXIO;
3312 	if (error)
3313 		goto werror;
3314 
3315 	getresources(cs);
3316 
3317 	/*
3318 	 * this is an advisory loop that keeps the waiting lists short
3319 	 * to reduce cpu time.  Since there is a race introduced by not
3320 	 * aquiring all the correct mutexes, use a cv_timedwait to be
3321 	 * sure the write always will wake up and start.
3322 	 */
3323 	while (raid_check_pw(cs)) {
3324 		mutex_enter(&un->un_mx);
3325 		(void) drv_getparm(LBOLT, &timeout);
3326 		timeout += md_wr_wait;
3327 		un->un_rflags |= MD_RFLAG_NEEDPW;
3328 		STAT_INC(raid_prewrite_waits);
3329 		(void) cv_timedwait(&un->un_cv, &un->un_mx, timeout);
3330 		un->un_rflags &= ~MD_RFLAG_NEEDPW;
3331 		mutex_exit(&un->un_mx);
3332 	}
3333 
3334 	if (raid_line_writer_lock(cs, 1))
3335 		return (0);
3336 
3337 	un = (mr_unit_t *)md_unit_readerlock(ui);
3338 	cs->cs_un = un;
3339 	mnum = MD_SID(un);
3340 
3341 	if (un->un_state & RUS_REGEN) {
3342 		raid_regen_parity(cs);
3343 		un = MD_UNIT(mnum);
3344 		cs->cs_un = un;
3345 	}
3346 
3347 	raid_write_io(un, cs);
3348 	return (0);
3349 werror:
3350 	/* aquire unit reader lock sinc raid_free_child always drops it */
3351 	raid_error_parent(ps, error);
3352 	raid_free_child(cs, 0);
3353 	/* decrement both pwfrags and frags */
3354 	raid_free_parent(ps, RFP_DECR_PWFRAGS | RFP_DECR_FRAGS | RFP_RLS_LOCK);
3355 	return (0);
3356 }
3357 
3358 
3359 /*
3360  * NAMES:	raid_stage
3361  * DESCRIPTION: post-processing routine for a RAID metadevice
3362  * PARAMETERS:	md_raidcs_t *cs - pointer to child structure
3363  */
3364 static void
3365 raid_stage(md_raidcs_t *cs)
3366 {
3367 	md_raidps_t	*ps = cs->cs_ps;
3368 	mr_unit_t	*un = cs->cs_un;
3369 	md_raidcbuf_t	*cbuf;
3370 	buf_t		*bp;
3371 	void		*private;
3372 	int		flag;
3373 
3374 	switch (cs->cs_stage) {
3375 	    case RAID_READ_DONE:
3376 		raid_free_child(cs, 1);
3377 		/* decrement readfrags */
3378 		raid_free_parent(ps, RFP_DECR_READFRAGS | RFP_RLS_LOCK);
3379 		return;
3380 
3381 	    case RAID_WRITE_DONE:
3382 	    case RAID_WRITE_PONLY_DONE:
3383 	    case RAID_WRITE_DONLY_DONE:
3384 		/*
3385 		 *  Completed writing real parity and/or data.
3386 		 */
3387 		ASSERT(cs->cs_flags & MD_RCS_PWDONE);
3388 		raid_free_child(cs, 1);
3389 		/* decrement frags but not pwfrags */
3390 		raid_free_parent(ps, RFP_DECR_FRAGS | RFP_RLS_LOCK);
3391 		return;
3392 
3393 	    case RAID_PREWRITE_DONE:
3394 		/*
3395 		 * completed writing data and parity to prewrite entries
3396 		 */
3397 		/*
3398 		 * WARNING: don't release unit reader lock here..
3399 		 * decrement pwfrags but not frags
3400 		 */
3401 		raid_free_parent(ps, RFP_DECR_PWFRAGS);
3402 		cs->cs_flags |= MD_RCS_PWDONE;
3403 		cs->cs_frags = 2;
3404 		cs->cs_stage = RAID_WRITE_DONE;
3405 		cs->cs_call = raid_stage;
3406 		cs->cs_error_call = raid_write_error;
3407 		cs->cs_retry_call = raid_write_no_retry;
3408 		if (WRITE_ALT(un, cs->cs_pcolumn)) {
3409 			cs->cs_frags++;
3410 			raidio(cs, RIO_ALT | RIO_EXTRA | RIO_PARITY |
3411 			    RIO_WRITE);
3412 		}
3413 		if (WRITE_ALT(un, cs->cs_dcolumn)) {
3414 			cs->cs_frags++;
3415 			raidio(cs, RIO_ALT | RIO_EXTRA | RIO_DATA | RIO_WRITE);
3416 		}
3417 		ASSERT(cs->cs_frags < 4);
3418 		raidio(cs, RIO_DATA | RIO_WRITE);
3419 		raidio(cs, RIO_PARITY | RIO_WRITE);
3420 		if (cs->cs_pw_inval_list) {
3421 			raid_free_pwinvalidate(cs);
3422 		}
3423 		return;
3424 
3425 	    case RAID_LINE_PWDONE:
3426 		ASSERT(cs->cs_frags == 0);
3427 		raid_free_parent(ps, RFP_DECR_PWFRAGS);
3428 		cs->cs_flags |= MD_RCS_PWDONE;
3429 		cs->cs_frags = un->un_origcolumncnt;
3430 		cs->cs_call = raid_stage;
3431 		cs->cs_error_call = raid_write_error;
3432 		cs->cs_retry_call = raid_write_no_retry;
3433 		cs->cs_stage = RAID_WRITE_DONE;
3434 		for (cbuf = cs->cs_buflist; cbuf; cbuf = cbuf->cbuf_next) {
3435 			/*
3436 			 * fill in buffer for write to prewrite area
3437 			 */
3438 			bp = &cbuf->cbuf_bp;
3439 			bp->b_back = bp;
3440 			bp->b_forw = bp;
3441 			bp->b_un.b_addr = cbuf->cbuf_buffer + DEV_BSIZE;
3442 			bp->b_bcount = cbuf->cbuf_bcount;
3443 			bp->b_bufsize = cbuf->cbuf_bcount;
3444 			bp->b_lblkno =
3445 			    un->un_column[cbuf->cbuf_column].un_devstart +
3446 			    cs->cs_blkno;
3447 			bp->b_flags &= ~(B_READ | B_WRITE | B_ERROR);
3448 			bp->b_flags &= ~nv_available;
3449 			bp->b_flags |= B_WRITE | B_BUSY;
3450 			bp->b_iodone = (int (*)())raid_done;
3451 			bp->b_edev = md_dev64_to_dev(
3452 				un->un_column[cbuf->cbuf_column].un_dev);
3453 			bp->b_chain = (struct buf *)cs;
3454 			private = cs->cs_strategy_private;
3455 			flag = cs->cs_strategy_flag;
3456 			md_call_strategy(bp, flag, private);
3457 		}
3458 		raidio(cs, RIO_DATA | RIO_WRITE);
3459 		raidio(cs, RIO_PARITY | RIO_WRITE);
3460 		if (cs->cs_pw_inval_list) {
3461 			raid_free_pwinvalidate(cs);
3462 		}
3463 		return;
3464 
3465 	    default:
3466 		ASSERT(0);
3467 		break;
3468 	}
3469 }
3470 /*
3471  * NAME:	md_raid_strategy
3472  * DESCRIPTION: RAID metadevice I/O oprations entry point.
3473  * PARAMETERS:	buf_t	  *pb - pointer to a user I/O buffer
3474  *		int	 flag - metadevice specific flag
3475  *		void *private - carry over flag ??
3476  *
3477  */
3478 
3479 void
3480 md_raid_strategy(buf_t *pb, int flag, void *private)
3481 {
3482 	md_raidps_t	*ps;
3483 	md_raidcs_t	*cs;
3484 	int		doing_writes;
3485 	int		err;
3486 	mr_unit_t	*un;
3487 	mdi_unit_t	*ui;
3488 	size_t		count;
3489 	diskaddr_t	blkno;
3490 	caddr_t		addr;
3491 	off_t		offset;
3492 	int		colcnt;
3493 	minor_t		mnum;
3494 	set_t		setno;
3495 
3496 	ui = MDI_UNIT(getminor(pb->b_edev));
3497 	md_kstat_waitq_enter(ui);
3498 	un = (mr_unit_t *)md_io_readerlock(ui);
3499 	setno = MD_MIN2SET(getminor(pb->b_edev));
3500 
3501 	if ((flag & MD_NOBLOCK) == 0) {
3502 		if (md_inc_iocount(setno) != 0) {
3503 			pb->b_flags |= B_ERROR;
3504 			pb->b_error = ENXIO;
3505 			pb->b_resid = pb->b_bcount;
3506 			md_io_readerexit(ui);
3507 			biodone(pb);
3508 			return;
3509 		}
3510 	} else {
3511 		md_inc_iocount_noblock(setno);
3512 	}
3513 
3514 	mnum = MD_SID(un);
3515 	colcnt = un->un_totalcolumncnt - 1;
3516 	count = pb->b_bcount;
3517 
3518 	STAT_CHECK(raid_512, count == 512);
3519 	STAT_CHECK(raid_1024, count == 1024);
3520 	STAT_CHECK(raid_1024_8192, count > 1024 && count < 8192);
3521 	STAT_CHECK(raid_8192, count == 8192);
3522 	STAT_CHECK(raid_8192_bigger, count > 8192);
3523 
3524 	(void *) md_unit_readerlock(ui);
3525 	if (!(flag & MD_STR_NOTTOP)) {
3526 		err = md_checkbuf(ui, (md_unit_t *)un, pb); /* check and map */
3527 		if (err != 0) {
3528 			md_kstat_waitq_exit(ui);
3529 			md_io_readerexit(ui);
3530 			return;
3531 		}
3532 	}
3533 	md_unit_readerexit(ui);
3534 
3535 	STAT_INC(raid_total_io);
3536 
3537 	/* allocate a parent structure for the user I/O */
3538 	ps = kmem_cache_alloc(raid_parent_cache, MD_ALLOCFLAGS);
3539 	raid_parent_init(ps);
3540 
3541 	/*
3542 	 * Save essential information from the original buffhdr
3543 	 * in the md_save structure.
3544 	 */
3545 	ps->ps_un = un;
3546 	ps->ps_ui = ui;
3547 	ps->ps_bp = pb;
3548 	ps->ps_addr = pb->b_un.b_addr;
3549 
3550 	if ((pb->b_flags & B_READ) == 0) {
3551 		ps->ps_flags |= MD_RPS_WRITE;
3552 		doing_writes = 1;
3553 		STAT_INC(raid_writes);
3554 	} else {
3555 		ps->ps_flags |= MD_RPS_READ;
3556 		doing_writes = 0;
3557 		STAT_INC(raid_reads);
3558 	}
3559 
3560 	count = lbtodb(pb->b_bcount);	/* transfer count (in blocks) */
3561 	blkno = pb->b_lblkno;		/* block number on device */
3562 	addr  = 0;
3563 	offset = 0;
3564 	ps->ps_pwfrags = 1;
3565 	ps->ps_frags = 1;
3566 	md_kstat_waitq_to_runq(ui);
3567 
3568 	do {
3569 		cs = kmem_cache_alloc(raid_child_cache, MD_ALLOCFLAGS);
3570 		raid_child_init(cs);
3571 		cs->cs_ps = ps;
3572 		cs->cs_un = un;
3573 		cs->cs_mdunit = mnum;
3574 		cs->cs_strategy_flag = flag;
3575 		cs->cs_strategy_private = private;
3576 		cs->cs_addr = addr;
3577 		cs->cs_offset = offset;
3578 		count = raid_iosetup(un, blkno, count, cs);
3579 		if (cs->cs_flags & MD_RCS_LINE) {
3580 			blkno += (cs->cs_blkcnt * colcnt);
3581 			offset += (cs->cs_bcount * colcnt);
3582 		} else {
3583 			blkno +=  cs->cs_blkcnt;
3584 			offset += cs->cs_bcount;
3585 		}
3586 		/* for each cs bump up the ps_pwfrags and ps_frags fields */
3587 		if (count) {
3588 			mutex_enter(&ps->ps_mx);
3589 			ps->ps_pwfrags++;
3590 			ps->ps_frags++;
3591 			mutex_exit(&ps->ps_mx);
3592 			if (doing_writes)
3593 				(void) raid_write(un, cs);
3594 			else
3595 				(void) raid_read(un, cs);
3596 		}
3597 	} while (count);
3598 	if (doing_writes) {
3599 		(void) raid_write(un, cs);
3600 	} else
3601 		(void) raid_read(un, cs);
3602 
3603 	if (! (flag & MD_STR_NOTTOP) && panicstr) {
3604 		while (! (ps->ps_flags & MD_RPS_DONE)) {
3605 			md_daemon(1, &md_done_daemon);
3606 			drv_usecwait(10);
3607 		}
3608 		kmem_cache_free(raid_parent_cache, ps);
3609 	}
3610 }
3611 
3612 /*
3613  * NAMES:	raid_snarf
3614  * DESCRIPTION: RAID metadevice SNARF entry point
3615  * PARAMETERS:	md_snarfcmd_t cmd,
3616  *		set_t setno
3617  * RETURNS:
3618  */
3619 static int
3620 raid_snarf(md_snarfcmd_t cmd, set_t setno)
3621 {
3622 	mr_unit_t	*un;
3623 	mddb_recid_t	recid;
3624 	int		gotsomething;
3625 	int		all_raid_gotten;
3626 	mddb_type_t	typ1;
3627 	uint_t		ncol;
3628 	mddb_de_ic_t	*dep;
3629 	mddb_rb32_t	*rbp;
3630 	size_t		newreqsize;
3631 	mr_unit_t	*big_un;
3632 	mr_unit32_od_t	*small_un;
3633 
3634 
3635 	if (cmd == MD_SNARF_CLEANUP)
3636 		return (0);
3637 
3638 	all_raid_gotten = 1;
3639 	gotsomething = 0;
3640 	typ1 = (mddb_type_t)md_getshared_key(setno,
3641 	    raid_md_ops.md_driver.md_drivername);
3642 	recid = mddb_makerecid(setno, 0);
3643 
3644 	while ((recid = mddb_getnextrec(recid, typ1, 0)) > 0) {
3645 		if (mddb_getrecprivate(recid) & MD_PRV_GOTIT) {
3646 			continue;
3647 		}
3648 
3649 		dep = mddb_getrecdep(recid);
3650 		dep->de_flags = MDDB_F_RAID;
3651 		rbp = dep->de_rb;
3652 		if ((rbp->rb_revision == MDDB_REV_RB) &&
3653 		    ((rbp->rb_private & MD_PRV_CONVD) == 0)) {
3654 			/*
3655 			 * This means, we have an old and small record
3656 			 * and this record hasn't already been converted.
3657 			 * Before we create an incore metadevice from this
3658 			 * we have to convert it to a big record.
3659 			 */
3660 			small_un = (mr_unit32_od_t *)mddb_getrecaddr(recid);
3661 			ncol = small_un->un_totalcolumncnt;
3662 			newreqsize = sizeof (mr_unit_t) +
3663 				((ncol - 1) * sizeof (mr_column_t));
3664 			big_un = (mr_unit_t *)kmem_zalloc(newreqsize, KM_SLEEP);
3665 			raid_convert((caddr_t)small_un, (caddr_t)big_un,
3666 				SMALL_2_BIG);
3667 			kmem_free(small_un, dep->de_reqsize);
3668 			dep->de_rb_userdata = big_un;
3669 			dep->de_reqsize = newreqsize;
3670 			un = big_un;
3671 			rbp->rb_private |= MD_PRV_CONVD;
3672 		} else {
3673 			/* Big device */
3674 			un = (mr_unit_t *)mddb_getrecaddr(recid);
3675 		}
3676 
3677 		/* Set revision and flag accordingly */
3678 		if (rbp->rb_revision == MDDB_REV_RB) {
3679 			un->c.un_revision = MD_32BIT_META_DEV;
3680 		} else {
3681 			un->c.un_revision = MD_64BIT_META_DEV;
3682 			un->c.un_flag |= MD_EFILABEL;
3683 		}
3684 
3685 		/*
3686 		 * Create minor device node for snarfed entry.
3687 		 */
3688 		(void) md_create_minor_node(MD_MIN2SET(MD_SID(un)), MD_SID(un));
3689 
3690 		if (MD_UNIT(MD_SID(un)) != NULL) {
3691 			mddb_setrecprivate(recid, MD_PRV_PENDDEL);
3692 			continue;
3693 		}
3694 		all_raid_gotten = 0;
3695 		if (raid_build_incore((void *)un, 1) == 0) {
3696 			mddb_setrecprivate(recid, MD_PRV_GOTIT);
3697 			md_create_unit_incore(MD_SID(un), &raid_md_ops,
3698 			    1);
3699 			gotsomething = 1;
3700 		} else if (un->mr_ic) {
3701 			kmem_free(un->un_column_ic, sizeof (mr_column_ic_t) *
3702 				un->un_totalcolumncnt);
3703 			kmem_free(un->mr_ic, sizeof (*un->mr_ic));
3704 		}
3705 	}
3706 
3707 	if (!all_raid_gotten) {
3708 		return (gotsomething);
3709 	}
3710 
3711 	recid = mddb_makerecid(setno, 0);
3712 	while ((recid = mddb_getnextrec(recid, typ1, 0)) > 0)
3713 		if (!(mddb_getrecprivate(recid) & MD_PRV_GOTIT))
3714 			mddb_setrecprivate(recid, MD_PRV_PENDDEL);
3715 
3716 	return (0);
3717 }
3718 
3719 /*
3720  * NAMES:	raid_halt
3721  * DESCRIPTION: RAID metadevice HALT entry point
3722  * PARAMETERS:	md_haltcmd_t cmd -
3723  *		set_t	setno -
3724  * RETURNS:
3725  */
3726 static int
3727 raid_halt(md_haltcmd_t cmd, set_t setno)
3728 {
3729 	set_t		i;
3730 	mdi_unit_t	*ui;
3731 	minor_t		mnum;
3732 
3733 	if (cmd == MD_HALT_CLOSE)
3734 		return (0);
3735 
3736 	if (cmd == MD_HALT_OPEN)
3737 		return (0);
3738 
3739 	if (cmd == MD_HALT_UNLOAD)
3740 		return (0);
3741 
3742 	if (cmd == MD_HALT_CHECK) {
3743 		for (i = 0; i < md_nunits; i++) {
3744 			mnum = MD_MKMIN(setno, i);
3745 			if ((ui = MDI_UNIT(mnum)) == NULL)
3746 				continue;
3747 			if (ui->ui_opsindex != raid_md_ops.md_selfindex)
3748 				continue;
3749 			if (md_unit_isopen(ui))
3750 				return (1);
3751 		}
3752 		return (0);
3753 	}
3754 
3755 	if (cmd != MD_HALT_DOIT)
3756 		return (1);
3757 
3758 	for (i = 0; i < md_nunits; i++) {
3759 		mnum = MD_MKMIN(setno, i);
3760 		if ((ui = MDI_UNIT(mnum)) == NULL)
3761 			continue;
3762 		if (ui->ui_opsindex != raid_md_ops.md_selfindex)
3763 			continue;
3764 		reset_raid((mr_unit_t *)MD_UNIT(mnum), mnum, 0);
3765 	}
3766 	return (0);
3767 }
3768 
3769 /*
3770  * NAMES:	raid_close_all_devs
3771  * DESCRIPTION: Close all the devices of the unit.
3772  * PARAMETERS:	mr_unit_t *un - pointer to unit structure
3773  * RETURNS:
3774  */
3775 void
3776 raid_close_all_devs(mr_unit_t *un, int init_pw, int md_cflags)
3777 {
3778 	int		i;
3779 	mr_column_t	*device;
3780 
3781 	for (i = 0; i < un->un_totalcolumncnt; i++) {
3782 		device = &un->un_column[i];
3783 		if (device->un_devflags & MD_RAID_DEV_ISOPEN) {
3784 			ASSERT((device->un_dev != (md_dev64_t)0) &&
3785 			    (device->un_dev != NODEV64));
3786 			if ((device->un_devstate & RCS_OKAY) && init_pw)
3787 				(void) init_pw_area(un, device->un_dev,
3788 							device->un_pwstart, i);
3789 			md_layered_close(device->un_dev, md_cflags);
3790 			device->un_devflags &= ~MD_RAID_DEV_ISOPEN;
3791 		}
3792 	}
3793 }
3794 
3795 /*
3796  * NAMES:	raid_open_all_devs
3797  * DESCRIPTION: Open all the components (columns) of the device unit.
3798  * PARAMETERS:	mr_unit_t *un - pointer to unit structure
3799  * RETURNS:
3800  */
3801 static int
3802 raid_open_all_devs(mr_unit_t *un, int md_oflags)
3803 {
3804 	minor_t		mnum = MD_SID(un);
3805 	int		i;
3806 	int		not_opened = 0;
3807 	int		commit = 0;
3808 	int		col = -1;
3809 	mr_column_t	*device;
3810 	set_t		setno = MD_MIN2SET(MD_SID(un));
3811 	side_t		side = mddb_getsidenum(setno);
3812 	mdkey_t		key;
3813 	mdi_unit_t	*ui = MDI_UNIT(mnum);
3814 
3815 	ui->ui_tstate &= ~MD_INACCESSIBLE;
3816 
3817 	for (i = 0; i < un->un_totalcolumncnt; i++) {
3818 		md_dev64_t tmpdev;
3819 
3820 		device = &un->un_column[i];
3821 
3822 		if (COLUMN_STATE(un, i) & RCS_ERRED) {
3823 			not_opened++;
3824 			continue;
3825 		}
3826 
3827 		if (device->un_devflags & MD_RAID_DEV_ISOPEN)
3828 			continue;
3829 
3830 		tmpdev = device->un_dev;
3831 		/*
3832 		 * Open by device id
3833 		 */
3834 		key = HOTSPARED(un, i) ?
3835 			device->un_hs_key : device->un_orig_key;
3836 		if ((md_getmajor(tmpdev) != md_major) &&
3837 			md_devid_found(setno, side, key) == 1) {
3838 			tmpdev = md_resolve_bydevid(mnum, tmpdev, key);
3839 		}
3840 		if (md_layered_open(mnum, &tmpdev, md_oflags)) {
3841 			device->un_dev = tmpdev;
3842 			not_opened++;
3843 			continue;
3844 		}
3845 		device->un_dev = tmpdev;
3846 		device->un_devflags |= MD_RAID_DEV_ISOPEN;
3847 	}
3848 
3849 	/* if open errors and errored devices are 1 then device can run */
3850 	if (not_opened > 1) {
3851 		cmn_err(CE_WARN,
3852 		"md: %s failed to open. open error on %s\n",
3853 			md_shortname(MD_SID(un)),
3854 			md_devname(MD_UN2SET(un), device->un_orig_dev,
3855 					NULL, 0));
3856 
3857 		ui->ui_tstate |= MD_INACCESSIBLE;
3858 
3859 		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_OPEN_FAIL, SVM_TAG_METADEVICE,
3860 		    MD_UN2SET(un), MD_SID(un));
3861 
3862 		return (not_opened > 1);
3863 	}
3864 
3865 	for (i = 0; i < un->un_totalcolumncnt; i++) {
3866 		device = &un->un_column[i];
3867 		if (device->un_devflags & MD_RAID_DEV_ISOPEN) {
3868 			if (device->un_devstate & RCS_LAST_ERRED) {
3869 			/*
3870 			 * At this point in time there is a possibility
3871 			 * that errors were the result of a controller
3872 			 * failure with more than a single column on it
3873 			 * so clear out last errored columns and let errors
3874 			 * re-occur is necessary.
3875 			 */
3876 				raid_set_state(un, i, RCS_OKAY, 0);
3877 				commit++;
3878 			}
3879 			continue;
3880 		}
3881 		ASSERT(col == -1);
3882 		col = i;
3883 	}
3884 
3885 	if (col != -1) {
3886 		raid_set_state(un, col, RCS_ERRED, 0);
3887 		commit++;
3888 	}
3889 
3890 	if (commit)
3891 		raid_commit(un, NULL);
3892 
3893 	if (col != -1) {
3894 		if (COLUMN_STATE(un, col) & RCS_ERRED) {
3895 			SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED,
3896 			    SVM_TAG_METADEVICE, MD_UN2SET(un), MD_SID(un));
3897 		} else if (COLUMN_STATE(un, col) & RCS_LAST_ERRED) {
3898 			SE_NOTIFY(EC_SVM_STATE, ESC_SVM_LASTERRED,
3899 			    SVM_TAG_METADEVICE, MD_UN2SET(un), MD_SID(un));
3900 		}
3901 	}
3902 
3903 	return (0);
3904 }
3905 
3906 /*
3907  * NAMES:	raid_internal_open
3908  * DESCRIPTION: Do the actual RAID open
3909  * PARAMETERS:	minor_t mnum - minor number of the RAID device
3910  *		int flag -
3911  *		int otyp -
3912  *		int md_oflags - RAID open flags
3913  * RETURNS:	0 if successful, nonzero otherwise
3914  */
3915 int
3916 raid_internal_open(minor_t mnum, int flag, int otyp, int md_oflags)
3917 {
3918 	mr_unit_t	*un;
3919 	mdi_unit_t	*ui;
3920 	int		err = 0;
3921 	int		replay_error = 0;
3922 
3923 	ui = MDI_UNIT(mnum);
3924 	ASSERT(ui != NULL);
3925 
3926 	un = (mr_unit_t *)md_unit_openclose_enter(ui);
3927 	/*
3928 	 * this MUST be checked before md_unit_isopen is checked.
3929 	 * raid_init_columns sets md_unit_isopen to block reset, halt.
3930 	 */
3931 	if ((UNIT_STATE(un) & (RUS_INIT | RUS_DOI)) &&
3932 			!(md_oflags & MD_OFLG_ISINIT)) {
3933 		md_unit_openclose_exit(ui);
3934 		return (EAGAIN);
3935 	}
3936 
3937 	if ((md_oflags & MD_OFLG_ISINIT) || md_unit_isopen(ui)) {
3938 		err = md_unit_incopen(mnum, flag, otyp);
3939 		goto out;
3940 	}
3941 
3942 	md_unit_readerexit(ui);
3943 
3944 	un = (mr_unit_t *)md_unit_writerlock(ui);
3945 	if (raid_open_all_devs(un, md_oflags) == 0) {
3946 		if ((err = md_unit_incopen(mnum, flag, otyp)) != 0) {
3947 			md_unit_writerexit(ui);
3948 			un = (mr_unit_t *)md_unit_readerlock(ui);
3949 			raid_close_all_devs(un, 0, md_oflags);
3950 			goto out;
3951 		}
3952 	} else {
3953 		/*
3954 		 * if this unit contains more than two errored components
3955 		 * should return error and close all opened devices
3956 		 */
3957 
3958 		md_unit_writerexit(ui);
3959 		un = (mr_unit_t *)md_unit_readerlock(ui);
3960 		raid_close_all_devs(un, 0, md_oflags);
3961 		md_unit_openclose_exit(ui);
3962 		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_OPEN_FAIL, SVM_TAG_METADEVICE,
3963 		    MD_UN2SET(un), MD_SID(un));
3964 		return (ENXIO);
3965 	}
3966 
3967 	if (!(MD_STATUS(un) & MD_UN_REPLAYED)) {
3968 		replay_error = raid_replay(un);
3969 		MD_STATUS(un) |= MD_UN_REPLAYED;
3970 	}
3971 
3972 	md_unit_writerexit(ui);
3973 	un = (mr_unit_t *)md_unit_readerlock(ui);
3974 
3975 	if ((replay_error == RAID_RPLY_READONLY) &&
3976 	    ((flag & (FREAD | FWRITE)) == FREAD)) {
3977 		md_unit_openclose_exit(ui);
3978 		return (0);
3979 	}
3980 
3981 	/* allocate hotspare if possible */
3982 	(void) raid_hotspares();
3983 
3984 
3985 out:
3986 	md_unit_openclose_exit(ui);
3987 	return (err);
3988 }
3989 /*
3990  * NAMES:	raid_open
3991  * DESCRIPTION: RAID metadevice OPEN entry point
3992  * PARAMETERS:	dev_t dev -
3993  *		int flag -
3994  *		int otyp -
3995  *		cred_t * cred_p -
3996  *		int md_oflags -
3997  * RETURNS:
3998  */
3999 /*ARGSUSED1*/
4000 static int
4001 raid_open(dev_t *dev, int flag, int otyp, cred_t *cred_p, int md_oflags)
4002 {
4003 	int		error = 0;
4004 
4005 	if (error = raid_internal_open(getminor(*dev), flag, otyp, md_oflags)) {
4006 		return (error);
4007 	}
4008 	return (0);
4009 }
4010 
4011 /*
4012  * NAMES:	raid_internal_close
4013  * DESCRIPTION: RAID metadevice CLOSE actual implementation
4014  * PARAMETERS:	minor_t - minor number of the RAID device
4015  *		int otyp -
4016  *		int init_pw -
4017  *		int md_cflags - RAID close flags
4018  * RETURNS:	0 if successful, nonzero otherwise
4019  */
4020 /*ARGSUSED*/
4021 int
4022 raid_internal_close(minor_t mnum, int otyp, int init_pw, int md_cflags)
4023 {
4024 	mdi_unit_t	*ui = MDI_UNIT(mnum);
4025 	mr_unit_t	*un;
4026 	int		err = 0;
4027 
4028 	/* single thread */
4029 	un = (mr_unit_t *)md_unit_openclose_enter(ui);
4030 
4031 	/* count closed */
4032 	if ((err = md_unit_decopen(mnum, otyp)) != 0)
4033 		goto out;
4034 	/* close devices, if necessary */
4035 	if (! md_unit_isopen(ui) || (md_cflags & MD_OFLG_PROBEDEV)) {
4036 		raid_close_all_devs(un, init_pw, md_cflags);
4037 	}
4038 
4039 	/* unlock, return success */
4040 out:
4041 	md_unit_openclose_exit(ui);
4042 	return (err);
4043 }
4044 
4045 /*
4046  * NAMES:	raid_close
4047  * DESCRIPTION: RAID metadevice close entry point
4048  * PARAMETERS:	dev_t dev -
4049  *		int flag -
4050  *		int otyp -
4051  *		cred_t * cred_p -
4052  *		int md_oflags -
4053  * RETURNS:
4054  */
4055 /*ARGSUSED1*/
4056 static int
4057 raid_close(dev_t dev, int flag, int otyp, cred_t *cred_p, int md_cflags)
4058 {
4059 	int retval;
4060 
4061 	(void) md_io_writerlock(MDI_UNIT(getminor(dev)));
4062 	retval = raid_internal_close(getminor(dev), otyp, 1, md_cflags);
4063 	(void) md_io_writerexit(MDI_UNIT(getminor(dev)));
4064 	return (retval);
4065 }
4066 
4067 /*
4068  * raid_probe_close_all_devs
4069  */
4070 void
4071 raid_probe_close_all_devs(mr_unit_t *un)
4072 {
4073 	int		i;
4074 	mr_column_t	*device;
4075 
4076 	for (i = 0; i < un->un_totalcolumncnt; i++) {
4077 		device = &un->un_column[i];
4078 
4079 		if (device->un_devflags & MD_RAID_DEV_PROBEOPEN) {
4080 			md_layered_close(device->un_dev,
4081 				MD_OFLG_PROBEDEV);
4082 			device->un_devflags &= ~MD_RAID_DEV_PROBEOPEN;
4083 		}
4084 	}
4085 }
4086 /*
4087  * Raid_probe_dev:
4088  *
4089  * On entry the unit writerlock is held
4090  */
4091 static int
4092 raid_probe_dev(mdi_unit_t *ui, minor_t mnum)
4093 {
4094 	mr_unit_t	*un;
4095 	int		i;
4096 	int		not_opened = 0;
4097 	int		commit = 0;
4098 	int		col = -1;
4099 	mr_column_t	*device;
4100 	int		md_devopen = 0;
4101 
4102 	if (md_unit_isopen(ui))
4103 		md_devopen++;
4104 
4105 	un = MD_UNIT(mnum);
4106 	/*
4107 	 * If the state has been set to LAST_ERRED because
4108 	 * of an error when the raid device was open at some
4109 	 * point in the past, don't probe. We really don't want
4110 	 * to reset the state in this case.
4111 	 */
4112 	if (UNIT_STATE(un) == RUS_LAST_ERRED)
4113 		return (0);
4114 
4115 	ui->ui_tstate &= ~MD_INACCESSIBLE;
4116 
4117 	for (i = 0; i < un->un_totalcolumncnt; i++) {
4118 		md_dev64_t tmpdev;
4119 
4120 		device = &un->un_column[i];
4121 		if (COLUMN_STATE(un, i) & RCS_ERRED) {
4122 			not_opened++;
4123 			continue;
4124 		}
4125 
4126 		tmpdev = device->un_dev;
4127 		/*
4128 		 * Currently the flags passed are not needed since
4129 		 * there cannot be an underlying metadevice. However
4130 		 * they are kept here for consistency.
4131 		 *
4132 		 * Open by device id
4133 		 */
4134 		tmpdev = md_resolve_bydevid(mnum, tmpdev, HOTSPARED(un, i)?
4135 			device->un_hs_key : device->un_orig_key);
4136 		if (md_layered_open(mnum, &tmpdev,
4137 				MD_OFLG_CONT_ERRS | MD_OFLG_PROBEDEV)) {
4138 			device->un_dev = tmpdev;
4139 			not_opened++;
4140 			continue;
4141 		}
4142 		device->un_dev = tmpdev;
4143 
4144 		device->un_devflags |= MD_RAID_DEV_PROBEOPEN;
4145 	}
4146 
4147 	/*
4148 	 * The code below is careful on setting the LAST_ERRED state.
4149 	 *
4150 	 * If open errors and exactly one device has failed we can run.
4151 	 * If more then one device fails we have to figure out when to set
4152 	 * LAST_ERRED state.  The rationale is to avoid unnecessary resyncs
4153 	 * since they are painful and time consuming.
4154 	 *
4155 	 * When more than one component/column fails there are 2 scenerios.
4156 	 *
4157 	 * 1. Metadevice has NOT been opened: In this case, the behavior
4158 	 *    mimics the open symantics. ie. Only the first failed device
4159 	 *    is ERRED and LAST_ERRED is not set.
4160 	 *
4161 	 * 2. Metadevice has been opened: Here the read/write sematics are
4162 	 *    followed. The first failed devicce is ERRED and on the next
4163 	 *    failed device LAST_ERRED is set.
4164 	 */
4165 
4166 	if (not_opened > 1 && !md_devopen) {
4167 		cmn_err(CE_WARN,
4168 			"md: %s failed to open. open error on %s\n",
4169 				md_shortname(MD_SID(un)),
4170 				md_devname(MD_UN2SET(un), device->un_orig_dev,
4171 						NULL, 0));
4172 		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_OPEN_FAIL, SVM_TAG_METADEVICE,
4173 		    MD_UN2SET(un), MD_SID(un));
4174 		raid_probe_close_all_devs(un);
4175 		ui->ui_tstate |= MD_INACCESSIBLE;
4176 		return (not_opened > 1);
4177 	}
4178 
4179 	if (!md_devopen) {
4180 		for (i = 0; i < un->un_totalcolumncnt; i++) {
4181 			device = &un->un_column[i];
4182 			if (device->un_devflags & MD_RAID_DEV_PROBEOPEN) {
4183 				if (device->un_devstate & RCS_LAST_ERRED) {
4184 					/*
4185 					 * At this point in time there is a
4186 					 * possibility that errors were the
4187 					 * result of a controller failure with
4188 					 * more than a single column on it so
4189 					 * clear out last errored columns and
4190 					 * let errors re-occur is necessary.
4191 					 */
4192 					raid_set_state(un, i, RCS_OKAY, 0);
4193 					commit++;
4194 					}
4195 				continue;
4196 			}
4197 			ASSERT(col == -1);
4198 			/*
4199 			 * note if multiple devices are failing then only
4200 			 * the last one is marked as error
4201 			 */
4202 			col = i;
4203 		}
4204 
4205 		if (col != -1) {
4206 			raid_set_state(un, col, RCS_ERRED, 0);
4207 			commit++;
4208 		}
4209 
4210 	} else {
4211 		for (i = 0; i < un->un_totalcolumncnt; i++) {
4212 			device = &un->un_column[i];
4213 
4214 			/* if we have LAST_ERRED go ahead and commit. */
4215 			if (un->un_state & RUS_LAST_ERRED)
4216 				break;
4217 			/*
4218 			 * could not open the component
4219 			 */
4220 
4221 			if (!(device->un_devflags & MD_RAID_DEV_PROBEOPEN)) {
4222 				col = i;
4223 				raid_set_state(un, col, RCS_ERRED, 0);
4224 				commit++;
4225 			}
4226 		}
4227 	}
4228 
4229 	if (commit)
4230 		raid_commit(un, NULL);
4231 
4232 	if (col != -1) {
4233 		if (COLUMN_STATE(un, col) & RCS_ERRED) {
4234 			SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED,
4235 			    SVM_TAG_METADEVICE, MD_UN2SET(un), MD_SID(un));
4236 		} else if (COLUMN_STATE(un, col) & RCS_LAST_ERRED) {
4237 			SE_NOTIFY(EC_SVM_STATE, ESC_SVM_LASTERRED,
4238 			    SVM_TAG_METADEVICE, MD_UN2SET(un), MD_SID(un));
4239 		}
4240 	}
4241 
4242 	raid_probe_close_all_devs(un);
4243 	return (0);
4244 }
4245 
4246 static int
4247 raid_imp_set(
4248 	set_t	setno
4249 )
4250 {
4251 	mddb_recid_t    recid;
4252 	int		i, gotsomething;
4253 	mddb_type_t	typ1;
4254 	mddb_de_ic_t	*dep;
4255 	mddb_rb32_t	*rbp;
4256 	mr_unit_t	*un64;
4257 	mr_unit32_od_t	*un32;
4258 	minor_t		*self_id;	/* minor needs to be updated */
4259 	md_parent_t	*parent_id;	/* parent needs to be updated */
4260 	mddb_recid_t	*record_id;	 /* record id needs to be updated */
4261 	hsp_t		*hsp_id;
4262 
4263 	gotsomething = 0;
4264 
4265 	typ1 = (mddb_type_t)md_getshared_key(setno,
4266 	    raid_md_ops.md_driver.md_drivername);
4267 	recid = mddb_makerecid(setno, 0);
4268 
4269 	while ((recid = mddb_getnextrec(recid, typ1, 0)) > 0) {
4270 		if (mddb_getrecprivate(recid) & MD_PRV_GOTIT)
4271 			continue;
4272 
4273 		dep = mddb_getrecdep(recid);
4274 		rbp = dep->de_rb;
4275 
4276 		if (rbp->rb_revision == MDDB_REV_RB) {
4277 			/*
4278 			 * Small device
4279 			 */
4280 			un32 = (mr_unit32_od_t *)mddb_getrecaddr(recid);
4281 			self_id = &(un32->c.un_self_id);
4282 			parent_id = &(un32->c.un_parent);
4283 			record_id = &(un32->c.un_record_id);
4284 			hsp_id = &(un32->un_hsp_id);
4285 
4286 			for (i = 0; i < un32->un_totalcolumncnt; i++) {
4287 			    mr_column32_od_t *device;
4288 
4289 			    device = &un32->un_column[i];
4290 			    if (!md_update_minor(setno, mddb_getsidenum
4291 				(setno), device->un_orig_key))
4292 				goto out;
4293 
4294 			    if (device->un_hs_id != 0)
4295 				device->un_hs_id = MAKERECID(
4296 				setno, device->un_hs_id);
4297 			}
4298 		} else {
4299 			un64 = (mr_unit_t *)mddb_getrecaddr(recid);
4300 			self_id = &(un64->c.un_self_id);
4301 			parent_id = &(un64->c.un_parent);
4302 			record_id = &(un64->c.un_record_id);
4303 			hsp_id = &(un64->un_hsp_id);
4304 
4305 			for (i = 0; i < un64->un_totalcolumncnt; i++) {
4306 			    mr_column_t	*device;
4307 
4308 			    device = &un64->un_column[i];
4309 			    if (!md_update_minor(setno, mddb_getsidenum
4310 				(setno), device->un_orig_key))
4311 				goto out;
4312 
4313 			    if (device->un_hs_id != 0)
4314 				device->un_hs_id = MAKERECID(
4315 				setno, device->un_hs_id);
4316 			}
4317 		}
4318 
4319 		/*
4320 		 * Update unit with the imported setno
4321 		 */
4322 		mddb_setrecprivate(recid, MD_PRV_GOTIT);
4323 
4324 		*self_id = MD_MKMIN(setno, MD_MIN2UNIT(*self_id));
4325 
4326 		if (*hsp_id != -1)
4327 			*hsp_id = MAKERECID(setno, DBID(*hsp_id));
4328 
4329 		if (*parent_id != MD_NO_PARENT)
4330 			*parent_id = MD_MKMIN(setno, MD_MIN2UNIT(*parent_id));
4331 		*record_id = MAKERECID(setno, DBID(*record_id));
4332 		gotsomething = 1;
4333 	}
4334 
4335 out:
4336 	return (gotsomething);
4337 }
4338 
4339 static md_named_services_t raid_named_services[] = {
4340 	{raid_hotspares,			"poke hotspares"	},
4341 	{raid_rename_check,			MDRNM_CHECK		},
4342 	{raid_rename_lock,			MDRNM_LOCK		},
4343 	{(intptr_t (*)()) raid_rename_unlock,	MDRNM_UNLOCK		},
4344 	{(intptr_t (*)()) raid_probe_dev,	"probe open test"	},
4345 	{NULL,					0			}
4346 };
4347 
4348 md_ops_t raid_md_ops = {
4349 	raid_open,		/* open */
4350 	raid_close,		/* close */
4351 	md_raid_strategy,	/* strategy */
4352 	NULL,			/* print */
4353 	NULL,			/* dump */
4354 	NULL,			/* read */
4355 	NULL,			/* write */
4356 	md_raid_ioctl,		/* ioctl, */
4357 	raid_snarf,		/* raid_snarf */
4358 	raid_halt,		/* raid_halt */
4359 	NULL,			/* aread */
4360 	NULL,			/* awrite */
4361 	raid_imp_set,		/* import set */
4362 	raid_named_services
4363 };
4364 
4365 static void
4366 init_init()
4367 {
4368 	/* default to a second */
4369 	if (md_wr_wait == 0)
4370 		md_wr_wait = md_hz >> 1;
4371 
4372 	raid_parent_cache = kmem_cache_create("md_raid_parent",
4373 	    sizeof (md_raidps_t), 0, raid_parent_constructor,
4374 	    raid_parent_destructor, raid_run_queue, NULL, NULL, 0);
4375 	raid_child_cache = kmem_cache_create("md_raid_child",
4376 	    sizeof (md_raidcs_t) - sizeof (buf_t) + biosize(), 0,
4377 	    raid_child_constructor, raid_child_destructor,
4378 	    raid_run_queue, NULL, NULL, 0);
4379 	raid_cbuf_cache = kmem_cache_create("md_raid_cbufs",
4380 	    sizeof (md_raidcbuf_t), 0, raid_cbuf_constructor,
4381 	    raid_cbuf_destructor, raid_run_queue, NULL, NULL, 0);
4382 }
4383 
4384 static void
4385 fini_uninit()
4386 {
4387 	kmem_cache_destroy(raid_parent_cache);
4388 	kmem_cache_destroy(raid_child_cache);
4389 	kmem_cache_destroy(raid_cbuf_cache);
4390 	raid_parent_cache = raid_child_cache = raid_cbuf_cache = NULL;
4391 }
4392 
4393 /* define the module linkage */
4394 MD_PLUGIN_MISC_MODULE("raid module %I%", init_init(), fini_uninit())
4395