xref: /titanic_41/usr/src/uts/common/io/lvm/stripe/stripe.c (revision afd1ac7b1c9a8cdf273c865aa5e9a14620341443)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <sys/param.h>
30 #include <sys/systm.h>
31 #include <sys/conf.h>
32 #include <sys/file.h>
33 #include <sys/user.h>
34 #include <sys/uio.h>
35 #include <sys/t_lock.h>
36 #include <sys/buf.h>
37 #include <sys/dkio.h>
38 #include <sys/vtoc.h>
39 #include <sys/kmem.h>
40 #include <vm/page.h>
41 #include <sys/cmn_err.h>
42 #include <sys/sysmacros.h>
43 #include <sys/types.h>
44 #include <sys/mkdev.h>
45 #include <sys/stat.h>
46 #include <sys/open.h>
47 #include <sys/lvm/mdio.h>
48 #include <sys/lvm/mdvar.h>
49 #include <sys/lvm/md_stripe.h>
50 #include <sys/lvm/md_convert.h>
51 #include <sys/lvm/md_notify.h>
52 #include <sys/modctl.h>
53 #include <sys/ddi.h>
54 #include <sys/sunddi.h>
55 #include <sys/debug.h>
56 #include <sys/sysevent/eventdefs.h>
57 #include <sys/sysevent/svm.h>
58 
59 md_ops_t		stripe_md_ops;
60 #ifndef	lint
61 static char		_depends_on[] = "drv/md";
62 static md_ops_t		*md_interface_ops = &stripe_md_ops;
63 #endif
64 
65 extern unit_t		md_nunits;
66 extern set_t		md_nsets;
67 extern md_set_t		md_set[];
68 
69 extern kmutex_t		md_mx;
70 extern kcondvar_t	md_cv;
71 
72 extern int		md_status;
73 extern major_t		md_major;
74 extern mdq_anchor_t	md_done_daemon;
75 
76 static int		md_stripe_mcs_buf_off;
77 static kmem_cache_t	*stripe_parent_cache = NULL;
78 static kmem_cache_t	*stripe_child_cache = NULL;
79 
80 /*ARGSUSED1*/
81 static int
82 stripe_parent_constructor(void *p, void *d1, int d2)
83 {
84 	mutex_init(&((md_sps_t *)p)->ps_mx,
85 	    NULL, MUTEX_DEFAULT, NULL);
86 	return (0);
87 }
88 
89 static void
90 stripe_parent_init(void *ps)
91 {
92 	bzero(ps, offsetof(md_sps_t, ps_mx));
93 }
94 
95 /*ARGSUSED1*/
96 static void
97 stripe_parent_destructor(void *p, void *d)
98 {
99 	mutex_destroy(&((md_sps_t *)p)->ps_mx);
100 }
101 
102 /*ARGSUSED1*/
103 static int
104 stripe_child_constructor(void *p, void *d1, int d2)
105 {
106 	bioinit(&((md_scs_t *)p)->cs_buf);
107 	return (0);
108 }
109 
110 static void
111 stripe_child_init(md_scs_t *cs)
112 {
113 	cs->cs_mdunit = 0;
114 	cs->cs_ps = NULL;
115 	cs->cs_comp = NULL;
116 	md_bioreset(&cs->cs_buf);
117 }
118 
119 /*ARGSUSED1*/
120 static void
121 stripe_child_destructor(void *p, void *d)
122 {
123 	biofini(&((md_scs_t *)p)->cs_buf);
124 }
125 
126 /*ARGSUSED*/
127 static void
128 stripe_run_queue(void *d)
129 {
130 	if (!(md_status & MD_GBL_DAEMONS_LIVE))
131 		md_daemon(1, &md_done_daemon);
132 }
133 
134 static void
135 stripe_close_all_devs(ms_unit_t *un, int md_cflags)
136 {
137 	int		row;
138 	int		i;
139 	int		c;
140 	struct ms_comp	*mdcomp;
141 
142 	mdcomp = (struct ms_comp *)((void *)&((char *)un)[un->un_ocomp]);
143 	for (row = 0; row < un->un_nrows; row++) {
144 		struct ms_row *mdr = &un->un_row[row];
145 		for (i = 0, c = mdr->un_icomp; i < mdr->un_ncomp; i++) {
146 			struct ms_comp	*mdc;
147 			mdc = &mdcomp[c++];
148 			if (md_cflags & MD_OFLG_PROBEDEV) {
149 
150 			/*
151 			 * It is possible that the md_layered_open
152 			 * failed because the stripe unit structure
153 			 * contained a NODEV.  In such a case since
154 			 * there is nothing to open, there is nothing
155 			 * to close.
156 			 */
157 				if (mdc->un_dev == NODEV64)
158 					continue;
159 			}
160 			if ((md_cflags & MD_OFLG_PROBEDEV) &&
161 			    (mdc->un_mirror.ms_flags & MDM_S_PROBEOPEN)) {
162 				md_layered_close(mdc->un_dev,
163 				    md_cflags);
164 				mdc->un_mirror.ms_flags &=
165 						~MDM_S_PROBEOPEN;
166 			} else if (mdc->un_mirror.ms_flags & MDM_S_ISOPEN) {
167 				md_layered_close(mdc->un_dev, md_cflags);
168 				mdc->un_mirror.ms_flags &= ~MDM_S_ISOPEN;
169 			}
170 		}
171 	}
172 }
173 
174 static int
175 stripe_open_all_devs(ms_unit_t *un, int md_oflags)
176 {
177 	minor_t		mnum = MD_SID(un);
178 	int		row;
179 	int		i;
180 	int		c;
181 	struct ms_comp	*mdcomp;
182 	int		err;
183 	int		cont_on_errors = (md_oflags & MD_OFLG_CONT_ERRS);
184 	int		probe_err_cnt = 0;
185 	int		total_comp_cnt = 0;
186 	set_t		setno = MD_MIN2SET(MD_SID(un));
187 	side_t		side = mddb_getsidenum(setno);
188 	mdkey_t		key;
189 
190 	mdcomp = (struct ms_comp *)((void *)&((char *)un)[un->un_ocomp]);
191 
192 	/*
193 	 * For a probe call, if any component of a stripe or a concat
194 	 * can be opened, it is considered to be a success. The total number
195 	 * of components in a stripe are computed prior to starting a probe.
196 	 * This number is then compared against the number of components
197 	 * that could be be successfully opened. If none of the components
198 	 * in a stripe can be opened, only then an ENXIO is returned for a
199 	 * probe type open.
200 	 */
201 
202 	for (row = 0; row < un->un_nrows; row++) {
203 		struct ms_row *mdr = &un->un_row[row];
204 
205 		if (md_oflags & MD_OFLG_PROBEDEV)
206 			total_comp_cnt += mdr->un_ncomp;
207 
208 		for (i = 0, c = mdr->un_icomp; i < mdr->un_ncomp; i++) {
209 			struct ms_comp	*mdc;
210 			md_dev64_t tmpdev;
211 
212 			mdc = &mdcomp[c++];
213 			tmpdev = mdc->un_dev;
214 			/*
215 			 * Do the open by device id
216 			 * Check if this comp is hotspared and
217 			 * if it is then use the key for hotspare.
218 			 * MN disksets don't use devids, so we better don't use
219 			 * md_devid_found/md_resolve_bydevid there. Rather do,
220 			 * what's done in stripe_build_incore()
221 			 */
222 			if (MD_MNSET_SETNO(setno)) {
223 				if (mdc->un_mirror.ms_hs_id != 0) {
224 					(void) md_hot_spare_ifc(HS_MKDEV, 0, 0,
225 					    0, &mdc->un_mirror.ms_hs_id, NULL,
226 					    &tmpdev, NULL);
227 				}
228 			} else {
229 				key = mdc->un_mirror.ms_hs_id ?
230 				    mdc->un_mirror.ms_hs_key : mdc->un_key;
231 				if ((md_getmajor(tmpdev) != md_major) &&
232 				    md_devid_found(setno, side, key) == 1) {
233 					tmpdev = md_resolve_bydevid(mnum,
234 					    tmpdev, key);
235 				}
236 			}
237 
238 			/*
239 			 * For a submirror, we only want to open those devices
240 			 * that are not errored. If the device is errored then
241 			 * then there is no reason to open it and leaving it
242 			 * closed allows the RCM/DR code to work so that the
243 			 * errored device can be replaced.
244 			 */
245 			if ((md_oflags & MD_OFLG_PROBEDEV) ||
246 			    ! (mdc->un_mirror.ms_state & CS_ERRED)) {
247 
248 				err = md_layered_open(mnum, &tmpdev, md_oflags);
249 			} else {
250 				err = ENXIO;
251 			}
252 
253 			/*
254 			 * Only set the un_dev if the tmpdev != NODEV64. If
255 			 * it is NODEV64 then the md_layered_open() will have
256 			 * failed in some manner.
257 			 */
258 			if (tmpdev != NODEV64)
259 				mdc->un_dev = tmpdev;
260 
261 			if (err) {
262 				if (!cont_on_errors) {
263 					stripe_close_all_devs(un, md_oflags);
264 					return (ENXIO);
265 				}
266 
267 				if (md_oflags & MD_OFLG_PROBEDEV)
268 					probe_err_cnt++;
269 			} else {
270 				if (md_oflags & MD_OFLG_PROBEDEV) {
271 					mdc->un_mirror.ms_flags |=
272 						MDM_S_PROBEOPEN;
273 				} else
274 					mdc->un_mirror.ms_flags |= MDM_S_ISOPEN;
275 			}
276 		}
277 	}
278 
279 	/* If every component in a stripe could not be opened fail */
280 	if ((md_oflags & MD_OFLG_PROBEDEV) &&
281 	    (probe_err_cnt == total_comp_cnt))
282 		return (ENXIO);
283 	else
284 		return (0);
285 }
286 
287 int
288 stripe_build_incore(void *p, int snarfing)
289 {
290 	ms_unit_t *un = (ms_unit_t *)p;
291 	struct ms_comp	*mdcomp;
292 	minor_t		mnum;
293 	int		row;
294 	int		i;
295 	int		c;
296 	int		ncomps;
297 
298 	mnum = MD_SID(un);
299 
300 	if (MD_UNIT(mnum) != NULL)
301 		return (0);
302 
303 	MD_STATUS(un) = 0;
304 
305 	/*
306 	 * Reset all the is_open flags, these are probably set
307 	 * cause they just came out of the database.
308 	 */
309 	mdcomp = (struct ms_comp *)((void *)&((char *)un)[un->un_ocomp]);
310 
311 	ncomps = 0;
312 	for (row = 0; row < un->un_nrows; row++) {
313 		struct ms_row *mdr = &un->un_row[row];
314 		ncomps += mdr->un_ncomp;
315 	}
316 
317 	for (row = 0; row < un->un_nrows; row++) {
318 		struct ms_row *mdr = &un->un_row[row];
319 		for (i = 0, c = mdr->un_icomp; i < mdr->un_ncomp; i++) {
320 			struct ms_comp		*mdc;
321 			set_t			setno;
322 			md_dev64_t		tmpdev;
323 
324 			mdc = &mdcomp[c++];
325 			mdc->un_mirror.ms_flags &=
326 			    ~(MDM_S_ISOPEN | MDM_S_IOERR | MDM_S_RS_TRIED);
327 
328 			if (!snarfing)
329 				continue;
330 
331 			setno = MD_MIN2SET(mnum);
332 
333 			tmpdev = md_getdevnum(setno, mddb_getsidenum(setno),
334 			    mdc->un_key, MD_NOTRUST_DEVT);
335 			mdc->un_dev = tmpdev;
336 			/*
337 			 * Check for hotspares. If the hotspares haven't been
338 			 * snarfed yet, stripe_open_all_devs() will do the
339 			 * remapping of the dev's later.
340 			 */
341 			if (mdc->un_mirror.ms_hs_id != 0) {
342 				mdc->un_mirror.ms_orig_dev = mdc->un_dev;
343 				(void) md_hot_spare_ifc(HS_MKDEV, 0, 0,
344 				    0, &mdc->un_mirror.ms_hs_id, NULL,
345 				    &tmpdev, NULL);
346 				mdc->un_dev = tmpdev;
347 			}
348 		}
349 	}
350 
351 	MD_UNIT(mnum) = un;
352 	return (0);
353 }
354 
355 void
356 reset_stripe(ms_unit_t *un, minor_t mnum, int removing)
357 {
358 	ms_comp_t	*mdcomp;
359 	struct ms_row	*mdr;
360 	int		i, c;
361 	int		row;
362 	int		nsv;
363 	int		isv;
364 	sv_dev_t	*sv;
365 	mddb_recid_t	*recids;
366 	mddb_recid_t	vtoc_id;
367 	int		rid = 0;
368 
369 	md_destroy_unit_incore(mnum, &stripe_md_ops);
370 
371 	MD_UNIT(mnum) = NULL;
372 
373 	if (!removing)
374 		return;
375 
376 	nsv = 0;
377 	/* Count the number of devices */
378 	for (row = 0; row < un->un_nrows; row++) {
379 		mdr = &un->un_row[row];
380 		nsv += mdr->un_ncomp;
381 	}
382 	sv = (sv_dev_t *)kmem_alloc(sizeof (sv_dev_t) * nsv, KM_SLEEP);
383 
384 	/*
385 	 * allocate recids array.  since we may have to commit
386 	 * underlying soft partition records, we need an array
387 	 * of size: total number of components in stripe + 3
388 	 * (one for the stripe itself, one for the hotspare, one
389 	 * for the end marker).
390 	 */
391 	recids = kmem_alloc(sizeof (mddb_recid_t) * (nsv + 3), KM_SLEEP);
392 
393 	/*
394 	 * Save the md_dev64_t's and driver nm indexes.
395 	 * Because after the mddb_deleterec() we will
396 	 * not be able to access the unit structure.
397 	 *
398 	 * NOTE: Deleting the names before deleting the
399 	 *	 unit structure would cause problems if
400 	 *	 the machine crashed in between the two.
401 	 */
402 	isv = 0;
403 	mdcomp = (struct ms_comp *)((void *)&((char *)un)[un->un_ocomp]);
404 
405 	for (row = 0; row < un->un_nrows; row++) {
406 		mdr = &un->un_row[row];
407 		for (i = 0, c = mdr->un_icomp; i < mdr->un_ncomp; i++) {
408 			struct ms_comp	*mdc;
409 			md_dev64_t	child_dev;
410 			md_unit_t	*child_un;
411 
412 			mdc = &mdcomp[c++];
413 			if (mdc->un_mirror.ms_hs_id != 0) {
414 				mdkey_t		hs_key;
415 
416 				hs_key = mdc->un_mirror.ms_hs_key;
417 
418 				mdc->un_dev = mdc->un_mirror.ms_orig_dev;
419 				mdc->un_start_block =
420 				    mdc->un_mirror.ms_orig_blk;
421 				mdc->un_mirror.ms_hs_id = 0;
422 				mdc->un_mirror.ms_hs_key = 0;
423 				mdc->un_mirror.ms_orig_dev = 0;
424 				recids[0] = 0;
425 				recids[1] = 0;	/* recids[1] filled in below */
426 				recids[2] = 0;
427 				(void) md_hot_spare_ifc(HS_FREE, un->un_hsp_id,
428 				    0, 0, &recids[0], &hs_key, NULL, NULL);
429 				mddb_commitrecs_wrapper(recids);
430 			}
431 
432 			/*
433 			 * check if we've got metadevice below us and
434 			 * deparent it if we do.
435 			 * NOTE: currently soft partitions are the
436 			 * the only metadevices stripes can be
437 			 * built on top of.
438 			 */
439 			child_dev = mdc->un_dev;
440 			if (md_getmajor(child_dev) == md_major) {
441 				child_un = MD_UNIT(md_getminor(child_dev));
442 				md_reset_parent(child_dev);
443 				recids[rid++] = MD_RECID(child_un);
444 			}
445 
446 			sv[isv].setno = MD_MIN2SET(mnum);
447 			sv[isv++].key = mdc->un_key;
448 		}
449 	}
450 
451 	recids[rid++] = un->c.un_record_id;
452 	recids[rid] = 0;	/* filled in below */
453 
454 	/*
455 	 * Decrement the HSP reference count and
456 	 * remove the knowledge of the HSP from the unit struct.
457 	 * This is done atomically to remove a window.
458 	 */
459 	if (un->un_hsp_id != -1) {
460 		(void) md_hot_spare_ifc(HSP_DECREF, un->un_hsp_id, 0, 0,
461 		    &recids[rid++], NULL, NULL, NULL);
462 		un->un_hsp_id = -1;
463 	}
464 
465 	/* set end marker and commit records */
466 	recids[rid] = 0;
467 	mddb_commitrecs_wrapper(recids);
468 
469 	vtoc_id = un->c.un_vtoc_id;
470 
471 	/* Remove the unit structure */
472 	mddb_deleterec_wrapper(un->c.un_record_id);
473 
474 	/* Remove the vtoc, if present */
475 	if (vtoc_id)
476 		mddb_deleterec_wrapper(vtoc_id);
477 
478 	SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_DELETE, SVM_TAG_METADEVICE,
479 	    MD_MIN2SET(mnum), MD_MIN2UNIT(mnum));
480 	md_rem_names(sv, nsv);
481 	kmem_free(sv, sizeof (sv_dev_t) * nsv);
482 	kmem_free(recids, sizeof (mddb_recid_t) * (nsv + 3));
483 }
484 
485 static void
486 stripe_error(md_sps_t *ps)
487 {
488 	struct buf	*pb = ps->ps_bp;
489 	mdi_unit_t	*ui = ps->ps_ui;
490 	md_dev64_t	dev = ps->ps_errcomp->un_dev;
491 	md_dev64_t	md_dev = md_expldev(pb->b_edev);
492 	char		*str;
493 
494 	if (pb->b_flags & B_READ) {
495 		ps->ps_errcomp->un_mirror.ms_flags |= MDM_S_READERR;
496 		str = "read";
497 	} else {
498 		ps->ps_errcomp->un_mirror.ms_flags |= MDM_S_WRTERR;
499 		str = "write";
500 	}
501 	if (!(ps->ps_flags & MD_SPS_DONTFREE)) {
502 		if (MUTEX_HELD(&ps->ps_mx)) {
503 			mutex_exit(&ps->ps_mx);
504 		}
505 	} else {
506 		ASSERT(panicstr);
507 	}
508 	SPS_FREE(stripe_parent_cache, ps);
509 	pb->b_flags |= B_ERROR;
510 
511 	md_kstat_done(ui, pb, 0);
512 	md_unit_readerexit(ui);
513 	md_biodone(pb);
514 
515 	cmn_err(CE_WARN, "md: %s: %s error on %s",
516 	    md_shortname(md_getminor(md_dev)), str,
517 	    md_devname(MD_DEV2SET(md_dev), dev, NULL, 0));
518 }
519 
520 static int
521 stripe_done(struct buf *cb)
522 {
523 	struct buf	*pb;
524 	mdi_unit_t	*ui;
525 	md_sps_t	*ps;
526 	md_scs_t	*cs;
527 
528 	/*LINTED*/
529 	cs = (md_scs_t *)((caddr_t)cb - md_stripe_mcs_buf_off);
530 	ps = cs->cs_ps;
531 	pb = ps->ps_bp;
532 
533 	mutex_enter(&ps->ps_mx);
534 	if (cb->b_flags & B_ERROR) {
535 		ps->ps_flags |= MD_SPS_ERROR;
536 		pb->b_error = cb->b_error;
537 		ps->ps_errcomp = cs->cs_comp;
538 	}
539 
540 	if (cb->b_flags & B_REMAPPED)
541 		bp_mapout(cb);
542 
543 	ps->ps_frags--;
544 	if (ps->ps_frags != 0) {
545 		mutex_exit(&ps->ps_mx);
546 		kmem_cache_free(stripe_child_cache, cs);
547 		return (1);
548 	}
549 	kmem_cache_free(stripe_child_cache, cs);
550 	if (ps->ps_flags & MD_SPS_ERROR) {
551 		stripe_error(ps);
552 		return (1);
553 	}
554 	ui = ps->ps_ui;
555 	if (!(ps->ps_flags & MD_SPS_DONTFREE)) {
556 		mutex_exit(&ps->ps_mx);
557 	} else {
558 		ASSERT(panicstr);
559 	}
560 	SPS_FREE(stripe_parent_cache, ps);
561 	md_kstat_done(ui, pb, 0);
562 	md_unit_readerexit(ui);
563 	md_biodone(pb);
564 	return (0);
565 }
566 
567 
568 /*
569  * This routine does the mapping from virtual (dev, blkno) of a metapartition
570  * to the real (dev, blkno) of a real disk partition.
571  * It goes to the md_conf[] table to find out the correct real partition
572  * dev and block number for this buffer.
573  *
574  * A single buf request can not go across real disk partition boundary.
575  * When the virtual request specified by (dev, blkno) spans more than one
576  * real partition, md_mapbuf will return 1. Then the caller should prepare
577  * another real buf and continue calling md_mapbuf to do the mapping until
578  * it returns 0.
579  *
580  */
581 
582 static int
583 md_mapbuf(
584 	ms_unit_t	*un,
585 	diskaddr_t	blkno,
586 	u_longlong_t	bcount,
587 	buf_t		*bp,	/* if bp==NULL, skip bp updates */
588 	ms_comp_t	**mdc)	/* if bp==NULL, skip mdc update */
589 {
590 	struct ms_row	*mdr;
591 	struct ms_comp	*mdcomp;
592 	diskaddr_t	stripe_blk;
593 	diskaddr_t	fragment, blk_in_row, endblk;
594 	offset_t	interlace;
595 	size_t		dev_index;
596 	int		row_index, more;
597 	extern unsigned md_maxphys;
598 	/* Work var's when bp==NULL */
599 	u_longlong_t	wb_bcount;
600 	diskaddr_t	wb_blkno;
601 	md_dev64_t	wb_edev;
602 	ms_comp_t	*wmdc;
603 
604 	/*
605 	 * Do a real calculation to derive the minor device of the
606 	 * Virtual Disk, which in turn will let us derive the
607 	 * device/minor of the underlying real device.
608 	 */
609 
610 
611 	for (row_index = 0; row_index < un->un_nrows; row_index++) {
612 		mdr = &un->un_row[row_index];
613 		if (blkno < mdr->un_cum_blocks)
614 			break;
615 	}
616 	ASSERT(row_index != un->un_nrows);
617 
618 	mdcomp = (struct ms_comp *)((void *)&((char *)un)[un->un_ocomp]);
619 
620 	blk_in_row = blkno - mdr->un_cum_blocks + mdr->un_blocks;
621 	endblk = (diskaddr_t)(blkno + howmany(bcount, DEV_BSIZE));
622 	if (mdr->un_ncomp == 1) { /* No striping */
623 		if (endblk > mdr->un_cum_blocks) {
624 			wb_bcount = ldbtob(mdr->un_cum_blocks - blkno);
625 			if ((row_index + 1) == un->un_nrows)
626 				more = 0;
627 			else
628 				more = 1;
629 		} else {
630 			wb_bcount = bcount;
631 			more = 0;
632 		}
633 		wmdc = &mdcomp[mdr->un_icomp];
634 		wb_blkno = blk_in_row;
635 	} else { /* Have striping */
636 		interlace = mdr->un_interlace;
637 		fragment = blk_in_row % interlace;
638 		if (bcount > ldbtob(interlace - fragment)) {
639 			more = 1;
640 			wb_bcount = ldbtob(interlace - fragment);
641 		} else {
642 			more = 0;
643 			wb_bcount = bcount;
644 		}
645 
646 		stripe_blk = blk_in_row / interlace;
647 		dev_index = (size_t)(stripe_blk % mdr->un_ncomp);
648 		wmdc = &mdcomp[mdr->un_icomp + dev_index];
649 		wb_blkno = (diskaddr_t)(((stripe_blk / mdr->un_ncomp)
650 			* interlace) + fragment);
651 	}
652 
653 	wb_blkno += wmdc->un_start_block;
654 	wb_edev = wmdc->un_dev;
655 
656 	/* only break up the I/O if we're not built on another metadevice */
657 	if ((md_getmajor(wb_edev) != md_major) && (wb_bcount > md_maxphys)) {
658 		wb_bcount = md_maxphys;
659 		more = 1;
660 	}
661 	if (bp != (buf_t *)NULL) {
662 		/*
663 		 * wb_bcount is limited by md_maxphys which is 'int'
664 		 */
665 		bp->b_bcount = (size_t)wb_bcount;
666 		bp->b_lblkno = wb_blkno;
667 		bp->b_edev = md_dev64_to_dev(wb_edev);
668 		*mdc = wmdc;
669 	}
670 	return (more);
671 }
672 
673 static void
674 md_stripe_strategy(buf_t *pb, int flag, void *private)
675 {
676 	md_sps_t	*ps;
677 	md_scs_t	*cs;
678 	int		doing_writes;
679 	int		more;
680 	ms_unit_t	*un;
681 	mdi_unit_t	*ui;
682 	size_t		current_count;
683 	diskaddr_t	current_blkno;
684 	off_t		current_offset;
685 	buf_t		*cb;		/* child buf pointer */
686 	set_t		setno;
687 
688 	setno = MD_MIN2SET(getminor(pb->b_edev));
689 
690 	/*
691 	 * When doing IO to a multi owner meta device, check if set is halted.
692 	 * We do this check without the needed lock held, for performance
693 	 * reasons.
694 	 * If an IO just slips through while the set is locked via an
695 	 * MD_MN_SUSPEND_SET, we don't care about it.
696 	 * Only check for a suspended set if we are a top-level i/o request
697 	 * (MD_STR_NOTTOP is cleared in 'flag').
698 	 */
699 	if ((md_set[setno].s_status & (MD_SET_HALTED | MD_SET_MNSET)) ==
700 	    (MD_SET_HALTED | MD_SET_MNSET)) {
701 		if ((flag & MD_STR_NOTTOP) == 0) {
702 			mutex_enter(&md_mx);
703 			/* Here we loop until the set is no longer halted */
704 			while (md_set[setno].s_status & MD_SET_HALTED) {
705 				cv_wait(&md_cv, &md_mx);
706 			}
707 			mutex_exit(&md_mx);
708 		}
709 	}
710 
711 	ui = MDI_UNIT(getminor(pb->b_edev));
712 
713 	md_kstat_waitq_enter(ui);
714 
715 	un = (ms_unit_t *)md_unit_readerlock(ui);
716 
717 	if ((flag & MD_NOBLOCK) == 0) {
718 		if (md_inc_iocount(setno) != 0) {
719 			pb->b_flags |= B_ERROR;
720 			pb->b_error = ENXIO;
721 			pb->b_resid = pb->b_bcount;
722 			md_unit_readerexit(ui);
723 			biodone(pb);
724 			return;
725 		}
726 	} else {
727 		md_inc_iocount_noblock(setno);
728 	}
729 
730 	if (!(flag & MD_STR_NOTTOP)) {
731 		if (md_checkbuf(ui, (md_unit_t *)un, pb) != 0) {
732 			md_kstat_waitq_exit(ui);
733 			return;
734 		}
735 	}
736 
737 	ps = kmem_cache_alloc(stripe_parent_cache, MD_ALLOCFLAGS);
738 	stripe_parent_init(ps);
739 
740 	/*
741 	 * Save essential information from the original buffhdr
742 	 * in the md_save structure.
743 	 */
744 	ps->ps_un = un;
745 	ps->ps_ui = ui;
746 	ps->ps_bp = pb;
747 	ps->ps_addr = pb->b_un.b_addr;
748 
749 	if ((pb->b_flags & B_READ) == 0)
750 		doing_writes = 1;
751 	else
752 		doing_writes = 0;
753 
754 
755 	current_count = pb->b_bcount;
756 	current_blkno = pb->b_lblkno;
757 	current_offset  = 0;
758 
759 	if (!(flag & MD_STR_NOTTOP) && panicstr)
760 		ps->ps_flags |= MD_SPS_DONTFREE;
761 
762 	md_kstat_waitq_to_runq(ui);
763 
764 	ps->ps_frags++;
765 	do {
766 		cs = kmem_cache_alloc(stripe_child_cache, MD_ALLOCFLAGS);
767 		stripe_child_init(cs);
768 		cb = &cs->cs_buf;
769 		cs->cs_ps = ps;
770 		more = md_mapbuf(un, current_blkno, current_count, cb,
771 			&cs->cs_comp);
772 
773 		cb = md_bioclone(pb, current_offset, cb->b_bcount, cb->b_edev,
774 				cb->b_lblkno, stripe_done, cb, KM_NOSLEEP);
775 		/*
776 		 * Do these calculations now,
777 		 *  so that we pickup a valid b_bcount from the chld_bp.
778 		 */
779 		current_offset += cb->b_bcount;
780 		current_count -=  cb->b_bcount;
781 		current_blkno +=  (diskaddr_t)(lbtodb(cb->b_bcount));
782 
783 		if (more) {
784 			mutex_enter(&ps->ps_mx);
785 			ps->ps_frags++;
786 			mutex_exit(&ps->ps_mx);
787 		}
788 
789 		if (doing_writes &&
790 		    cs->cs_comp->un_mirror.ms_flags & MDM_S_NOWRITE) {
791 			(void) stripe_done(cb);
792 			continue;
793 		}
794 		md_call_strategy(cb, flag, private);
795 	} while (more);
796 
797 	if (!(flag & MD_STR_NOTTOP) && panicstr) {
798 		while (!(ps->ps_flags & MD_SPS_DONE)) {
799 			md_daemon(1, &md_done_daemon);
800 			drv_usecwait(10);
801 		}
802 		kmem_cache_free(stripe_parent_cache, ps);
803 	}
804 }
805 
806 static int
807 stripe_snarf(md_snarfcmd_t cmd, set_t setno)
808 {
809 	ms_unit_t	*un;
810 	mddb_recid_t	recid;
811 	int		gotsomething;
812 	int		all_stripes_gotten;
813 	mddb_type_t	typ1;
814 	mddb_de_ic_t	*dep;
815 	mddb_rb32_t	*rbp;
816 	size_t		newreqsize;
817 	ms_unit_t	*big_un;
818 	ms_unit32_od_t	*small_un;
819 
820 
821 	if (cmd == MD_SNARF_CLEANUP)
822 		return (0);
823 
824 	all_stripes_gotten = 1;
825 	gotsomething = 0;
826 
827 	typ1 = (mddb_type_t)md_getshared_key(setno,
828 	    stripe_md_ops.md_driver.md_drivername);
829 	recid = mddb_makerecid(setno, 0);
830 
831 	while ((recid = mddb_getnextrec(recid, typ1, 0)) > 0) {
832 		if (mddb_getrecprivate(recid) & MD_PRV_GOTIT)
833 			continue;
834 
835 		dep = mddb_getrecdep(recid);
836 		dep->de_flags = MDDB_F_STRIPE;
837 		rbp = dep->de_rb;
838 
839 		if ((rbp->rb_revision == MDDB_REV_RB) &&
840 		    ((rbp->rb_private & MD_PRV_CONVD) == 0)) {
841 			/*
842 			 * This means, we have an old and small record
843 			 * and this record hasn't already been converted.
844 			 * Before we create an incore metadevice from this
845 			 * we have to convert it to a big record.
846 			 */
847 			small_un = (ms_unit32_od_t *)mddb_getrecaddr(recid);
848 			newreqsize = get_big_stripe_req_size(small_un,
849 					COMPLETE_STRUCTURE);
850 			big_un = (ms_unit_t *)kmem_zalloc(newreqsize, KM_SLEEP);
851 			stripe_convert((caddr_t)small_un, (caddr_t)big_un,
852 			    SMALL_2_BIG);
853 			kmem_free(small_un, dep->de_reqsize);
854 			dep->de_rb_userdata = big_un;
855 			dep->de_reqsize = newreqsize;
856 			un = big_un;
857 			rbp->rb_private |= MD_PRV_CONVD;
858 		} else {
859 			/* Big device */
860 			un = (ms_unit_t *)mddb_getrecaddr(recid);
861 		}
862 
863 		/* Set revision and flag accordingly */
864 		if (rbp->rb_revision == MDDB_REV_RB) {
865 			un->c.un_revision = MD_32BIT_META_DEV;
866 		} else {
867 			un->c.un_revision = MD_64BIT_META_DEV;
868 			un->c.un_flag |= MD_EFILABEL;
869 		}
870 
871 		/* Create minor node for snarfed unit. */
872 		(void) md_create_minor_node(MD_MIN2SET(MD_SID(un)), MD_SID(un));
873 
874 		if (MD_UNIT(MD_SID(un)) != NULL) {
875 			mddb_setrecprivate(recid, MD_PRV_PENDDEL);
876 			continue;
877 		}
878 		all_stripes_gotten = 0;
879 		if (stripe_build_incore((void *)un, 1) == 0) {
880 			mddb_setrecprivate(recid, MD_PRV_GOTIT);
881 			md_create_unit_incore(MD_SID(un), &stripe_md_ops, 0);
882 			gotsomething = 1;
883 		}
884 	}
885 
886 	if (!all_stripes_gotten)
887 		return (gotsomething);
888 
889 	recid = mddb_makerecid(setno, 0);
890 	while ((recid = mddb_getnextrec(recid, typ1, 0)) > 0)
891 		if (!(mddb_getrecprivate(recid) & MD_PRV_GOTIT))
892 			mddb_setrecprivate(recid, MD_PRV_PENDDEL);
893 
894 	return (0);
895 }
896 
897 static int
898 stripe_halt(md_haltcmd_t cmd, set_t setno)
899 {
900 	int		i;
901 	mdi_unit_t	*ui;
902 	minor_t		mnum;
903 
904 	if (cmd == MD_HALT_CLOSE)
905 		return (0);
906 
907 	if (cmd == MD_HALT_OPEN)
908 		return (0);
909 
910 	if (cmd == MD_HALT_UNLOAD)
911 		return (0);
912 
913 	if (cmd == MD_HALT_CHECK) {
914 		for (i = 0; i < md_nunits; i++) {
915 			mnum = MD_MKMIN(setno, i);
916 			if ((ui = MDI_UNIT(mnum)) == NULL)
917 				continue;
918 			if (ui->ui_opsindex != stripe_md_ops.md_selfindex)
919 				continue;
920 			if (md_unit_isopen(ui))
921 				return (1);
922 		}
923 		return (0);
924 	}
925 
926 	if (cmd != MD_HALT_DOIT)
927 		return (1);
928 
929 	for (i = 0; i < md_nunits; i++) {
930 		mnum = MD_MKMIN(setno, i);
931 		if ((ui = MDI_UNIT(mnum)) == NULL)
932 			continue;
933 		if (ui->ui_opsindex != stripe_md_ops.md_selfindex)
934 			continue;
935 		reset_stripe((ms_unit_t *)MD_UNIT(mnum), mnum, 0);
936 	}
937 
938 	return (0);
939 }
940 
941 /*ARGSUSED3*/
942 static int
943 stripe_open(dev_t *dev, int flag, int otyp, cred_t *cred_p, int md_oflags)
944 {
945 	minor_t		mnum = getminor(*dev);
946 	mdi_unit_t	*ui = MDI_UNIT(mnum);
947 	ms_unit_t	*un;
948 	int		err = 0;
949 	set_t		setno;
950 
951 	/*
952 	 * When doing an open of a multi owner metadevice, check to see if this
953 	 * node is a starting node and if a reconfig cycle is underway.
954 	 * If so, the system isn't sufficiently set up enough to handle the
955 	 * open (which involves I/O during sp_validate), so fail with ENXIO.
956 	 */
957 	setno = MD_MIN2SET(mnum);
958 	if ((md_set[setno].s_status & (MD_SET_MNSET | MD_SET_MN_START_RC)) ==
959 	    (MD_SET_MNSET | MD_SET_MN_START_RC)) {
960 			return (ENXIO);
961 	}
962 
963 	/* single thread */
964 	un = (ms_unit_t *)md_unit_openclose_enter(ui);
965 
966 	/* open devices, if necessary */
967 	if (! md_unit_isopen(ui) || (md_oflags & MD_OFLG_PROBEDEV)) {
968 		if ((err = stripe_open_all_devs(un, md_oflags)) != 0) {
969 			goto out;
970 		}
971 	}
972 
973 	/* count open */
974 	if ((err = md_unit_incopen(mnum, flag, otyp)) != 0)
975 		goto out;
976 
977 	/* unlock, return success */
978 out:
979 	md_unit_openclose_exit(ui);
980 	return (err);
981 }
982 
983 /*ARGSUSED1*/
984 static int
985 stripe_close(
986 	dev_t		dev,
987 	int		flag,
988 	int		otyp,
989 	cred_t		*cred_p,
990 	int		md_cflags
991 )
992 {
993 	minor_t		mnum = getminor(dev);
994 	mdi_unit_t	*ui = MDI_UNIT(mnum);
995 	ms_unit_t	*un;
996 	int		err = 0;
997 
998 	/* single thread */
999 	un = (ms_unit_t *)md_unit_openclose_enter(ui);
1000 
1001 	/* count closed */
1002 	if ((err = md_unit_decopen(mnum, otyp)) != 0)
1003 		goto out;
1004 
1005 	/* close devices, if necessary */
1006 	if (! md_unit_isopen(ui) || (md_cflags & MD_OFLG_PROBEDEV)) {
1007 		stripe_close_all_devs(un, md_cflags);
1008 	}
1009 
1010 	/* unlock, return success */
1011 out:
1012 	md_unit_openclose_exit(ui);
1013 	return (err);
1014 }
1015 
1016 
1017 static struct buf dumpbuf;
1018 
1019 /*
1020  * This routine dumps memory to the disk.  It assumes that the memory has
1021  * already been mapped into mainbus space.  It is called at disk interrupt
1022  * priority when the system is in trouble.
1023  *
1024  */
1025 static int
1026 stripe_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk)
1027 {
1028 	ms_unit_t	*un;
1029 	buf_t		*bp;
1030 	ms_comp_t	*mdc;
1031 	u_longlong_t	nb;
1032 	diskaddr_t	mapblk;
1033 	int		result;
1034 	int		more;
1035 	int		saveresult = 0;
1036 
1037 	/*
1038 	 * Don't need to grab the unit lock.
1039 	 * Cause nothing else is suppose to be happenning.
1040 	 * Also dump is not suppose to sleep.
1041 	 */
1042 	un = (ms_unit_t *)MD_UNIT(getminor(dev));
1043 
1044 	if ((diskaddr_t)blkno >= un->c.un_total_blocks)
1045 		return (EINVAL);
1046 
1047 	if ((diskaddr_t)blkno + nblk > un->c.un_total_blocks)
1048 		return (EINVAL);
1049 
1050 	bp = &dumpbuf;
1051 	nb = ldbtob(nblk);
1052 	do {
1053 		bzero((caddr_t)bp, sizeof (*bp));
1054 		more = md_mapbuf(un, (diskaddr_t)blkno, nb, bp, &mdc);
1055 		nblk = btodb(bp->b_bcount);
1056 		mapblk = bp->b_lblkno;
1057 		if (!(mdc->un_mirror.ms_flags & MDM_S_NOWRITE)) {
1058 			/*
1059 			 * bdev_dump() is currently only able to take
1060 			 * 32 bit wide blkno's.
1061 			 */
1062 			result = bdev_dump(bp->b_edev, addr, (daddr_t)mapblk,
1063 						nblk);
1064 			if (result)
1065 				saveresult = result;
1066 		}
1067 
1068 		nb -= bp->b_bcount;
1069 		addr += bp->b_bcount;
1070 		blkno += nblk;
1071 	} while (more);
1072 
1073 	return (saveresult);
1074 }
1075 
1076 /*ARGSUSED*/
1077 static intptr_t
1078 stripe_shared_by_blk(
1079 	md_dev64_t dev,
1080 	void *junk,
1081 	diskaddr_t blkno,
1082 	u_longlong_t *cnt)
1083 {
1084 	ms_unit_t	*un;
1085 	buf_t		bp;
1086 	ms_comp_t	*comp;
1087 
1088 	un = MD_UNIT(md_getminor(dev));
1089 	(void) md_mapbuf(un, blkno, ldbtob(*cnt), &bp, &comp);
1090 	*cnt = (u_longlong_t)lbtodb(bp.b_bcount);
1091 	return ((intptr_t)&comp->un_mirror);
1092 }
1093 
1094 /*
1095  * stripe_block_count_skip_size() returns the following values
1096  *	so that the logical to physical block mappings can
1097  *	be calculated without intimate knowledge of the underpinnings.
1098  *
1099  *	block - first logical block number of the device.
1100  *		block = [ # of blocks before THE row ] +
1101  *			[ # of blocks in THE row before the component ]
1102  *	count - # of segments (interlaced size).
1103  *	skip  - # of logical blocks between segments, or delta to
1104  *		  get to next segment
1105  *	size  - interlace size used for the block, count, skip.
1106  */
1107 /*ARGSUSED*/
1108 static intptr_t
1109 stripe_block_count_skip_size(
1110 	md_dev64_t	 dev,
1111 	void		*junk,
1112 	int		ci,
1113 	diskaddr_t	*block,
1114 	size_t		*count,
1115 	u_longlong_t	*skip,
1116 	u_longlong_t	*size)
1117 {
1118 	ms_unit_t	*un;
1119 	int		row;
1120 	struct ms_row	*mdr;
1121 	int		cmpcount = 0;
1122 
1123 	un = MD_UNIT(md_getminor(dev));
1124 
1125 	for (row = 0; row < un->un_nrows; row++) {
1126 		mdr = &un->un_row[row];
1127 		if ((mdr->un_ncomp + cmpcount) > ci)
1128 			break;
1129 		cmpcount += mdr->un_ncomp;
1130 	}
1131 	ASSERT(row != un->un_nrows);
1132 
1133 	/*
1134 	 * Concatenations are always contiguous blocks,
1135 	 * you cannot depend on the interlace being a usable
1136 	 * value (except for stripes).
1137 	 */
1138 	if (mdr->un_ncomp == 1) {	/* Concats */
1139 		*block = mdr->un_cum_blocks - mdr->un_blocks;
1140 		*count = 1;
1141 		*skip = 0;
1142 		*size = mdr->un_blocks;
1143 	} else {			/* Stripes */
1144 		*block = (mdr->un_cum_blocks - mdr->un_blocks) +
1145 		    ((ci - cmpcount) * mdr->un_interlace);
1146 		*count	= (size_t)(mdr->un_blocks / (mdr->un_interlace
1147 			* mdr->un_ncomp));
1148 		*skip = (mdr->un_interlace * mdr->un_ncomp) - mdr->un_interlace;
1149 		*size = mdr->un_interlace;
1150 	}
1151 
1152 	return (0);
1153 }
1154 
1155 /*ARGSUSED*/
1156 static intptr_t
1157 stripe_shared_by_indx(md_dev64_t dev, void *junk, int indx)
1158 {
1159 	ms_unit_t	*un;
1160 	ms_comp_t	*comp;
1161 
1162 	un = MD_UNIT(md_getminor(dev));
1163 	comp = (struct ms_comp *)((void *)&((char *)un)[un->un_ocomp]);
1164 	comp += indx;
1165 	return ((intptr_t)&comp->un_mirror);
1166 }
1167 
1168 /*ARGSUSED*/
1169 intptr_t
1170 stripe_component_count(md_dev64_t dev, void *junk)
1171 {
1172 	/*
1173 	 * See comments for stripe_get_dev
1174 	 */
1175 
1176 	ms_unit_t	*un;
1177 	int		count = 0;
1178 	int		row;
1179 
1180 	un = MD_UNIT(md_getminor(dev));
1181 	for (row = 0; row < un->un_nrows; row++)
1182 		count += un->un_row[row].un_ncomp;
1183 	return (count);
1184 }
1185 
1186 /*ARGSUSED*/
1187 intptr_t
1188 stripe_get_dev(md_dev64_t dev, void *junk, int indx, ms_cd_info_t *cd)
1189 {
1190 	/*
1191 	 * It should be noted that stripe_replace in stripe_ioctl.c calls this
1192 	 * routine using makedevice(0, minor) for the first argument.
1193 	 *
1194 	 * If this routine at some point in the future needs to use the major
1195 	 * number stripe_replace must be changed.
1196 	 */
1197 
1198 	ms_unit_t	*un;
1199 	ms_comp_t	*comp;
1200 	md_dev64_t	tmpdev;
1201 
1202 	un = MD_UNIT(md_getminor(dev));
1203 	comp = (struct ms_comp *)((void *)&((char *)un)[un->un_ocomp]);
1204 	comp += indx;
1205 	tmpdev = comp->un_dev;
1206 	/*
1207 	 * Try to resolve devt again if NODEV64
1208 	 * Check if this comp is hotspared and if it is
1209 	 * then use key for hotspare
1210 	 */
1211 	if (tmpdev == NODEV64) {
1212 		tmpdev = md_resolve_bydevid(md_getminor(dev), tmpdev,
1213 			comp->un_mirror.ms_hs_id ?
1214 			comp->un_mirror.ms_hs_key :
1215 			comp->un_key);
1216 		comp->un_dev = tmpdev;
1217 	}
1218 
1219 	cd->cd_dev = comp->un_dev;
1220 	cd->cd_orig_dev = comp->un_mirror.ms_orig_dev;
1221 	return (0);
1222 }
1223 
1224 /*ARGSUSED*/
1225 void
1226 stripe_replace_done(md_dev64_t dev, sv_dev_t *sv)
1227 {
1228 	/*
1229 	 * See comments for stripe_get_dev
1230 	 */
1231 
1232 	minor_t		mnum = md_getminor(dev);
1233 
1234 	if (sv != NULL) {
1235 		md_rem_names(sv, 1);
1236 		kmem_free(sv, sizeof (sv_dev_t));
1237 	}
1238 
1239 	md_unit_writerexit(MDI_UNIT(mnum));
1240 }
1241 
1242 /*ARGSUSED*/
1243 intptr_t
1244 stripe_replace_dev(md_dev64_t dev, void *junk, int ci, ms_new_dev_t *nd,
1245     mddb_recid_t *recids, int nrecids, void (**replace_done)(),
1246     void **replace_data)
1247 {
1248 	minor_t		mnum;
1249 	ms_unit_t	*un;
1250 	mdi_unit_t	*ui;
1251 	ms_comp_t	*comp;
1252 	diskaddr_t	dev_size;
1253 	int		row;
1254 	int		ncomps = 0;
1255 	int		cmpcount = 0;
1256 	int		rid = 0;
1257 	struct ms_row	*mdr;
1258 	sv_dev_t	*sv = NULL;
1259 	mddb_recid_t	hs_id = 0;
1260 	set_t		setno;
1261 	side_t		side;
1262 	md_dev64_t	this_dev;
1263 
1264 	mnum = md_getminor(dev);
1265 	ui = MDI_UNIT(mnum);
1266 	setno = MD_MIN2SET(mnum);
1267 	side = mddb_getsidenum(setno);
1268 
1269 	un = md_unit_writerlock(ui);
1270 
1271 	*replace_data = NULL;
1272 	comp = (struct ms_comp *)((void *)&((char *)un)[un->un_ocomp]);
1273 
1274 	comp += ci;
1275 
1276 	/*
1277 	 * Count the number of components
1278 	 */
1279 	for (row = 0; row < un->un_nrows; row++) {
1280 		struct ms_row *mdr = &un->un_row[row];
1281 		ncomps += mdr->un_ncomp;
1282 	}
1283 
1284 	recids[0] = 0;
1285 	/*
1286 	 * No need of checking size of new device,
1287 	 * when hotsparing (it has already been done), or
1288 	 * when enabling the device.
1289 	 */
1290 	if ((nd != NULL) && (nd->nd_hs_id == 0)) {
1291 		for (row = 0; row < un->un_nrows; row++) {
1292 			mdr = &un->un_row[row];
1293 			if ((mdr->un_ncomp + cmpcount) > ci)
1294 				break;
1295 			cmpcount += mdr->un_ncomp;
1296 		}
1297 		ASSERT(row != un->un_nrows);
1298 
1299 		/* Concatenations have a ncomp = 1 */
1300 		dev_size = mdr->un_blocks / mdr->un_ncomp;
1301 
1302 		/*
1303 		 * now check to see if new comp can be used in
1304 		 * place of old comp
1305 		 */
1306 		if ((un->c.un_flag & MD_LABELED) && (ci == 0) &&
1307 		    nd->nd_labeled)
1308 			nd->nd_start_blk = 0;
1309 		else
1310 			nd->nd_nblks -= nd->nd_start_blk;
1311 
1312 		if (dev_size > nd->nd_nblks) {
1313 			md_unit_writerexit(ui);
1314 			return (MDE_COMP_TOO_SMALL);
1315 		}
1316 
1317 		sv = (sv_dev_t *)kmem_alloc(sizeof (sv_dev_t), KM_SLEEP);
1318 		sv->setno = MD_MIN2SET(mnum);
1319 		sv->key = comp->un_key;
1320 	}
1321 
1322 	/*
1323 	 * Close this component.
1324 	 */
1325 	if (comp->un_mirror.ms_flags & MDM_S_ISOPEN) {
1326 		md_layered_close(comp->un_dev, MD_OFLG_NULL);
1327 		comp->un_mirror.ms_flags &= ~MDM_S_ISOPEN;
1328 	}
1329 
1330 	/*
1331 	 * If the component is hotspared, return to the pool.
1332 	 */
1333 	if (comp->un_mirror.ms_hs_id != 0) {
1334 		hs_cmds_t	cmd;
1335 		mdkey_t		hs_key;
1336 
1337 		hs_key = comp->un_mirror.ms_hs_key;
1338 		comp->un_dev = comp->un_mirror.ms_orig_dev;
1339 		comp->un_start_block = comp->un_mirror.ms_orig_blk;
1340 		comp->un_mirror.ms_hs_key = 0;
1341 		comp->un_mirror.ms_hs_id = 0;
1342 		comp->un_mirror.ms_orig_dev = 0;
1343 
1344 		cmd = HS_FREE;
1345 		if ((comp->un_mirror.ms_state != CS_OKAY) &&
1346 		    (comp->un_mirror.ms_state != CS_RESYNC))
1347 			cmd = HS_BAD;
1348 		(void) md_hot_spare_ifc(cmd, un->un_hsp_id, 0, 0, &hs_id,
1349 		    &hs_key, NULL, NULL);
1350 	}
1351 
1352 	/*
1353 	 * Open by device id; for enable (indicated by a NULL
1354 	 * nd pointer), use the existing component info.  For
1355 	 * replace, use the new device.
1356 	 */
1357 	if (nd == NULL) {
1358 		this_dev = md_resolve_bydevid(mnum, comp->un_dev, comp->un_key);
1359 		/*
1360 		 * If someone replaced a new disk in the same slot
1361 		 * we get NODEV64 since old device id cannot be
1362 		 * resolved. The new devt is obtained from the
1363 		 * mddb since devt is going to be unchanged for the
1364 		 * enable case. No need to check for multiple
1365 		 * keys here because the caller (comp_replace)
1366 		 * has already sanity checked it for us.
1367 		 */
1368 		if (this_dev == NODEV64) {
1369 			this_dev = md_getdevnum(setno, side, comp->un_key,
1370 			    MD_TRUST_DEVT);
1371 		}
1372 	} else {
1373 		/*
1374 		 * If this is a hotspare, save the original dev_t for later
1375 		 * use. If this has occured during boot then the value of
1376 		 * comp->un_dev will be NODEV64 because of the failure to look
1377 		 * up the devid of the device.
1378 		 */
1379 		if (nd->nd_hs_id != 0)
1380 			comp->un_mirror.ms_orig_dev = comp->un_dev;
1381 		this_dev = md_resolve_bydevid(mnum, nd->nd_dev, nd->nd_key);
1382 	}
1383 
1384 	comp->un_dev = this_dev;
1385 
1386 	/*
1387 	 * Now open the new device if required. Note for a single component
1388 	 * stripe it will not be open - leave this for the mirror driver to
1389 	 * deal with.
1390 	 */
1391 	if (md_unit_isopen(ui)) {
1392 		if (md_layered_open(mnum, &this_dev, MD_OFLG_NULL)) {
1393 			mddb_recid_t	ids[3];
1394 
1395 			ids[0] = un->c.un_record_id;
1396 			ids[1] = hs_id;
1397 			ids[2] = 0;
1398 			mddb_commitrecs_wrapper(ids);
1399 			if ((nd != NULL) && (nd->nd_hs_id != 0)) {
1400 				/*
1401 				 * Revert back to the original device.
1402 				 */
1403 				comp->un_dev = comp->un_mirror.ms_orig_dev;
1404 
1405 				cmn_err(CE_WARN,
1406 				    "md: %s: open error of hotspare %s",
1407 				    md_shortname(mnum),
1408 				    md_devname(MD_MIN2SET(mnum), nd->nd_dev,
1409 				    NULL, 0));
1410 				SE_NOTIFY(EC_SVM_STATE, ESC_SVM_OPEN_FAIL,
1411 				    SVM_TAG_HS, MD_MIN2SET(mnum), nd->nd_dev);
1412 			}
1413 			md_unit_writerexit(ui);
1414 			return (MDE_COMP_OPEN_ERR);
1415 		}
1416 		if (nd != NULL)
1417 			nd->nd_dev = this_dev;
1418 
1419 		comp->un_mirror.ms_flags |= MDM_S_ISOPEN;
1420 	}
1421 
1422 	if (nd == NULL) {
1423 		recids[0] = un->c.un_record_id;
1424 		recids[1] = hs_id;
1425 		recids[2] = 0;
1426 		*replace_done = stripe_replace_done;
1427 		return (0);
1428 	}
1429 
1430 	/* if hot sparing this device */
1431 	if (nd->nd_hs_id != 0) {
1432 		char	devname[MD_MAX_CTDLEN];
1433 		char	hs_devname[MD_MAX_CTDLEN];
1434 		set_t	setno;
1435 
1436 		comp->un_mirror.ms_hs_id = nd->nd_hs_id;
1437 		comp->un_mirror.ms_hs_key = nd->nd_key;
1438 
1439 		comp->un_mirror.ms_orig_blk = comp->un_start_block;
1440 
1441 		setno = MD_MIN2SET(mnum);
1442 
1443 		(void) md_devname(setno, comp->un_mirror.ms_orig_dev, devname,
1444 					sizeof (devname));
1445 		(void) md_devname(setno, nd->nd_dev, hs_devname,
1446 		    sizeof (hs_devname));
1447 
1448 		cmn_err(CE_NOTE, "md: %s: hotspared device %s with %s",
1449 		    md_shortname(mnum), devname, hs_devname);
1450 
1451 	} else {	/* replacing the device */
1452 		comp->un_key = nd->nd_key;
1453 		*replace_data = (void *)sv;
1454 
1455 		/*
1456 		 * For the old device, make sure to reset the parent
1457 		 * if it's a  metadevice.
1458 		 */
1459 		if (md_getmajor(comp->un_dev) == md_major) {
1460 			minor_t	  comp_mnum = md_getminor(comp->un_dev);
1461 			md_unit_t *comp_un = MD_UNIT(comp_mnum);
1462 
1463 			md_reset_parent(comp->un_dev);
1464 			recids[rid++] = MD_RECID(comp_un);
1465 		}
1466 	}
1467 
1468 	comp->un_dev = nd->nd_dev;
1469 	comp->un_start_block = nd->nd_start_blk;
1470 
1471 	/*
1472 	 * For the new device, make sure to set the parent if it's a
1473 	 * metadevice.
1474 	 *
1475 	 * If we ever support using metadevices as hot spares, this
1476 	 * will need to be tested, and possibly moved into the
1477 	 * preceding "else" clause, immediately following the parent
1478 	 * reset block.  For now, it's convenient to leave it here and
1479 	 * only compress nd->nd_dev once.
1480 	 */
1481 	if (md_getmajor(comp->un_dev) == md_major) {
1482 		minor_t		comp_mnum = md_getminor(comp->un_dev);
1483 		md_unit_t	*comp_un = MD_UNIT(comp_mnum);
1484 
1485 		md_set_parent(comp->un_dev, MD_SID(un));
1486 		recids[rid++] = MD_RECID(comp_un);
1487 	}
1488 
1489 	recids[rid++] = un->c.un_record_id;
1490 	recids[rid++] = hs_id;
1491 	recids[rid] = 0;
1492 	*replace_done = stripe_replace_done;
1493 	return (0);
1494 }
1495 
1496 /*ARGSUSED*/
1497 static intptr_t
1498 stripe_hotspare_dev(
1499 	md_dev64_t	dev,
1500 	void		*junk,
1501 	int		ci,
1502 	mddb_recid_t	*recids,
1503 	int		nrecids,
1504 	void		(**replace_done)(),
1505 	void		**replace_data)
1506 {
1507 	ms_unit_t	*un;
1508 	mdi_unit_t	*ui;
1509 	ms_comp_t	*comp;
1510 	int		row;
1511 	struct ms_row	*mdr;
1512 	ms_new_dev_t	nd;
1513 	int		err;
1514 	int		i;
1515 	minor_t		mnum;
1516 	set_t		setno;
1517 	int		cmpcount = 0;
1518 
1519 	mnum = md_getminor(dev);
1520 	ui = MDI_UNIT(mnum);
1521 	un = MD_UNIT(mnum);
1522 	setno = MD_MIN2SET(mnum);
1523 
1524 	if (md_get_setstatus(setno) & MD_SET_STALE)
1525 		return (1);
1526 
1527 	if (un->un_hsp_id == -1)
1528 		return (1);
1529 
1530 	for (row = 0; row < un->un_nrows; row++) {
1531 		mdr = &un->un_row[row];
1532 		if ((mdr->un_ncomp + cmpcount) > ci)
1533 			break;
1534 		cmpcount += mdr->un_ncomp;
1535 	}
1536 	ASSERT(row != un->un_nrows);
1537 
1538 	comp = (struct ms_comp *)((void *)&((char *)un)[un->un_ocomp]);
1539 	comp += ci;
1540 	/* Concatenations have a ncomp = 1 */
1541 	nd.nd_nblks = mdr->un_blocks / mdr->un_ncomp;
1542 
1543 	if ((un->c.un_flag & MD_LABELED) && (ci == 0))
1544 		nd.nd_labeled = 1;
1545 	else
1546 		nd.nd_labeled = 0;
1547 
1548 again:
1549 	err = md_hot_spare_ifc(HS_GET, un->un_hsp_id, nd.nd_nblks,
1550 	    nd.nd_labeled, &nd.nd_hs_id, &nd.nd_key, &nd.nd_dev,
1551 	    &nd.nd_start_blk);
1552 
1553 	if (err) {
1554 		if (!stripe_replace_dev(dev, junk, ci, NULL, recids, nrecids,
1555 		    replace_done, replace_data)) {
1556 			mddb_commitrecs_wrapper(recids);
1557 			md_unit_writerexit(ui);
1558 		}
1559 		recids[0] = 0;
1560 		return (1);
1561 	}
1562 
1563 	if (stripe_replace_dev(dev, junk, ci, &nd, recids, nrecids,
1564 		replace_done, replace_data)) {
1565 
1566 		(void) md_hot_spare_ifc(HS_BAD, un->un_hsp_id, 0, 0,
1567 		    &nd.nd_hs_id, &nd.nd_key, NULL, NULL);
1568 		mddb_commitrec_wrapper(nd.nd_hs_id);
1569 		goto again;
1570 	}
1571 
1572 	/* Leave a slot for the null recid */
1573 	for (i = 0; i < (nrecids - 1); i++) {
1574 		if (recids[i] == 0) {
1575 			recids[i++] = nd.nd_hs_id;
1576 			recids[i] = 0;
1577 		}
1578 	}
1579 	return (0);
1580 }
1581 
1582 static int
1583 stripe_imp_set(
1584 	set_t	setno
1585 )
1586 {
1587 
1588 	mddb_recid_t	recid;
1589 	int		i, row, c, gotsomething;
1590 	mddb_type_t	typ1;
1591 	mddb_de_ic_t	*dep;
1592 	mddb_rb32_t	*rbp;
1593 	ms_unit32_od_t	*un32;
1594 	ms_unit_t	*un64;
1595 	minor_t		*self_id;	/* minor needs to be updated */
1596 	md_parent_t	*parent_id;	/* parent needs to be updated */
1597 	mddb_recid_t	*record_id;	/* record id needs to be updated */
1598 	mddb_recid_t	*hsp_id;
1599 	ms_comp32_od_t	*comp32;
1600 	ms_comp_t	*comp64;
1601 
1602 
1603 	gotsomething = 0;
1604 
1605 	typ1 = (mddb_type_t)md_getshared_key(setno,
1606 	    stripe_md_ops.md_driver.md_drivername);
1607 	recid = mddb_makerecid(setno, 0);
1608 
1609 	while ((recid = mddb_getnextrec(recid, typ1, 0)) > 0) {
1610 		if (mddb_getrecprivate(recid) & MD_PRV_GOTIT)
1611 			continue;
1612 
1613 		dep = mddb_getrecdep(recid);
1614 		rbp = dep->de_rb;
1615 
1616 		if (rbp->rb_revision == MDDB_REV_RB) {
1617 			/*
1618 			 * Small device
1619 			 */
1620 			un32 = (ms_unit32_od_t *)mddb_getrecaddr(recid);
1621 			self_id = &(un32->c.un_self_id);
1622 			parent_id = &(un32->c.un_parent);
1623 			record_id = &(un32->c.un_record_id);
1624 			hsp_id = &(un32->un_hsp_id);
1625 
1626 			comp32 = (ms_comp32_od_t *)((void *)&((char *)un32)
1627 				[un32->un_ocomp]);
1628 			for (row = 0; row < un32->un_nrows; row++) {
1629 			    struct ms_row32_od *mdr = &un32->un_row[row];
1630 			    for (i = 0, c = mdr->un_icomp;
1631 				i < mdr->un_ncomp; i++) {
1632 				ms_comp32_od_t *mdc;
1633 				mdc = &comp32[c++];
1634 
1635 				if (!md_update_minor(setno, mddb_getsidenum
1636 				    (setno), mdc->un_key))
1637 					goto out;
1638 
1639 				if (mdc->un_mirror.ms_hs_id != 0)
1640 				    mdc->un_mirror.ms_hs_id = MAKERECID(
1641 				    setno, mdc->un_mirror.ms_hs_id);
1642 			    }
1643 			}
1644 		} else {
1645 			un64 = (ms_unit_t *)mddb_getrecaddr(recid);
1646 			self_id = &(un64->c.un_self_id);
1647 			parent_id = &(un64->c.un_parent);
1648 			record_id = &(un64->c.un_record_id);
1649 			hsp_id = &(un64->un_hsp_id);
1650 
1651 			comp64 = (ms_comp_t *)((void *)&((char *)un64)
1652 				[un64->un_ocomp]);
1653 			for (row = 0; row < un64->un_nrows; row++) {
1654 			    struct ms_row *mdr = &un64->un_row[row];
1655 			    for (i = 0, c = mdr->un_icomp;
1656 				i < mdr->un_ncomp; i++) {
1657 				ms_comp_t *mdc;
1658 				mdc = &comp64[c++];
1659 
1660 				if (!md_update_minor(setno, mddb_getsidenum
1661 				    (setno), mdc->un_key))
1662 					goto out;
1663 
1664 				if (mdc->un_mirror.ms_hs_id != 0)
1665 				    mdc->un_mirror.ms_hs_id = MAKERECID(
1666 				    setno, mdc->un_mirror.ms_hs_id);
1667 			    }
1668 			}
1669 		}
1670 
1671 		/*
1672 		 * Update unit with the imported setno
1673 		 *
1674 		 */
1675 		mddb_setrecprivate(recid, MD_PRV_GOTIT);
1676 
1677 		*self_id = MD_MKMIN(setno, MD_MIN2UNIT(*self_id));
1678 
1679 		if (*hsp_id != -1)
1680 			*hsp_id = MAKERECID(setno, DBID(*hsp_id));
1681 
1682 		if (*parent_id != MD_NO_PARENT)
1683 			*parent_id = MD_MKMIN(setno, MD_MIN2UNIT(*parent_id));
1684 		*record_id = MAKERECID(setno, DBID(*record_id));
1685 
1686 		gotsomething = 1;
1687 	}
1688 
1689 out:
1690 	return (gotsomething);
1691 }
1692 
1693 static md_named_services_t stripe_named_services[] = {
1694 	{stripe_shared_by_blk,			"shared by blk"		    },
1695 	{stripe_shared_by_indx,			"shared by indx"	    },
1696 	{stripe_component_count,		"get component count"	    },
1697 	{stripe_block_count_skip_size,		"get block count skip size" },
1698 	{stripe_get_dev,			"get device"		    },
1699 	{stripe_replace_dev,			"replace device"	    },
1700 	{stripe_hotspare_dev,			"hotspare device"	    },
1701 	{stripe_rename_check,			MDRNM_CHECK		    },
1702 	{NULL,					0}
1703 };
1704 
1705 md_ops_t stripe_md_ops = {
1706 	stripe_open,		/* open */
1707 	stripe_close,		/* close */
1708 	md_stripe_strategy,	/* strategy */
1709 	NULL,			/* print */
1710 	stripe_dump,		/* dump */
1711 	NULL,			/* read */
1712 	NULL,			/* write */
1713 	md_stripe_ioctl,	/* stripe_ioctl, */
1714 	stripe_snarf,		/* stripe_snarf */
1715 	stripe_halt,		/* stripe_halt */
1716 	NULL,			/* aread */
1717 	NULL,			/* awrite */
1718 	stripe_imp_set,		/* import set */
1719 	stripe_named_services
1720 };
1721 
1722 static void
1723 init_init()
1724 {
1725 	md_stripe_mcs_buf_off = sizeof (md_scs_t) - sizeof (buf_t);
1726 
1727 	stripe_parent_cache = kmem_cache_create("md_stripe_parent",
1728 	    sizeof (md_sps_t), 0, stripe_parent_constructor,
1729 	    stripe_parent_destructor, stripe_run_queue, NULL, NULL,
1730 	    0);
1731 	stripe_child_cache = kmem_cache_create("md_stripe_child",
1732 	    sizeof (md_scs_t) - sizeof (buf_t) + biosize(), 0,
1733 	    stripe_child_constructor, stripe_child_destructor,
1734 	    stripe_run_queue, NULL, NULL, 0);
1735 }
1736 
1737 static void
1738 fini_uninit()
1739 {
1740 	kmem_cache_destroy(stripe_parent_cache);
1741 	kmem_cache_destroy(stripe_child_cache);
1742 	stripe_parent_cache = stripe_child_cache = NULL;
1743 }
1744 
1745 /* define the module linkage */
1746 MD_PLUGIN_MISC_MODULE("stripes module %I%", init_init(), fini_uninit())
1747