xref: /titanic_41/usr/src/uts/common/io/lvm/stripe/stripe.c (revision 31a2903539e29171f5c5da80e5c9616c70108116)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
24  */
25 
26 #include <sys/param.h>
27 #include <sys/systm.h>
28 #include <sys/conf.h>
29 #include <sys/file.h>
30 #include <sys/user.h>
31 #include <sys/uio.h>
32 #include <sys/t_lock.h>
33 #include <sys/buf.h>
34 #include <sys/dkio.h>
35 #include <sys/vtoc.h>
36 #include <sys/kmem.h>
37 #include <vm/page.h>
38 #include <sys/cmn_err.h>
39 #include <sys/sysmacros.h>
40 #include <sys/types.h>
41 #include <sys/mkdev.h>
42 #include <sys/stat.h>
43 #include <sys/open.h>
44 #include <sys/lvm/mdio.h>
45 #include <sys/lvm/mdvar.h>
46 #include <sys/lvm/md_stripe.h>
47 #include <sys/lvm/md_convert.h>
48 #include <sys/lvm/md_notify.h>
49 #include <sys/modctl.h>
50 #include <sys/ddi.h>
51 #include <sys/sunddi.h>
52 #include <sys/debug.h>
53 #include <sys/sysevent/eventdefs.h>
54 #include <sys/sysevent/svm.h>
55 
56 md_ops_t		stripe_md_ops;
57 #ifndef	lint
58 char			_depends_on[] = "drv/md";
59 md_ops_t		*md_interface_ops = &stripe_md_ops;
60 #endif
61 
62 extern unit_t		md_nunits;
63 extern set_t		md_nsets;
64 extern md_set_t		md_set[];
65 
66 extern kmutex_t		md_mx;
67 extern kcondvar_t	md_cv;
68 
69 extern int		md_status;
70 extern major_t		md_major;
71 extern mdq_anchor_t	md_done_daemon;
72 
73 static int		md_stripe_mcs_buf_off;
74 static kmem_cache_t	*stripe_parent_cache = NULL;
75 static kmem_cache_t	*stripe_child_cache = NULL;
76 
77 /*ARGSUSED1*/
78 static int
79 stripe_parent_constructor(void *p, void *d1, int d2)
80 {
81 	mutex_init(&((md_sps_t *)p)->ps_mx,
82 	    NULL, MUTEX_DEFAULT, NULL);
83 	return (0);
84 }
85 
86 static void
87 stripe_parent_init(void *ps)
88 {
89 	bzero(ps, offsetof(md_sps_t, ps_mx));
90 }
91 
92 /*ARGSUSED1*/
93 static void
94 stripe_parent_destructor(void *p, void *d)
95 {
96 	mutex_destroy(&((md_sps_t *)p)->ps_mx);
97 }
98 
99 /*ARGSUSED1*/
100 static int
101 stripe_child_constructor(void *p, void *d1, int d2)
102 {
103 	bioinit(&((md_scs_t *)p)->cs_buf);
104 	return (0);
105 }
106 
107 static void
108 stripe_child_init(md_scs_t *cs)
109 {
110 	cs->cs_mdunit = 0;
111 	cs->cs_ps = NULL;
112 	cs->cs_comp = NULL;
113 	md_bioreset(&cs->cs_buf);
114 }
115 
116 /*ARGSUSED1*/
117 static void
118 stripe_child_destructor(void *p, void *d)
119 {
120 	biofini(&((md_scs_t *)p)->cs_buf);
121 }
122 
123 /*ARGSUSED*/
124 static void
125 stripe_run_queue(void *d)
126 {
127 	if (!(md_status & MD_GBL_DAEMONS_LIVE))
128 		md_daemon(1, &md_done_daemon);
129 }
130 
131 static void
132 stripe_close_all_devs(ms_unit_t *un, int md_cflags)
133 {
134 	int		row;
135 	int		i;
136 	int		c;
137 	struct ms_comp	*mdcomp;
138 
139 	mdcomp = (struct ms_comp *)((void *)&((char *)un)[un->un_ocomp]);
140 	for (row = 0; row < un->un_nrows; row++) {
141 		struct ms_row *mdr = &un->un_row[row];
142 		for (i = 0, c = mdr->un_icomp; i < mdr->un_ncomp; i++) {
143 			struct ms_comp	*mdc;
144 			mdc = &mdcomp[c++];
145 			if (md_cflags & MD_OFLG_PROBEDEV) {
146 
147 			/*
148 			 * It is possible that the md_layered_open
149 			 * failed because the stripe unit structure
150 			 * contained a NODEV.  In such a case since
151 			 * there is nothing to open, there is nothing
152 			 * to close.
153 			 */
154 				if (mdc->un_dev == NODEV64)
155 					continue;
156 			}
157 			if ((md_cflags & MD_OFLG_PROBEDEV) &&
158 			    (mdc->un_mirror.ms_flags & MDM_S_PROBEOPEN)) {
159 				md_layered_close(mdc->un_dev,
160 				    md_cflags);
161 				mdc->un_mirror.ms_flags &= ~MDM_S_PROBEOPEN;
162 			} else if (mdc->un_mirror.ms_flags & MDM_S_ISOPEN) {
163 				md_layered_close(mdc->un_dev, md_cflags);
164 				mdc->un_mirror.ms_flags &= ~MDM_S_ISOPEN;
165 			}
166 		}
167 	}
168 }
169 
170 static int
171 stripe_open_all_devs(ms_unit_t *un, int md_oflags)
172 {
173 	minor_t		mnum = MD_SID(un);
174 	int		row;
175 	int		i;
176 	int		c;
177 	struct ms_comp	*mdcomp;
178 	int		err;
179 	int		cont_on_errors = (md_oflags & MD_OFLG_CONT_ERRS);
180 	int		probe_err_cnt = 0;
181 	int		total_comp_cnt = 0;
182 	set_t		setno = MD_MIN2SET(MD_SID(un));
183 	side_t		side = mddb_getsidenum(setno);
184 	mdkey_t		key;
185 
186 	mdcomp = (struct ms_comp *)((void *)&((char *)un)[un->un_ocomp]);
187 
188 	/*
189 	 * For a probe call, if any component of a stripe or a concat
190 	 * can be opened, it is considered to be a success. The total number
191 	 * of components in a stripe are computed prior to starting a probe.
192 	 * This number is then compared against the number of components
193 	 * that could be be successfully opened. If none of the components
194 	 * in a stripe can be opened, only then an ENXIO is returned for a
195 	 * probe type open.
196 	 */
197 
198 	for (row = 0; row < un->un_nrows; row++) {
199 		struct ms_row *mdr = &un->un_row[row];
200 
201 		if (md_oflags & MD_OFLG_PROBEDEV)
202 			total_comp_cnt += mdr->un_ncomp;
203 
204 		for (i = 0, c = mdr->un_icomp; i < mdr->un_ncomp; i++) {
205 			struct ms_comp	*mdc;
206 			md_dev64_t tmpdev;
207 
208 			mdc = &mdcomp[c++];
209 			tmpdev = mdc->un_dev;
210 			/*
211 			 * Do the open by device id
212 			 * Check if this comp is hotspared and
213 			 * if it is then use the key for hotspare.
214 			 * MN disksets don't use devids, so we better don't use
215 			 * md_devid_found/md_resolve_bydevid there. Rather do,
216 			 * what's done in stripe_build_incore()
217 			 */
218 			if (MD_MNSET_SETNO(setno)) {
219 				if (mdc->un_mirror.ms_hs_id != 0) {
220 					(void) md_hot_spare_ifc(HS_MKDEV, 0, 0,
221 					    0, &mdc->un_mirror.ms_hs_id, NULL,
222 					    &tmpdev, NULL);
223 				}
224 			} else {
225 				key = mdc->un_mirror.ms_hs_id ?
226 				    mdc->un_mirror.ms_hs_key : mdc->un_key;
227 				if ((md_getmajor(tmpdev) != md_major) &&
228 				    md_devid_found(setno, side, key) == 1) {
229 					tmpdev = md_resolve_bydevid(mnum,
230 					    tmpdev, key);
231 				}
232 			}
233 
234 			/*
235 			 * For a submirror, we only want to open those devices
236 			 * that are not errored. If the device is errored then
237 			 * then there is no reason to open it and leaving it
238 			 * closed allows the RCM/DR code to work so that the
239 			 * errored device can be replaced.
240 			 */
241 			if ((md_oflags & MD_OFLG_PROBEDEV) ||
242 			    ! (mdc->un_mirror.ms_state & CS_ERRED)) {
243 
244 				err = md_layered_open(mnum, &tmpdev, md_oflags);
245 			} else {
246 				err = ENXIO;
247 			}
248 
249 			/*
250 			 * Only set the un_dev if the tmpdev != NODEV64. If
251 			 * it is NODEV64 then the md_layered_open() will have
252 			 * failed in some manner.
253 			 */
254 			if (tmpdev != NODEV64)
255 				mdc->un_dev = tmpdev;
256 
257 			if (err) {
258 				if (!cont_on_errors) {
259 					stripe_close_all_devs(un, md_oflags);
260 					return (ENXIO);
261 				}
262 
263 				if (md_oflags & MD_OFLG_PROBEDEV)
264 					probe_err_cnt++;
265 			} else {
266 				if (md_oflags & MD_OFLG_PROBEDEV) {
267 					mdc->un_mirror.ms_flags |=
268 					    MDM_S_PROBEOPEN;
269 				} else
270 					mdc->un_mirror.ms_flags |= MDM_S_ISOPEN;
271 			}
272 		}
273 	}
274 
275 	/* If every component in a stripe could not be opened fail */
276 	if ((md_oflags & MD_OFLG_PROBEDEV) &&
277 	    (probe_err_cnt == total_comp_cnt))
278 		return (ENXIO);
279 	else
280 		return (0);
281 }
282 
283 int
284 stripe_build_incore(void *p, int snarfing)
285 {
286 	ms_unit_t *un = (ms_unit_t *)p;
287 	struct ms_comp	*mdcomp;
288 	minor_t		mnum;
289 	int		row;
290 	int		i;
291 	int		c;
292 	int		ncomps;
293 
294 	mnum = MD_SID(un);
295 
296 	if (MD_UNIT(mnum) != NULL)
297 		return (0);
298 
299 	MD_STATUS(un) = 0;
300 
301 	/*
302 	 * Reset all the is_open flags, these are probably set
303 	 * cause they just came out of the database.
304 	 */
305 	mdcomp = (struct ms_comp *)((void *)&((char *)un)[un->un_ocomp]);
306 
307 	ncomps = 0;
308 	for (row = 0; row < un->un_nrows; row++) {
309 		struct ms_row *mdr = &un->un_row[row];
310 		ncomps += mdr->un_ncomp;
311 	}
312 
313 	for (row = 0; row < un->un_nrows; row++) {
314 		struct ms_row *mdr = &un->un_row[row];
315 		for (i = 0, c = mdr->un_icomp; i < mdr->un_ncomp; i++) {
316 			struct ms_comp		*mdc;
317 			set_t			setno;
318 			md_dev64_t		tmpdev;
319 
320 			mdc = &mdcomp[c++];
321 			mdc->un_mirror.ms_flags &=
322 			    ~(MDM_S_ISOPEN | MDM_S_IOERR | MDM_S_RS_TRIED);
323 
324 			if (!snarfing)
325 				continue;
326 
327 			setno = MD_MIN2SET(mnum);
328 
329 			tmpdev = md_getdevnum(setno, mddb_getsidenum(setno),
330 			    mdc->un_key, MD_NOTRUST_DEVT);
331 			mdc->un_dev = tmpdev;
332 			/*
333 			 * Check for hotspares. If the hotspares haven't been
334 			 * snarfed yet, stripe_open_all_devs() will do the
335 			 * remapping of the dev's later.
336 			 */
337 			if (mdc->un_mirror.ms_hs_id != 0) {
338 				mdc->un_mirror.ms_orig_dev = mdc->un_dev;
339 				(void) md_hot_spare_ifc(HS_MKDEV, 0, 0,
340 				    0, &mdc->un_mirror.ms_hs_id, NULL,
341 				    &tmpdev, NULL);
342 				mdc->un_dev = tmpdev;
343 			}
344 		}
345 	}
346 
347 	/* place various information in the in-core data structures */
348 	md_nblocks_set(mnum, un->c.un_total_blocks);
349 	MD_UNIT(mnum) = un;
350 
351 	return (0);
352 }
353 
354 void
355 reset_stripe(ms_unit_t *un, minor_t mnum, int removing)
356 {
357 	ms_comp_t	*mdcomp;
358 	struct ms_row	*mdr;
359 	int		i, c;
360 	int		row;
361 	int		nsv;
362 	int		isv;
363 	sv_dev_t	*sv;
364 	mddb_recid_t	*recids;
365 	mddb_recid_t	vtoc_id;
366 	int		rid = 0;
367 
368 	md_destroy_unit_incore(mnum, &stripe_md_ops);
369 
370 	md_nblocks_set(mnum, -1ULL);
371 	MD_UNIT(mnum) = NULL;
372 
373 	/*
374 	 * Attempt release of its minor node
375 	 */
376 	md_remove_minor_node(mnum);
377 
378 	if (!removing)
379 		return;
380 
381 	nsv = 0;
382 	/* Count the number of devices */
383 	for (row = 0; row < un->un_nrows; row++) {
384 		mdr = &un->un_row[row];
385 		nsv += mdr->un_ncomp;
386 	}
387 	sv = (sv_dev_t *)kmem_alloc(sizeof (sv_dev_t) * nsv, KM_SLEEP);
388 
389 	/*
390 	 * allocate recids array.  since we may have to commit
391 	 * underlying soft partition records, we need an array
392 	 * of size: total number of components in stripe + 3
393 	 * (one for the stripe itself, one for the hotspare, one
394 	 * for the end marker).
395 	 */
396 	recids = kmem_alloc(sizeof (mddb_recid_t) * (nsv + 3), KM_SLEEP);
397 
398 	/*
399 	 * Save the md_dev64_t's and driver nm indexes.
400 	 * Because after the mddb_deleterec() we will
401 	 * not be able to access the unit structure.
402 	 *
403 	 * NOTE: Deleting the names before deleting the
404 	 *	 unit structure would cause problems if
405 	 *	 the machine crashed in between the two.
406 	 */
407 	isv = 0;
408 	mdcomp = (struct ms_comp *)((void *)&((char *)un)[un->un_ocomp]);
409 
410 	for (row = 0; row < un->un_nrows; row++) {
411 		mdr = &un->un_row[row];
412 		for (i = 0, c = mdr->un_icomp; i < mdr->un_ncomp; i++) {
413 			struct ms_comp	*mdc;
414 			md_dev64_t	child_dev;
415 			md_unit_t	*child_un;
416 
417 			mdc = &mdcomp[c++];
418 			if (mdc->un_mirror.ms_hs_id != 0) {
419 				mdkey_t		hs_key;
420 
421 				hs_key = mdc->un_mirror.ms_hs_key;
422 
423 				mdc->un_dev = mdc->un_mirror.ms_orig_dev;
424 				mdc->un_start_block =
425 				    mdc->un_mirror.ms_orig_blk;
426 				mdc->un_mirror.ms_hs_id = 0;
427 				mdc->un_mirror.ms_hs_key = 0;
428 				mdc->un_mirror.ms_orig_dev = 0;
429 				recids[0] = 0;
430 				recids[1] = 0;	/* recids[1] filled in below */
431 				recids[2] = 0;
432 				(void) md_hot_spare_ifc(HS_FREE, un->un_hsp_id,
433 				    0, 0, &recids[0], &hs_key, NULL, NULL);
434 				mddb_commitrecs_wrapper(recids);
435 			}
436 
437 			/*
438 			 * check if we've got metadevice below us and
439 			 * deparent it if we do.
440 			 * NOTE: currently soft partitions are the
441 			 * the only metadevices stripes can be
442 			 * built on top of.
443 			 */
444 			child_dev = mdc->un_dev;
445 			if (md_getmajor(child_dev) == md_major) {
446 				child_un = MD_UNIT(md_getminor(child_dev));
447 				md_reset_parent(child_dev);
448 				recids[rid++] = MD_RECID(child_un);
449 			}
450 
451 			sv[isv].setno = MD_MIN2SET(mnum);
452 			sv[isv++].key = mdc->un_key;
453 		}
454 	}
455 
456 	recids[rid++] = un->c.un_record_id;
457 	recids[rid] = 0;	/* filled in below */
458 
459 	/*
460 	 * Decrement the HSP reference count and
461 	 * remove the knowledge of the HSP from the unit struct.
462 	 * This is done atomically to remove a window.
463 	 */
464 	if (un->un_hsp_id != -1) {
465 		(void) md_hot_spare_ifc(HSP_DECREF, un->un_hsp_id, 0, 0,
466 		    &recids[rid++], NULL, NULL, NULL);
467 		un->un_hsp_id = -1;
468 	}
469 
470 	/* set end marker and commit records */
471 	recids[rid] = 0;
472 	mddb_commitrecs_wrapper(recids);
473 
474 	vtoc_id = un->c.un_vtoc_id;
475 
476 	/*
477 	 * Remove self from the namespace
478 	 */
479 	if (un->c.un_revision & MD_FN_META_DEV) {
480 		(void) md_rem_selfname(un->c.un_self_id);
481 	}
482 
483 	/* Remove the unit structure */
484 	mddb_deleterec_wrapper(un->c.un_record_id);
485 
486 	/* Remove the vtoc, if present */
487 	if (vtoc_id)
488 		mddb_deleterec_wrapper(vtoc_id);
489 
490 	SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_DELETE, SVM_TAG_METADEVICE,
491 	    MD_MIN2SET(mnum), MD_MIN2UNIT(mnum));
492 	md_rem_names(sv, nsv);
493 	kmem_free(sv, sizeof (sv_dev_t) * nsv);
494 	kmem_free(recids, sizeof (mddb_recid_t) * (nsv + 3));
495 }
496 
497 static void
498 stripe_error(md_sps_t *ps)
499 {
500 	struct buf	*pb = ps->ps_bp;
501 	mdi_unit_t	*ui = ps->ps_ui;
502 	md_dev64_t	dev = ps->ps_errcomp->un_dev;
503 	md_dev64_t	md_dev = md_expldev(pb->b_edev);
504 	char		*str;
505 
506 	if (pb->b_flags & B_READ) {
507 		ps->ps_errcomp->un_mirror.ms_flags |= MDM_S_READERR;
508 		str = "read";
509 	} else {
510 		ps->ps_errcomp->un_mirror.ms_flags |= MDM_S_WRTERR;
511 		str = "write";
512 	}
513 	if (!(ps->ps_flags & MD_SPS_DONTFREE)) {
514 		if (MUTEX_HELD(&ps->ps_mx)) {
515 			mutex_exit(&ps->ps_mx);
516 		}
517 	} else {
518 		ASSERT(panicstr);
519 	}
520 	SPS_FREE(stripe_parent_cache, ps);
521 	pb->b_flags |= B_ERROR;
522 
523 	md_kstat_done(ui, pb, 0);
524 	md_unit_readerexit(ui);
525 	md_biodone(pb);
526 
527 	cmn_err(CE_WARN, "md: %s: %s error on %s",
528 	    md_shortname(md_getminor(md_dev)), str,
529 	    md_devname(MD_DEV2SET(md_dev), dev, NULL, 0));
530 }
531 
532 static int
533 stripe_done(struct buf *cb)
534 {
535 	struct buf	*pb;
536 	mdi_unit_t	*ui;
537 	md_sps_t	*ps;
538 	md_scs_t	*cs;
539 
540 	/*LINTED*/
541 	cs = (md_scs_t *)((caddr_t)cb - md_stripe_mcs_buf_off);
542 	ps = cs->cs_ps;
543 	pb = ps->ps_bp;
544 
545 	mutex_enter(&ps->ps_mx);
546 	if (cb->b_flags & B_ERROR) {
547 		ps->ps_flags |= MD_SPS_ERROR;
548 		pb->b_error = cb->b_error;
549 		ps->ps_errcomp = cs->cs_comp;
550 	}
551 
552 	if (cb->b_flags & B_REMAPPED)
553 		bp_mapout(cb);
554 
555 	ps->ps_frags--;
556 	if (ps->ps_frags != 0) {
557 		mutex_exit(&ps->ps_mx);
558 		kmem_cache_free(stripe_child_cache, cs);
559 		return (1);
560 	}
561 	kmem_cache_free(stripe_child_cache, cs);
562 	if (ps->ps_flags & MD_SPS_ERROR) {
563 		stripe_error(ps);
564 		return (1);
565 	}
566 	ui = ps->ps_ui;
567 	if (!(ps->ps_flags & MD_SPS_DONTFREE)) {
568 		mutex_exit(&ps->ps_mx);
569 	} else {
570 		ASSERT(panicstr);
571 	}
572 	SPS_FREE(stripe_parent_cache, ps);
573 	md_kstat_done(ui, pb, 0);
574 	md_unit_readerexit(ui);
575 	md_biodone(pb);
576 	return (0);
577 }
578 
579 
580 /*
581  * This routine does the mapping from virtual (dev, blkno) of a metapartition
582  * to the real (dev, blkno) of a real disk partition.
583  * It goes to the md_conf[] table to find out the correct real partition
584  * dev and block number for this buffer.
585  *
586  * A single buf request can not go across real disk partition boundary.
587  * When the virtual request specified by (dev, blkno) spans more than one
588  * real partition, md_mapbuf will return 1. Then the caller should prepare
589  * another real buf and continue calling md_mapbuf to do the mapping until
590  * it returns 0.
591  *
592  */
593 
594 static int
595 md_mapbuf(
596 	ms_unit_t	*un,
597 	diskaddr_t	blkno,
598 	u_longlong_t	bcount,
599 	buf_t		*bp,	/* if bp==NULL, skip bp updates */
600 	ms_comp_t	**mdc)	/* if bp==NULL, skip mdc update */
601 {
602 	struct ms_row	*mdr;
603 	struct ms_comp	*mdcomp;
604 	diskaddr_t	stripe_blk;
605 	diskaddr_t	fragment, blk_in_row, endblk;
606 	offset_t	interlace;
607 	size_t		dev_index;
608 	int		row_index, more;
609 	extern unsigned md_maxphys;
610 	/* Work var's when bp==NULL */
611 	u_longlong_t	wb_bcount;
612 	diskaddr_t	wb_blkno;
613 	md_dev64_t	wb_edev;
614 	ms_comp_t	*wmdc;
615 
616 	/*
617 	 * Do a real calculation to derive the minor device of the
618 	 * Virtual Disk, which in turn will let us derive the
619 	 * device/minor of the underlying real device.
620 	 */
621 
622 
623 	for (row_index = 0; row_index < un->un_nrows; row_index++) {
624 		mdr = &un->un_row[row_index];
625 		if (blkno < mdr->un_cum_blocks)
626 			break;
627 	}
628 	ASSERT(row_index != un->un_nrows);
629 
630 	mdcomp = (struct ms_comp *)((void *)&((char *)un)[un->un_ocomp]);
631 
632 	blk_in_row = blkno - mdr->un_cum_blocks + mdr->un_blocks;
633 	endblk = (diskaddr_t)(blkno + howmany(bcount, DEV_BSIZE));
634 	if (mdr->un_ncomp == 1) { /* No striping */
635 		if (endblk > mdr->un_cum_blocks) {
636 			wb_bcount = ldbtob(mdr->un_cum_blocks - blkno);
637 			if ((row_index + 1) == un->un_nrows)
638 				more = 0;
639 			else
640 				more = 1;
641 		} else {
642 			wb_bcount = bcount;
643 			more = 0;
644 		}
645 		wmdc = &mdcomp[mdr->un_icomp];
646 		wb_blkno = blk_in_row;
647 	} else { /* Have striping */
648 		interlace = mdr->un_interlace;
649 		fragment = blk_in_row % interlace;
650 		if (bcount > ldbtob(interlace - fragment)) {
651 			more = 1;
652 			wb_bcount = ldbtob(interlace - fragment);
653 		} else {
654 			more = 0;
655 			wb_bcount = bcount;
656 		}
657 
658 		stripe_blk = blk_in_row / interlace;
659 		dev_index = (size_t)(stripe_blk % mdr->un_ncomp);
660 		wmdc = &mdcomp[mdr->un_icomp + dev_index];
661 		wb_blkno = (diskaddr_t)(((stripe_blk / mdr->un_ncomp) *
662 		    interlace) + fragment);
663 	}
664 
665 	wb_blkno += wmdc->un_start_block;
666 	wb_edev = wmdc->un_dev;
667 
668 	/* only break up the I/O if we're not built on another metadevice */
669 	if ((md_getmajor(wb_edev) != md_major) && (wb_bcount > md_maxphys)) {
670 		wb_bcount = md_maxphys;
671 		more = 1;
672 	}
673 	if (bp != (buf_t *)NULL) {
674 		/*
675 		 * wb_bcount is limited by md_maxphys which is 'int'
676 		 */
677 		bp->b_bcount = (size_t)wb_bcount;
678 		bp->b_lblkno = wb_blkno;
679 		bp->b_edev = md_dev64_to_dev(wb_edev);
680 		*mdc = wmdc;
681 	}
682 	return (more);
683 }
684 
685 static void
686 md_stripe_strategy(buf_t *pb, int flag, void *private)
687 {
688 	md_sps_t	*ps;
689 	md_scs_t	*cs;
690 	int		doing_writes;
691 	int		more;
692 	ms_unit_t	*un;
693 	mdi_unit_t	*ui;
694 	size_t		current_count;
695 	diskaddr_t	current_blkno;
696 	off_t		current_offset;
697 	buf_t		*cb;		/* child buf pointer */
698 	set_t		setno;
699 
700 	setno = MD_MIN2SET(getminor(pb->b_edev));
701 
702 	/*
703 	 * When doing IO to a multi owner meta device, check if set is halted.
704 	 * We do this check without the needed lock held, for performance
705 	 * reasons.
706 	 * If an IO just slips through while the set is locked via an
707 	 * MD_MN_SUSPEND_SET, we don't care about it.
708 	 * Only check for a suspended set if we are a top-level i/o request
709 	 * (MD_STR_NOTTOP is cleared in 'flag').
710 	 */
711 	if ((md_set[setno].s_status & (MD_SET_HALTED | MD_SET_MNSET)) ==
712 	    (MD_SET_HALTED | MD_SET_MNSET)) {
713 		if ((flag & MD_STR_NOTTOP) == 0) {
714 			mutex_enter(&md_mx);
715 			/* Here we loop until the set is no longer halted */
716 			while (md_set[setno].s_status & MD_SET_HALTED) {
717 				cv_wait(&md_cv, &md_mx);
718 			}
719 			mutex_exit(&md_mx);
720 		}
721 	}
722 
723 	ui = MDI_UNIT(getminor(pb->b_edev));
724 
725 	md_kstat_waitq_enter(ui);
726 
727 	un = (ms_unit_t *)md_unit_readerlock(ui);
728 
729 	if ((flag & MD_NOBLOCK) == 0) {
730 		if (md_inc_iocount(setno) != 0) {
731 			pb->b_flags |= B_ERROR;
732 			pb->b_error = ENXIO;
733 			pb->b_resid = pb->b_bcount;
734 			md_kstat_waitq_exit(ui);
735 			md_unit_readerexit(ui);
736 			biodone(pb);
737 			return;
738 		}
739 	} else {
740 		md_inc_iocount_noblock(setno);
741 	}
742 
743 	if (!(flag & MD_STR_NOTTOP)) {
744 		if (md_checkbuf(ui, (md_unit_t *)un, pb) != 0) {
745 			md_kstat_waitq_exit(ui);
746 			return;
747 		}
748 	}
749 
750 	ps = kmem_cache_alloc(stripe_parent_cache, MD_ALLOCFLAGS);
751 	stripe_parent_init(ps);
752 
753 	/*
754 	 * Save essential information from the original buffhdr
755 	 * in the md_save structure.
756 	 */
757 	ps->ps_un = un;
758 	ps->ps_ui = ui;
759 	ps->ps_bp = pb;
760 	ps->ps_addr = pb->b_un.b_addr;
761 
762 	if ((pb->b_flags & B_READ) == 0)
763 		doing_writes = 1;
764 	else
765 		doing_writes = 0;
766 
767 
768 	current_count = pb->b_bcount;
769 	current_blkno = pb->b_lblkno;
770 	current_offset  = 0;
771 
772 	if (!(flag & MD_STR_NOTTOP) && panicstr)
773 		ps->ps_flags |= MD_SPS_DONTFREE;
774 
775 	md_kstat_waitq_to_runq(ui);
776 
777 	ps->ps_frags++;
778 	do {
779 		cs = kmem_cache_alloc(stripe_child_cache, MD_ALLOCFLAGS);
780 		stripe_child_init(cs);
781 		cb = &cs->cs_buf;
782 		cs->cs_ps = ps;
783 		more = md_mapbuf(un, current_blkno, current_count, cb,
784 		    &cs->cs_comp);
785 
786 		cb = md_bioclone(pb, current_offset, cb->b_bcount, cb->b_edev,
787 		    cb->b_lblkno, stripe_done, cb, KM_NOSLEEP);
788 		/*
789 		 * Do these calculations now,
790 		 *  so that we pickup a valid b_bcount from the chld_bp.
791 		 */
792 		current_offset += cb->b_bcount;
793 		current_count -=  cb->b_bcount;
794 		current_blkno +=  (diskaddr_t)(lbtodb(cb->b_bcount));
795 
796 		if (more) {
797 			mutex_enter(&ps->ps_mx);
798 			ps->ps_frags++;
799 			mutex_exit(&ps->ps_mx);
800 		}
801 
802 		if (doing_writes &&
803 		    cs->cs_comp->un_mirror.ms_flags & MDM_S_NOWRITE) {
804 			(void) stripe_done(cb);
805 			continue;
806 		}
807 		md_call_strategy(cb, flag, private);
808 	} while (more);
809 
810 	if (!(flag & MD_STR_NOTTOP) && panicstr) {
811 		while (!(ps->ps_flags & MD_SPS_DONE)) {
812 			md_daemon(1, &md_done_daemon);
813 			drv_usecwait(10);
814 		}
815 		kmem_cache_free(stripe_parent_cache, ps);
816 	}
817 }
818 
819 static int
820 stripe_snarf(md_snarfcmd_t cmd, set_t setno)
821 {
822 	ms_unit_t	*un;
823 	mddb_recid_t	recid;
824 	int		gotsomething;
825 	int		all_stripes_gotten;
826 	mddb_type_t	typ1;
827 	mddb_de_ic_t	*dep;
828 	mddb_rb32_t	*rbp;
829 	size_t		newreqsize;
830 	ms_unit_t	*big_un;
831 	ms_unit32_od_t	*small_un;
832 
833 
834 	if (cmd == MD_SNARF_CLEANUP)
835 		return (0);
836 
837 	all_stripes_gotten = 1;
838 	gotsomething = 0;
839 
840 	typ1 = (mddb_type_t)md_getshared_key(setno,
841 	    stripe_md_ops.md_driver.md_drivername);
842 	recid = mddb_makerecid(setno, 0);
843 
844 	while ((recid = mddb_getnextrec(recid, typ1, 0)) > 0) {
845 		if (mddb_getrecprivate(recid) & MD_PRV_GOTIT)
846 			continue;
847 
848 		dep = mddb_getrecdep(recid);
849 		dep->de_flags = MDDB_F_STRIPE;
850 		rbp = dep->de_rb;
851 
852 		switch (rbp->rb_revision) {
853 		case MDDB_REV_RB:
854 		case MDDB_REV_RBFN:
855 			if ((rbp->rb_private & MD_PRV_CONVD) == 0) {
856 				/*
857 				 * This means, we have an old and small record
858 				 * and this record hasn't already been
859 				 * converted.  Before we create an incore
860 				 * metadevice from this we have to convert it to
861 				 * a big record.
862 				 */
863 				small_un =
864 				    (ms_unit32_od_t *)mddb_getrecaddr(recid);
865 				newreqsize = get_big_stripe_req_size(small_un,
866 				    COMPLETE_STRUCTURE);
867 				big_un = (ms_unit_t *)kmem_zalloc(newreqsize,
868 				    KM_SLEEP);
869 				stripe_convert((caddr_t)small_un,
870 				    (caddr_t)big_un, SMALL_2_BIG);
871 				kmem_free(small_un, dep->de_reqsize);
872 				dep->de_rb_userdata = big_un;
873 				dep->de_reqsize = newreqsize;
874 				un = big_un;
875 				rbp->rb_private |= MD_PRV_CONVD;
876 			} else {
877 				/* Small device had already been converted */
878 				un = (ms_unit_t *)mddb_getrecaddr(recid);
879 			}
880 			un->c.un_revision &= ~MD_64BIT_META_DEV;
881 			break;
882 		case MDDB_REV_RB64:
883 		case MDDB_REV_RB64FN:
884 			/* Big device */
885 			un = (ms_unit_t *)mddb_getrecaddr(recid);
886 			un->c.un_revision |= MD_64BIT_META_DEV;
887 			un->c.un_flag |= MD_EFILABEL;
888 			break;
889 		}
890 		MDDB_NOTE_FN(rbp->rb_revision, un->c.un_revision);
891 
892 		/* Create minor node for snarfed unit. */
893 		(void) md_create_minor_node(MD_MIN2SET(MD_SID(un)), MD_SID(un));
894 
895 		if (MD_UNIT(MD_SID(un)) != NULL) {
896 			mddb_setrecprivate(recid, MD_PRV_PENDDEL);
897 			continue;
898 		}
899 		all_stripes_gotten = 0;
900 		if (stripe_build_incore((void *)un, 1) == 0) {
901 			mddb_setrecprivate(recid, MD_PRV_GOTIT);
902 			md_create_unit_incore(MD_SID(un), &stripe_md_ops, 0);
903 			gotsomething = 1;
904 		}
905 	}
906 
907 	if (!all_stripes_gotten)
908 		return (gotsomething);
909 
910 	recid = mddb_makerecid(setno, 0);
911 	while ((recid = mddb_getnextrec(recid, typ1, 0)) > 0)
912 		if (!(mddb_getrecprivate(recid) & MD_PRV_GOTIT))
913 			mddb_setrecprivate(recid, MD_PRV_PENDDEL);
914 
915 	return (0);
916 }
917 
918 static int
919 stripe_halt(md_haltcmd_t cmd, set_t setno)
920 {
921 	int		i;
922 	mdi_unit_t	*ui;
923 	minor_t		mnum;
924 
925 	if (cmd == MD_HALT_CLOSE)
926 		return (0);
927 
928 	if (cmd == MD_HALT_OPEN)
929 		return (0);
930 
931 	if (cmd == MD_HALT_UNLOAD)
932 		return (0);
933 
934 	if (cmd == MD_HALT_CHECK) {
935 		for (i = 0; i < md_nunits; i++) {
936 			mnum = MD_MKMIN(setno, i);
937 			if ((ui = MDI_UNIT(mnum)) == NULL)
938 				continue;
939 			if (ui->ui_opsindex != stripe_md_ops.md_selfindex)
940 				continue;
941 			if (md_unit_isopen(ui))
942 				return (1);
943 		}
944 		return (0);
945 	}
946 
947 	if (cmd != MD_HALT_DOIT)
948 		return (1);
949 
950 	for (i = 0; i < md_nunits; i++) {
951 		mnum = MD_MKMIN(setno, i);
952 		if ((ui = MDI_UNIT(mnum)) == NULL)
953 			continue;
954 		if (ui->ui_opsindex != stripe_md_ops.md_selfindex)
955 			continue;
956 		reset_stripe((ms_unit_t *)MD_UNIT(mnum), mnum, 0);
957 	}
958 
959 	return (0);
960 }
961 
962 /*ARGSUSED3*/
963 static int
964 stripe_open(dev_t *dev, int flag, int otyp, cred_t *cred_p, int md_oflags)
965 {
966 	minor_t		mnum = getminor(*dev);
967 	mdi_unit_t	*ui = MDI_UNIT(mnum);
968 	ms_unit_t	*un;
969 	int		err = 0;
970 	set_t		setno;
971 
972 	/*
973 	 * When doing an open of a multi owner metadevice, check to see if this
974 	 * node is a starting node and if a reconfig cycle is underway.
975 	 * If so, the system isn't sufficiently set up enough to handle the
976 	 * open (which involves I/O during sp_validate), so fail with ENXIO.
977 	 */
978 	setno = MD_MIN2SET(mnum);
979 	if ((md_set[setno].s_status & (MD_SET_MNSET | MD_SET_MN_START_RC)) ==
980 	    (MD_SET_MNSET | MD_SET_MN_START_RC)) {
981 			return (ENXIO);
982 	}
983 
984 	/* single thread */
985 	un = (ms_unit_t *)md_unit_openclose_enter(ui);
986 
987 	/* open devices, if necessary */
988 	if (! md_unit_isopen(ui) || (md_oflags & MD_OFLG_PROBEDEV)) {
989 		if ((err = stripe_open_all_devs(un, md_oflags)) != 0) {
990 			goto out;
991 		}
992 	}
993 
994 	/* count open */
995 	if ((err = md_unit_incopen(mnum, flag, otyp)) != 0)
996 		goto out;
997 
998 	/* unlock, return success */
999 out:
1000 	md_unit_openclose_exit(ui);
1001 	return (err);
1002 }
1003 
1004 /*ARGSUSED1*/
1005 static int
1006 stripe_close(
1007 	dev_t		dev,
1008 	int		flag,
1009 	int		otyp,
1010 	cred_t		*cred_p,
1011 	int		md_cflags
1012 )
1013 {
1014 	minor_t		mnum = getminor(dev);
1015 	mdi_unit_t	*ui = MDI_UNIT(mnum);
1016 	ms_unit_t	*un;
1017 	int		err = 0;
1018 
1019 	/* single thread */
1020 	un = (ms_unit_t *)md_unit_openclose_enter(ui);
1021 
1022 	/* count closed */
1023 	if ((err = md_unit_decopen(mnum, otyp)) != 0)
1024 		goto out;
1025 
1026 	/* close devices, if necessary */
1027 	if (! md_unit_isopen(ui) || (md_cflags & MD_OFLG_PROBEDEV)) {
1028 		stripe_close_all_devs(un, md_cflags);
1029 	}
1030 
1031 	/* unlock, return success */
1032 out:
1033 	md_unit_openclose_exit(ui);
1034 	return (err);
1035 }
1036 
1037 
1038 static struct buf dumpbuf;
1039 
1040 /*
1041  * This routine dumps memory to the disk.  It assumes that the memory has
1042  * already been mapped into mainbus space.  It is called at disk interrupt
1043  * priority when the system is in trouble.
1044  *
1045  */
1046 static int
1047 stripe_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk)
1048 {
1049 	ms_unit_t	*un;
1050 	buf_t		*bp;
1051 	ms_comp_t	*mdc;
1052 	u_longlong_t	nb;
1053 	diskaddr_t	mapblk;
1054 	int		result;
1055 	int		more;
1056 	int		saveresult = 0;
1057 
1058 	/*
1059 	 * Don't need to grab the unit lock.
1060 	 * Cause nothing else is suppose to be happenning.
1061 	 * Also dump is not suppose to sleep.
1062 	 */
1063 	un = (ms_unit_t *)MD_UNIT(getminor(dev));
1064 
1065 	if ((diskaddr_t)blkno >= un->c.un_total_blocks)
1066 		return (EINVAL);
1067 
1068 	if ((diskaddr_t)blkno + nblk > un->c.un_total_blocks)
1069 		return (EINVAL);
1070 
1071 	bp = &dumpbuf;
1072 	nb = ldbtob(nblk);
1073 	do {
1074 		bzero((caddr_t)bp, sizeof (*bp));
1075 		more = md_mapbuf(un, (diskaddr_t)blkno, nb, bp, &mdc);
1076 		nblk = btodb(bp->b_bcount);
1077 		mapblk = bp->b_lblkno;
1078 		if (!(mdc->un_mirror.ms_flags & MDM_S_NOWRITE)) {
1079 			/*
1080 			 * bdev_dump() is currently only able to take
1081 			 * 32 bit wide blkno's.
1082 			 */
1083 			result = bdev_dump(bp->b_edev, addr, (daddr_t)mapblk,
1084 			    nblk);
1085 			if (result)
1086 				saveresult = result;
1087 		}
1088 
1089 		nb -= bp->b_bcount;
1090 		addr += bp->b_bcount;
1091 		blkno += nblk;
1092 	} while (more);
1093 
1094 	return (saveresult);
1095 }
1096 
1097 /*ARGSUSED*/
1098 static intptr_t
1099 stripe_shared_by_blk(
1100 	md_dev64_t dev,
1101 	void *junk,
1102 	diskaddr_t blkno,
1103 	u_longlong_t *cnt)
1104 {
1105 	ms_unit_t	*un;
1106 	buf_t		bp;
1107 	ms_comp_t	*comp;
1108 
1109 	un = MD_UNIT(md_getminor(dev));
1110 	(void) md_mapbuf(un, blkno, ldbtob(*cnt), &bp, &comp);
1111 	*cnt = (u_longlong_t)lbtodb(bp.b_bcount);
1112 	return ((intptr_t)&comp->un_mirror);
1113 }
1114 
1115 /*
1116  * stripe_block_count_skip_size() returns the following values
1117  *	so that the logical to physical block mappings can
1118  *	be calculated without intimate knowledge of the underpinnings.
1119  *
1120  *	block - first logical block number of the device.
1121  *		block = [ # of blocks before THE row ] +
1122  *			[ # of blocks in THE row before the component ]
1123  *	count - # of segments (interlaced size).
1124  *	skip  - # of logical blocks between segments, or delta to
1125  *		  get to next segment
1126  *	size  - interlace size used for the block, count, skip.
1127  */
1128 /*ARGSUSED*/
1129 static intptr_t
1130 stripe_block_count_skip_size(
1131 	md_dev64_t	 dev,
1132 	void		*junk,
1133 	int		ci,
1134 	diskaddr_t	*block,
1135 	size_t		*count,
1136 	u_longlong_t	*skip,
1137 	u_longlong_t	*size)
1138 {
1139 	ms_unit_t	*un;
1140 	int		row;
1141 	struct ms_row	*mdr;
1142 	int		cmpcount = 0;
1143 
1144 	un = MD_UNIT(md_getminor(dev));
1145 
1146 	for (row = 0; row < un->un_nrows; row++) {
1147 		mdr = &un->un_row[row];
1148 		if ((mdr->un_ncomp + cmpcount) > ci)
1149 			break;
1150 		cmpcount += mdr->un_ncomp;
1151 	}
1152 	ASSERT(row != un->un_nrows);
1153 
1154 	/*
1155 	 * Concatenations are always contiguous blocks,
1156 	 * you cannot depend on the interlace being a usable
1157 	 * value (except for stripes).
1158 	 */
1159 	if (mdr->un_ncomp == 1) {	/* Concats */
1160 		*block = mdr->un_cum_blocks - mdr->un_blocks;
1161 		*count = 1;
1162 		*skip = 0;
1163 		*size = mdr->un_blocks;
1164 	} else {			/* Stripes */
1165 		*block = (mdr->un_cum_blocks - mdr->un_blocks) +
1166 		    ((ci - cmpcount) * mdr->un_interlace);
1167 		*count	= (size_t)(mdr->un_blocks / (mdr->un_interlace *
1168 		    mdr->un_ncomp));
1169 		*skip = (mdr->un_interlace * mdr->un_ncomp) - mdr->un_interlace;
1170 		*size = mdr->un_interlace;
1171 	}
1172 
1173 	return (0);
1174 }
1175 
1176 /*ARGSUSED*/
1177 static intptr_t
1178 stripe_shared_by_indx(md_dev64_t dev, void *junk, int indx)
1179 {
1180 	ms_unit_t	*un;
1181 	ms_comp_t	*comp;
1182 
1183 	un = MD_UNIT(md_getminor(dev));
1184 	comp = (struct ms_comp *)((void *)&((char *)un)[un->un_ocomp]);
1185 	comp += indx;
1186 	return ((intptr_t)&comp->un_mirror);
1187 }
1188 
1189 /*ARGSUSED*/
1190 intptr_t
1191 stripe_component_count(md_dev64_t dev, void *junk)
1192 {
1193 	/*
1194 	 * See comments for stripe_get_dev
1195 	 */
1196 
1197 	ms_unit_t	*un;
1198 	int		count = 0;
1199 	int		row;
1200 
1201 	un = MD_UNIT(md_getminor(dev));
1202 	for (row = 0; row < un->un_nrows; row++)
1203 		count += un->un_row[row].un_ncomp;
1204 	return (count);
1205 }
1206 
1207 /*ARGSUSED*/
1208 intptr_t
1209 stripe_get_dev(md_dev64_t dev, void *junk, int indx, ms_cd_info_t *cd)
1210 {
1211 	/*
1212 	 * It should be noted that stripe_replace in stripe_ioctl.c calls this
1213 	 * routine using makedevice(0, minor) for the first argument.
1214 	 *
1215 	 * If this routine at some point in the future needs to use the major
1216 	 * number stripe_replace must be changed.
1217 	 */
1218 
1219 	ms_unit_t	*un;
1220 	ms_comp_t	*comp;
1221 	md_dev64_t	tmpdev;
1222 
1223 	un = MD_UNIT(md_getminor(dev));
1224 	comp = (struct ms_comp *)((void *)&((char *)un)[un->un_ocomp]);
1225 	comp += indx;
1226 	tmpdev = comp->un_dev;
1227 	/*
1228 	 * Try to resolve devt again if NODEV64
1229 	 * Check if this comp is hotspared and if it is
1230 	 * then use key for hotspare
1231 	 */
1232 	if (tmpdev == NODEV64) {
1233 		tmpdev = md_resolve_bydevid(md_getminor(dev), tmpdev,
1234 		    comp->un_mirror.ms_hs_id ?
1235 		    comp->un_mirror.ms_hs_key :
1236 		    comp->un_key);
1237 		comp->un_dev = tmpdev;
1238 	}
1239 
1240 	cd->cd_dev = comp->un_dev;
1241 	cd->cd_orig_dev = comp->un_mirror.ms_orig_dev;
1242 	return (0);
1243 }
1244 
1245 /*ARGSUSED*/
1246 void
1247 stripe_replace_done(md_dev64_t dev, sv_dev_t *sv)
1248 {
1249 	/*
1250 	 * See comments for stripe_get_dev
1251 	 */
1252 
1253 	minor_t		mnum = md_getminor(dev);
1254 
1255 	if (sv != NULL) {
1256 		md_rem_names(sv, 1);
1257 		kmem_free(sv, sizeof (sv_dev_t));
1258 	}
1259 
1260 	md_unit_writerexit(MDI_UNIT(mnum));
1261 }
1262 
1263 /*ARGSUSED*/
1264 intptr_t
1265 stripe_replace_dev(md_dev64_t dev, void *junk, int ci, ms_new_dev_t *nd,
1266     mddb_recid_t *recids, int nrecids, void (**replace_done)(),
1267     void **replace_data)
1268 {
1269 	minor_t		mnum;
1270 	ms_unit_t	*un;
1271 	mdi_unit_t	*ui;
1272 	ms_comp_t	*comp;
1273 	diskaddr_t	dev_size;
1274 	int		row;
1275 	int		ncomps = 0;
1276 	int		cmpcount = 0;
1277 	int		rid = 0;
1278 	struct ms_row	*mdr;
1279 	sv_dev_t	*sv = NULL;
1280 	mddb_recid_t	hs_id = 0;
1281 	set_t		setno;
1282 	side_t		side;
1283 	md_dev64_t	this_dev;
1284 	md_dev64_t	old_dev;
1285 
1286 	mnum = md_getminor(dev);
1287 	ui = MDI_UNIT(mnum);
1288 	setno = MD_MIN2SET(mnum);
1289 	side = mddb_getsidenum(setno);
1290 
1291 	un = md_unit_writerlock(ui);
1292 
1293 	*replace_data = NULL;
1294 	comp = (struct ms_comp *)((void *)&((char *)un)[un->un_ocomp]);
1295 
1296 	comp += ci;
1297 	old_dev = comp->un_dev;
1298 
1299 	/*
1300 	 * Count the number of components
1301 	 */
1302 	for (row = 0; row < un->un_nrows; row++) {
1303 		struct ms_row *mdr = &un->un_row[row];
1304 		ncomps += mdr->un_ncomp;
1305 	}
1306 
1307 	recids[0] = 0;
1308 	/*
1309 	 * No need of checking size of new device,
1310 	 * when hotsparing (it has already been done), or
1311 	 * when enabling the device.
1312 	 */
1313 	if ((nd != NULL) && (nd->nd_hs_id == 0)) {
1314 		for (row = 0; row < un->un_nrows; row++) {
1315 			mdr = &un->un_row[row];
1316 			if ((mdr->un_ncomp + cmpcount) > ci)
1317 				break;
1318 			cmpcount += mdr->un_ncomp;
1319 		}
1320 		ASSERT(row != un->un_nrows);
1321 
1322 		/* Concatenations have a ncomp = 1 */
1323 		dev_size = mdr->un_blocks / mdr->un_ncomp;
1324 
1325 		/*
1326 		 * now check to see if new comp can be used in
1327 		 * place of old comp
1328 		 */
1329 		if ((un->c.un_flag & MD_LABELED) && (ci == 0) &&
1330 		    nd->nd_labeled)
1331 			nd->nd_start_blk = 0;
1332 		else
1333 			nd->nd_nblks -= nd->nd_start_blk;
1334 
1335 		if (dev_size > nd->nd_nblks) {
1336 			md_unit_writerexit(ui);
1337 			return (MDE_COMP_TOO_SMALL);
1338 		}
1339 
1340 		sv = (sv_dev_t *)kmem_alloc(sizeof (sv_dev_t), KM_SLEEP);
1341 		sv->setno = MD_MIN2SET(mnum);
1342 		sv->key = comp->un_key;
1343 	}
1344 
1345 	/*
1346 	 * Close this component.
1347 	 */
1348 	if (comp->un_mirror.ms_flags & MDM_S_ISOPEN) {
1349 		md_layered_close(comp->un_dev, MD_OFLG_NULL);
1350 		comp->un_mirror.ms_flags &= ~MDM_S_ISOPEN;
1351 	}
1352 
1353 	/*
1354 	 * If the component is hotspared, return to the pool.
1355 	 */
1356 	if (comp->un_mirror.ms_hs_id != 0) {
1357 		hs_cmds_t	cmd;
1358 		mdkey_t		hs_key;
1359 
1360 		hs_key = comp->un_mirror.ms_hs_key;
1361 		comp->un_dev = comp->un_mirror.ms_orig_dev;
1362 		comp->un_start_block = comp->un_mirror.ms_orig_blk;
1363 		comp->un_mirror.ms_hs_key = 0;
1364 		comp->un_mirror.ms_hs_id = 0;
1365 		comp->un_mirror.ms_orig_dev = 0;
1366 
1367 		cmd = HS_FREE;
1368 		if ((comp->un_mirror.ms_state != CS_OKAY) &&
1369 		    (comp->un_mirror.ms_state != CS_RESYNC))
1370 			cmd = HS_BAD;
1371 		(void) md_hot_spare_ifc(cmd, un->un_hsp_id, 0, 0, &hs_id,
1372 		    &hs_key, NULL, NULL);
1373 	}
1374 
1375 	/*
1376 	 * Open by device id; for enable (indicated by a NULL
1377 	 * nd pointer), use the existing component info.  For
1378 	 * replace, use the new device.
1379 	 */
1380 	if (nd == NULL) {
1381 		this_dev = md_resolve_bydevid(mnum, comp->un_dev, comp->un_key);
1382 		/*
1383 		 * If someone replaced a new disk in the same slot
1384 		 * we get NODEV64 since old device id cannot be
1385 		 * resolved. The new devt is obtained from the
1386 		 * mddb since devt is going to be unchanged for the
1387 		 * enable case. No need to check for multiple
1388 		 * keys here because the caller (comp_replace)
1389 		 * has already sanity checked it for us.
1390 		 */
1391 		if (this_dev == NODEV64) {
1392 			this_dev = md_getdevnum(setno, side, comp->un_key,
1393 			    MD_TRUST_DEVT);
1394 		}
1395 	} else {
1396 		/*
1397 		 * If this is a hotspare, save the original dev_t for later
1398 		 * use. If this has occured during boot then the value of
1399 		 * comp->un_dev will be NODEV64 because of the failure to look
1400 		 * up the devid of the device.
1401 		 */
1402 		if (nd->nd_hs_id != 0)
1403 			comp->un_mirror.ms_orig_dev = comp->un_dev;
1404 		this_dev = md_resolve_bydevid(mnum, nd->nd_dev, nd->nd_key);
1405 	}
1406 
1407 	comp->un_dev = this_dev;
1408 
1409 	/*
1410 	 * Now open the new device if required. Note for a single component
1411 	 * stripe it will not be open - leave this for the mirror driver to
1412 	 * deal with.
1413 	 */
1414 	if (md_unit_isopen(ui)) {
1415 		if (md_layered_open(mnum, &this_dev, MD_OFLG_NULL)) {
1416 			mddb_recid_t	ids[3];
1417 
1418 			ids[0] = un->c.un_record_id;
1419 			ids[1] = hs_id;
1420 			ids[2] = 0;
1421 			mddb_commitrecs_wrapper(ids);
1422 			if ((nd != NULL) && (nd->nd_hs_id != 0)) {
1423 				/*
1424 				 * Revert back to the original device.
1425 				 */
1426 				comp->un_dev = comp->un_mirror.ms_orig_dev;
1427 
1428 				cmn_err(CE_WARN,
1429 				    "md: %s: open error of hotspare %s",
1430 				    md_shortname(mnum),
1431 				    md_devname(MD_MIN2SET(mnum), nd->nd_dev,
1432 				    NULL, 0));
1433 				SE_NOTIFY(EC_SVM_STATE, ESC_SVM_OPEN_FAIL,
1434 				    SVM_TAG_HS, MD_MIN2SET(mnum), nd->nd_dev);
1435 			}
1436 			md_unit_writerexit(ui);
1437 			return (MDE_COMP_OPEN_ERR);
1438 		}
1439 		if (nd != NULL)
1440 			nd->nd_dev = this_dev;
1441 
1442 		comp->un_mirror.ms_flags |= MDM_S_ISOPEN;
1443 	}
1444 
1445 	if (nd == NULL) {
1446 		recids[0] = un->c.un_record_id;
1447 		recids[1] = hs_id;
1448 		recids[2] = 0;
1449 		*replace_done = stripe_replace_done;
1450 		return (0);
1451 	}
1452 
1453 	/* if hot sparing this device */
1454 	if (nd->nd_hs_id != 0) {
1455 		char	devname[MD_MAX_CTDLEN];
1456 		char	hs_devname[MD_MAX_CTDLEN];
1457 		set_t	setno;
1458 
1459 		comp->un_mirror.ms_hs_id = nd->nd_hs_id;
1460 		comp->un_mirror.ms_hs_key = nd->nd_key;
1461 
1462 		comp->un_mirror.ms_orig_blk = comp->un_start_block;
1463 
1464 		setno = MD_MIN2SET(mnum);
1465 
1466 		(void) md_devname(setno, comp->un_mirror.ms_orig_dev, devname,
1467 		    sizeof (devname));
1468 		(void) md_devname(setno, nd->nd_dev, hs_devname,
1469 		    sizeof (hs_devname));
1470 
1471 		cmn_err(CE_NOTE, "md: %s: hotspared device %s with %s",
1472 		    md_shortname(mnum), devname, hs_devname);
1473 
1474 	} else {	/* replacing the device */
1475 		comp->un_key = nd->nd_key;
1476 		*replace_data = (void *)sv;
1477 
1478 		/*
1479 		 * For the old device, make sure to reset the parent
1480 		 * if it's a  metadevice.
1481 		 */
1482 		if (md_getmajor(comp->un_dev) == md_major) {
1483 			minor_t	  comp_mnum = md_getminor(old_dev);
1484 			md_unit_t *comp_un = MD_UNIT(comp_mnum);
1485 
1486 			md_reset_parent(old_dev);
1487 			recids[rid++] = MD_RECID(comp_un);
1488 		}
1489 	}
1490 
1491 	comp->un_dev = nd->nd_dev;
1492 	comp->un_start_block = nd->nd_start_blk;
1493 
1494 	/*
1495 	 * For the new device, make sure to set the parent if it's a
1496 	 * metadevice.
1497 	 *
1498 	 * If we ever support using metadevices as hot spares, this
1499 	 * will need to be tested, and possibly moved into the
1500 	 * preceding "else" clause, immediately following the parent
1501 	 * reset block.  For now, it's convenient to leave it here and
1502 	 * only compress nd->nd_dev once.
1503 	 */
1504 	if (md_getmajor(comp->un_dev) == md_major) {
1505 		minor_t		comp_mnum = md_getminor(comp->un_dev);
1506 		md_unit_t	*comp_un = MD_UNIT(comp_mnum);
1507 
1508 		md_set_parent(comp->un_dev, MD_SID(un));
1509 		recids[rid++] = MD_RECID(comp_un);
1510 	}
1511 
1512 	recids[rid++] = un->c.un_record_id;
1513 	recids[rid++] = hs_id;
1514 	recids[rid] = 0;
1515 	*replace_done = stripe_replace_done;
1516 	return (0);
1517 }
1518 
1519 /*ARGSUSED*/
1520 static intptr_t
1521 stripe_hotspare_dev(
1522 	md_dev64_t	dev,
1523 	void		*junk,
1524 	int		ci,
1525 	mddb_recid_t	*recids,
1526 	int		nrecids,
1527 	void		(**replace_done)(),
1528 	void		**replace_data)
1529 {
1530 	ms_unit_t	*un;
1531 	mdi_unit_t	*ui;
1532 	ms_comp_t	*comp;
1533 	int		row;
1534 	struct ms_row	*mdr;
1535 	ms_new_dev_t	nd;
1536 	int		err;
1537 	int		i;
1538 	minor_t		mnum;
1539 	set_t		setno;
1540 	int		cmpcount = 0;
1541 
1542 	mnum = md_getminor(dev);
1543 	ui = MDI_UNIT(mnum);
1544 	un = MD_UNIT(mnum);
1545 	setno = MD_MIN2SET(mnum);
1546 
1547 	if (md_get_setstatus(setno) & MD_SET_STALE)
1548 		return (1);
1549 
1550 	if (un->un_hsp_id == -1)
1551 		return (1);
1552 
1553 	for (row = 0; row < un->un_nrows; row++) {
1554 		mdr = &un->un_row[row];
1555 		if ((mdr->un_ncomp + cmpcount) > ci)
1556 			break;
1557 		cmpcount += mdr->un_ncomp;
1558 	}
1559 	ASSERT(row != un->un_nrows);
1560 
1561 	comp = (struct ms_comp *)((void *)&((char *)un)[un->un_ocomp]);
1562 	comp += ci;
1563 	/* Concatenations have a ncomp = 1 */
1564 	nd.nd_nblks = mdr->un_blocks / mdr->un_ncomp;
1565 
1566 	if ((un->c.un_flag & MD_LABELED) && (ci == 0))
1567 		nd.nd_labeled = 1;
1568 	else
1569 		nd.nd_labeled = 0;
1570 
1571 again:
1572 	err = md_hot_spare_ifc(HS_GET, un->un_hsp_id, nd.nd_nblks,
1573 	    nd.nd_labeled, &nd.nd_hs_id, &nd.nd_key, &nd.nd_dev,
1574 	    &nd.nd_start_blk);
1575 
1576 	if (err) {
1577 		if (!stripe_replace_dev(dev, junk, ci, NULL, recids, nrecids,
1578 		    replace_done, replace_data)) {
1579 			mddb_commitrecs_wrapper(recids);
1580 			md_unit_writerexit(ui);
1581 		}
1582 		recids[0] = 0;
1583 		return (1);
1584 	}
1585 
1586 	if (stripe_replace_dev(dev, junk, ci, &nd, recids, nrecids,
1587 	    replace_done, replace_data)) {
1588 
1589 		(void) md_hot_spare_ifc(HS_BAD, un->un_hsp_id, 0, 0,
1590 		    &nd.nd_hs_id, &nd.nd_key, NULL, NULL);
1591 		mddb_commitrec_wrapper(nd.nd_hs_id);
1592 		goto again;
1593 	}
1594 
1595 	/* Leave a slot for the null recid */
1596 	for (i = 0; i < (nrecids - 1); i++) {
1597 		if (recids[i] == 0) {
1598 			recids[i++] = nd.nd_hs_id;
1599 			recids[i] = 0;
1600 		}
1601 	}
1602 	return (0);
1603 }
1604 
1605 static int
1606 stripe_imp_set(
1607 	set_t	setno
1608 )
1609 {
1610 
1611 	mddb_recid_t	recid;
1612 	int		i, row, c, gotsomething;
1613 	mddb_type_t	typ1;
1614 	mddb_de_ic_t	*dep;
1615 	mddb_rb32_t	*rbp;
1616 	ms_unit32_od_t	*un32;
1617 	ms_unit_t	*un64;
1618 	md_dev64_t	self_devt;
1619 	minor_t		*self_id;	/* minor needs to be updated */
1620 	md_parent_t	*parent_id;	/* parent needs to be updated */
1621 	mddb_recid_t	*record_id;	/* record id needs to be updated */
1622 	mddb_recid_t	*hsp_id;
1623 	ms_comp32_od_t	*comp32;
1624 	ms_comp_t	*comp64;
1625 
1626 
1627 	gotsomething = 0;
1628 
1629 	typ1 = (mddb_type_t)md_getshared_key(setno,
1630 	    stripe_md_ops.md_driver.md_drivername);
1631 	recid = mddb_makerecid(setno, 0);
1632 
1633 	while ((recid = mddb_getnextrec(recid, typ1, 0)) > 0) {
1634 		if (mddb_getrecprivate(recid) & MD_PRV_GOTIT)
1635 			continue;
1636 
1637 		dep = mddb_getrecdep(recid);
1638 		rbp = dep->de_rb;
1639 
1640 		switch (rbp->rb_revision) {
1641 		case MDDB_REV_RB:
1642 		case MDDB_REV_RBFN:
1643 			/*
1644 			 * Small device
1645 			 */
1646 			un32 = (ms_unit32_od_t *)mddb_getrecaddr(recid);
1647 			self_id = &(un32->c.un_self_id);
1648 			parent_id = &(un32->c.un_parent);
1649 			record_id = &(un32->c.un_record_id);
1650 			hsp_id = &(un32->un_hsp_id);
1651 
1652 			comp32 = (ms_comp32_od_t *)
1653 			    ((void *)&((char *)un32)[un32->un_ocomp]);
1654 			for (row = 0; row < un32->un_nrows; row++) {
1655 				struct ms_row32_od *mdr = &un32->un_row[row];
1656 				for (i = 0, c = mdr->un_icomp;
1657 				    i < mdr->un_ncomp; i++) {
1658 					ms_comp32_od_t *mdc;
1659 
1660 					mdc = &comp32[c++];
1661 
1662 					if (!md_update_minor(setno,
1663 					    mddb_getsidenum(setno),
1664 					    mdc->un_key))
1665 						goto out;
1666 
1667 					if (mdc->un_mirror.ms_hs_id != 0)
1668 						mdc->un_mirror.ms_hs_id =
1669 						    MAKERECID(setno,
1670 						    mdc->un_mirror.ms_hs_id);
1671 				}
1672 			}
1673 			break;
1674 		case MDDB_REV_RB64:
1675 		case MDDB_REV_RB64FN:
1676 			un64 = (ms_unit_t *)mddb_getrecaddr(recid);
1677 			self_id = &(un64->c.un_self_id);
1678 			parent_id = &(un64->c.un_parent);
1679 			record_id = &(un64->c.un_record_id);
1680 			hsp_id = &(un64->un_hsp_id);
1681 
1682 			comp64 = (ms_comp_t *)
1683 			    ((void *)&((char *)un64)[un64->un_ocomp]);
1684 			for (row = 0; row < un64->un_nrows; row++) {
1685 				struct ms_row *mdr = &un64->un_row[row];
1686 
1687 				for (i = 0, c = mdr->un_icomp;
1688 				    i < mdr->un_ncomp; i++) {
1689 					ms_comp_t *mdc;
1690 
1691 					mdc = &comp64[c++];
1692 
1693 					if (!md_update_minor(setno,
1694 					    mddb_getsidenum(setno),
1695 					    mdc->un_key))
1696 						goto out;
1697 
1698 					if (mdc->un_mirror.ms_hs_id != 0)
1699 						mdc->un_mirror.ms_hs_id =
1700 						    MAKERECID(setno,
1701 						    mdc->un_mirror.ms_hs_id);
1702 				}
1703 			}
1704 			break;
1705 		}
1706 
1707 		/*
1708 		 * If this is a top level and a friendly name metadevice,
1709 		 * update its minor in the namespace.
1710 		 */
1711 		if ((*parent_id == MD_NO_PARENT) &&
1712 		    ((rbp->rb_revision == MDDB_REV_RBFN) ||
1713 		    (rbp->rb_revision == MDDB_REV_RB64FN))) {
1714 
1715 			self_devt = md_makedevice(md_major, *self_id);
1716 			if (!md_update_top_device_minor(setno,
1717 			    mddb_getsidenum(setno), self_devt))
1718 				goto out;
1719 		}
1720 
1721 		/*
1722 		 * Update unit with the imported setno
1723 		 *
1724 		 */
1725 		mddb_setrecprivate(recid, MD_PRV_GOTIT);
1726 
1727 		*self_id = MD_MKMIN(setno, MD_MIN2UNIT(*self_id));
1728 
1729 		if (*hsp_id != -1)
1730 			*hsp_id = MAKERECID(setno, DBID(*hsp_id));
1731 
1732 		if (*parent_id != MD_NO_PARENT)
1733 			*parent_id = MD_MKMIN(setno, MD_MIN2UNIT(*parent_id));
1734 		*record_id = MAKERECID(setno, DBID(*record_id));
1735 
1736 		gotsomething = 1;
1737 	}
1738 
1739 out:
1740 	return (gotsomething);
1741 }
1742 
1743 static md_named_services_t stripe_named_services[] = {
1744 	{stripe_shared_by_blk,			"shared by blk"		    },
1745 	{stripe_shared_by_indx,			"shared by indx"	    },
1746 	{stripe_component_count,		"get component count"	    },
1747 	{stripe_block_count_skip_size,		"get block count skip size" },
1748 	{stripe_get_dev,			"get device"		    },
1749 	{stripe_replace_dev,			"replace device"	    },
1750 	{stripe_hotspare_dev,			"hotspare device"	    },
1751 	{stripe_rename_check,			MDRNM_CHECK		    },
1752 	{NULL,					0}
1753 };
1754 
1755 md_ops_t stripe_md_ops = {
1756 	stripe_open,		/* open */
1757 	stripe_close,		/* close */
1758 	md_stripe_strategy,	/* strategy */
1759 	NULL,			/* print */
1760 	stripe_dump,		/* dump */
1761 	NULL,			/* read */
1762 	NULL,			/* write */
1763 	md_stripe_ioctl,	/* stripe_ioctl, */
1764 	stripe_snarf,		/* stripe_snarf */
1765 	stripe_halt,		/* stripe_halt */
1766 	NULL,			/* aread */
1767 	NULL,			/* awrite */
1768 	stripe_imp_set,		/* import set */
1769 	stripe_named_services
1770 };
1771 
1772 static void
1773 init_init()
1774 {
1775 	md_stripe_mcs_buf_off = sizeof (md_scs_t) - sizeof (buf_t);
1776 
1777 	stripe_parent_cache = kmem_cache_create("md_stripe_parent",
1778 	    sizeof (md_sps_t), 0, stripe_parent_constructor,
1779 	    stripe_parent_destructor, stripe_run_queue, NULL, NULL,
1780 	    0);
1781 	stripe_child_cache = kmem_cache_create("md_stripe_child",
1782 	    sizeof (md_scs_t) - sizeof (buf_t) + biosize(), 0,
1783 	    stripe_child_constructor, stripe_child_destructor,
1784 	    stripe_run_queue, NULL, NULL, 0);
1785 }
1786 
1787 static void
1788 fini_uninit()
1789 {
1790 	kmem_cache_destroy(stripe_parent_cache);
1791 	kmem_cache_destroy(stripe_child_cache);
1792 	stripe_parent_cache = stripe_child_cache = NULL;
1793 }
1794 
1795 /* define the module linkage */
1796 MD_PLUGIN_MISC_MODULE("stripes module", init_init(), fini_uninit())
1797