xref: /titanic_41/usr/src/uts/common/io/lvm/stripe/stripe.c (revision 8c74a1f9477c04aa8539a84a49aa2bf629c7a14d)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/param.h>
29 #include <sys/systm.h>
30 #include <sys/conf.h>
31 #include <sys/file.h>
32 #include <sys/user.h>
33 #include <sys/uio.h>
34 #include <sys/t_lock.h>
35 #include <sys/buf.h>
36 #include <sys/dkio.h>
37 #include <sys/vtoc.h>
38 #include <sys/kmem.h>
39 #include <vm/page.h>
40 #include <sys/cmn_err.h>
41 #include <sys/sysmacros.h>
42 #include <sys/types.h>
43 #include <sys/mkdev.h>
44 #include <sys/stat.h>
45 #include <sys/open.h>
46 #include <sys/lvm/mdio.h>
47 #include <sys/lvm/mdvar.h>
48 #include <sys/lvm/md_stripe.h>
49 #include <sys/lvm/md_convert.h>
50 #include <sys/lvm/md_notify.h>
51 #include <sys/modctl.h>
52 #include <sys/ddi.h>
53 #include <sys/sunddi.h>
54 #include <sys/debug.h>
55 #include <sys/sysevent/eventdefs.h>
56 #include <sys/sysevent/svm.h>
57 
58 md_ops_t		stripe_md_ops;
59 #ifndef	lint
60 char			_depends_on[] = "drv/md";
61 md_ops_t		*md_interface_ops = &stripe_md_ops;
62 #endif
63 
64 extern unit_t		md_nunits;
65 extern set_t		md_nsets;
66 extern md_set_t		md_set[];
67 
68 extern kmutex_t		md_mx;
69 extern kcondvar_t	md_cv;
70 
71 extern int		md_status;
72 extern major_t		md_major;
73 extern mdq_anchor_t	md_done_daemon;
74 
75 static int		md_stripe_mcs_buf_off;
76 static kmem_cache_t	*stripe_parent_cache = NULL;
77 static kmem_cache_t	*stripe_child_cache = NULL;
78 
79 /*ARGSUSED1*/
80 static int
81 stripe_parent_constructor(void *p, void *d1, int d2)
82 {
83 	mutex_init(&((md_sps_t *)p)->ps_mx,
84 	    NULL, MUTEX_DEFAULT, NULL);
85 	return (0);
86 }
87 
88 static void
89 stripe_parent_init(void *ps)
90 {
91 	bzero(ps, offsetof(md_sps_t, ps_mx));
92 }
93 
94 /*ARGSUSED1*/
95 static void
96 stripe_parent_destructor(void *p, void *d)
97 {
98 	mutex_destroy(&((md_sps_t *)p)->ps_mx);
99 }
100 
101 /*ARGSUSED1*/
102 static int
103 stripe_child_constructor(void *p, void *d1, int d2)
104 {
105 	bioinit(&((md_scs_t *)p)->cs_buf);
106 	return (0);
107 }
108 
109 static void
110 stripe_child_init(md_scs_t *cs)
111 {
112 	cs->cs_mdunit = 0;
113 	cs->cs_ps = NULL;
114 	cs->cs_comp = NULL;
115 	md_bioreset(&cs->cs_buf);
116 }
117 
118 /*ARGSUSED1*/
119 static void
120 stripe_child_destructor(void *p, void *d)
121 {
122 	biofini(&((md_scs_t *)p)->cs_buf);
123 }
124 
125 /*ARGSUSED*/
126 static void
127 stripe_run_queue(void *d)
128 {
129 	if (!(md_status & MD_GBL_DAEMONS_LIVE))
130 		md_daemon(1, &md_done_daemon);
131 }
132 
133 static void
134 stripe_close_all_devs(ms_unit_t *un, int md_cflags)
135 {
136 	int		row;
137 	int		i;
138 	int		c;
139 	struct ms_comp	*mdcomp;
140 
141 	mdcomp = (struct ms_comp *)((void *)&((char *)un)[un->un_ocomp]);
142 	for (row = 0; row < un->un_nrows; row++) {
143 		struct ms_row *mdr = &un->un_row[row];
144 		for (i = 0, c = mdr->un_icomp; i < mdr->un_ncomp; i++) {
145 			struct ms_comp	*mdc;
146 			mdc = &mdcomp[c++];
147 			if (md_cflags & MD_OFLG_PROBEDEV) {
148 
149 			/*
150 			 * It is possible that the md_layered_open
151 			 * failed because the stripe unit structure
152 			 * contained a NODEV.  In such a case since
153 			 * there is nothing to open, there is nothing
154 			 * to close.
155 			 */
156 				if (mdc->un_dev == NODEV64)
157 					continue;
158 			}
159 			if ((md_cflags & MD_OFLG_PROBEDEV) &&
160 			    (mdc->un_mirror.ms_flags & MDM_S_PROBEOPEN)) {
161 				md_layered_close(mdc->un_dev,
162 				    md_cflags);
163 				mdc->un_mirror.ms_flags &=
164 						~MDM_S_PROBEOPEN;
165 			} else if (mdc->un_mirror.ms_flags & MDM_S_ISOPEN) {
166 				md_layered_close(mdc->un_dev, md_cflags);
167 				mdc->un_mirror.ms_flags &= ~MDM_S_ISOPEN;
168 			}
169 		}
170 	}
171 }
172 
173 static int
174 stripe_open_all_devs(ms_unit_t *un, int md_oflags)
175 {
176 	minor_t		mnum = MD_SID(un);
177 	int		row;
178 	int		i;
179 	int		c;
180 	struct ms_comp	*mdcomp;
181 	int		err;
182 	int		cont_on_errors = (md_oflags & MD_OFLG_CONT_ERRS);
183 	int		probe_err_cnt = 0;
184 	int		total_comp_cnt = 0;
185 	set_t		setno = MD_MIN2SET(MD_SID(un));
186 	side_t		side = mddb_getsidenum(setno);
187 	mdkey_t		key;
188 
189 	mdcomp = (struct ms_comp *)((void *)&((char *)un)[un->un_ocomp]);
190 
191 	/*
192 	 * For a probe call, if any component of a stripe or a concat
193 	 * can be opened, it is considered to be a success. The total number
194 	 * of components in a stripe are computed prior to starting a probe.
195 	 * This number is then compared against the number of components
196 	 * that could be be successfully opened. If none of the components
197 	 * in a stripe can be opened, only then an ENXIO is returned for a
198 	 * probe type open.
199 	 */
200 
201 	for (row = 0; row < un->un_nrows; row++) {
202 		struct ms_row *mdr = &un->un_row[row];
203 
204 		if (md_oflags & MD_OFLG_PROBEDEV)
205 			total_comp_cnt += mdr->un_ncomp;
206 
207 		for (i = 0, c = mdr->un_icomp; i < mdr->un_ncomp; i++) {
208 			struct ms_comp	*mdc;
209 			md_dev64_t tmpdev;
210 
211 			mdc = &mdcomp[c++];
212 			tmpdev = mdc->un_dev;
213 			/*
214 			 * Do the open by device id
215 			 * Check if this comp is hotspared and
216 			 * if it is then use the key for hotspare.
217 			 * MN disksets don't use devids, so we better don't use
218 			 * md_devid_found/md_resolve_bydevid there. Rather do,
219 			 * what's done in stripe_build_incore()
220 			 */
221 			if (MD_MNSET_SETNO(setno)) {
222 				if (mdc->un_mirror.ms_hs_id != 0) {
223 					(void) md_hot_spare_ifc(HS_MKDEV, 0, 0,
224 					    0, &mdc->un_mirror.ms_hs_id, NULL,
225 					    &tmpdev, NULL);
226 				}
227 			} else {
228 				key = mdc->un_mirror.ms_hs_id ?
229 				    mdc->un_mirror.ms_hs_key : mdc->un_key;
230 				if ((md_getmajor(tmpdev) != md_major) &&
231 				    md_devid_found(setno, side, key) == 1) {
232 					tmpdev = md_resolve_bydevid(mnum,
233 					    tmpdev, key);
234 				}
235 			}
236 
237 			/*
238 			 * For a submirror, we only want to open those devices
239 			 * that are not errored. If the device is errored then
240 			 * then there is no reason to open it and leaving it
241 			 * closed allows the RCM/DR code to work so that the
242 			 * errored device can be replaced.
243 			 */
244 			if ((md_oflags & MD_OFLG_PROBEDEV) ||
245 			    ! (mdc->un_mirror.ms_state & CS_ERRED)) {
246 
247 				err = md_layered_open(mnum, &tmpdev, md_oflags);
248 			} else {
249 				err = ENXIO;
250 			}
251 
252 			/*
253 			 * Only set the un_dev if the tmpdev != NODEV64. If
254 			 * it is NODEV64 then the md_layered_open() will have
255 			 * failed in some manner.
256 			 */
257 			if (tmpdev != NODEV64)
258 				mdc->un_dev = tmpdev;
259 
260 			if (err) {
261 				if (!cont_on_errors) {
262 					stripe_close_all_devs(un, md_oflags);
263 					return (ENXIO);
264 				}
265 
266 				if (md_oflags & MD_OFLG_PROBEDEV)
267 					probe_err_cnt++;
268 			} else {
269 				if (md_oflags & MD_OFLG_PROBEDEV) {
270 					mdc->un_mirror.ms_flags |=
271 						MDM_S_PROBEOPEN;
272 				} else
273 					mdc->un_mirror.ms_flags |= MDM_S_ISOPEN;
274 			}
275 		}
276 	}
277 
278 	/* If every component in a stripe could not be opened fail */
279 	if ((md_oflags & MD_OFLG_PROBEDEV) &&
280 	    (probe_err_cnt == total_comp_cnt))
281 		return (ENXIO);
282 	else
283 		return (0);
284 }
285 
286 int
287 stripe_build_incore(void *p, int snarfing)
288 {
289 	ms_unit_t *un = (ms_unit_t *)p;
290 	struct ms_comp	*mdcomp;
291 	minor_t		mnum;
292 	int		row;
293 	int		i;
294 	int		c;
295 	int		ncomps;
296 
297 	mnum = MD_SID(un);
298 
299 	if (MD_UNIT(mnum) != NULL)
300 		return (0);
301 
302 	MD_STATUS(un) = 0;
303 
304 	/*
305 	 * Reset all the is_open flags, these are probably set
306 	 * cause they just came out of the database.
307 	 */
308 	mdcomp = (struct ms_comp *)((void *)&((char *)un)[un->un_ocomp]);
309 
310 	ncomps = 0;
311 	for (row = 0; row < un->un_nrows; row++) {
312 		struct ms_row *mdr = &un->un_row[row];
313 		ncomps += mdr->un_ncomp;
314 	}
315 
316 	for (row = 0; row < un->un_nrows; row++) {
317 		struct ms_row *mdr = &un->un_row[row];
318 		for (i = 0, c = mdr->un_icomp; i < mdr->un_ncomp; i++) {
319 			struct ms_comp		*mdc;
320 			set_t			setno;
321 			md_dev64_t		tmpdev;
322 
323 			mdc = &mdcomp[c++];
324 			mdc->un_mirror.ms_flags &=
325 			    ~(MDM_S_ISOPEN | MDM_S_IOERR | MDM_S_RS_TRIED);
326 
327 			if (!snarfing)
328 				continue;
329 
330 			setno = MD_MIN2SET(mnum);
331 
332 			tmpdev = md_getdevnum(setno, mddb_getsidenum(setno),
333 			    mdc->un_key, MD_NOTRUST_DEVT);
334 			mdc->un_dev = tmpdev;
335 			/*
336 			 * Check for hotspares. If the hotspares haven't been
337 			 * snarfed yet, stripe_open_all_devs() will do the
338 			 * remapping of the dev's later.
339 			 */
340 			if (mdc->un_mirror.ms_hs_id != 0) {
341 				mdc->un_mirror.ms_orig_dev = mdc->un_dev;
342 				(void) md_hot_spare_ifc(HS_MKDEV, 0, 0,
343 				    0, &mdc->un_mirror.ms_hs_id, NULL,
344 				    &tmpdev, NULL);
345 				mdc->un_dev = tmpdev;
346 			}
347 		}
348 	}
349 
350 	MD_UNIT(mnum) = un;
351 	return (0);
352 }
353 
354 void
355 reset_stripe(ms_unit_t *un, minor_t mnum, int removing)
356 {
357 	ms_comp_t	*mdcomp;
358 	struct ms_row	*mdr;
359 	int		i, c;
360 	int		row;
361 	int		nsv;
362 	int		isv;
363 	sv_dev_t	*sv;
364 	mddb_recid_t	*recids;
365 	mddb_recid_t	vtoc_id;
366 	int		rid = 0;
367 
368 	md_destroy_unit_incore(mnum, &stripe_md_ops);
369 
370 	MD_UNIT(mnum) = NULL;
371 
372 	/*
373 	 * Attempt release of its minor node
374 	 */
375 	md_remove_minor_node(mnum);
376 
377 	if (!removing)
378 		return;
379 
380 	nsv = 0;
381 	/* Count the number of devices */
382 	for (row = 0; row < un->un_nrows; row++) {
383 		mdr = &un->un_row[row];
384 		nsv += mdr->un_ncomp;
385 	}
386 	sv = (sv_dev_t *)kmem_alloc(sizeof (sv_dev_t) * nsv, KM_SLEEP);
387 
388 	/*
389 	 * allocate recids array.  since we may have to commit
390 	 * underlying soft partition records, we need an array
391 	 * of size: total number of components in stripe + 3
392 	 * (one for the stripe itself, one for the hotspare, one
393 	 * for the end marker).
394 	 */
395 	recids = kmem_alloc(sizeof (mddb_recid_t) * (nsv + 3), KM_SLEEP);
396 
397 	/*
398 	 * Save the md_dev64_t's and driver nm indexes.
399 	 * Because after the mddb_deleterec() we will
400 	 * not be able to access the unit structure.
401 	 *
402 	 * NOTE: Deleting the names before deleting the
403 	 *	 unit structure would cause problems if
404 	 *	 the machine crashed in between the two.
405 	 */
406 	isv = 0;
407 	mdcomp = (struct ms_comp *)((void *)&((char *)un)[un->un_ocomp]);
408 
409 	for (row = 0; row < un->un_nrows; row++) {
410 		mdr = &un->un_row[row];
411 		for (i = 0, c = mdr->un_icomp; i < mdr->un_ncomp; i++) {
412 			struct ms_comp	*mdc;
413 			md_dev64_t	child_dev;
414 			md_unit_t	*child_un;
415 
416 			mdc = &mdcomp[c++];
417 			if (mdc->un_mirror.ms_hs_id != 0) {
418 				mdkey_t		hs_key;
419 
420 				hs_key = mdc->un_mirror.ms_hs_key;
421 
422 				mdc->un_dev = mdc->un_mirror.ms_orig_dev;
423 				mdc->un_start_block =
424 				    mdc->un_mirror.ms_orig_blk;
425 				mdc->un_mirror.ms_hs_id = 0;
426 				mdc->un_mirror.ms_hs_key = 0;
427 				mdc->un_mirror.ms_orig_dev = 0;
428 				recids[0] = 0;
429 				recids[1] = 0;	/* recids[1] filled in below */
430 				recids[2] = 0;
431 				(void) md_hot_spare_ifc(HS_FREE, un->un_hsp_id,
432 				    0, 0, &recids[0], &hs_key, NULL, NULL);
433 				mddb_commitrecs_wrapper(recids);
434 			}
435 
436 			/*
437 			 * check if we've got metadevice below us and
438 			 * deparent it if we do.
439 			 * NOTE: currently soft partitions are the
440 			 * the only metadevices stripes can be
441 			 * built on top of.
442 			 */
443 			child_dev = mdc->un_dev;
444 			if (md_getmajor(child_dev) == md_major) {
445 				child_un = MD_UNIT(md_getminor(child_dev));
446 				md_reset_parent(child_dev);
447 				recids[rid++] = MD_RECID(child_un);
448 			}
449 
450 			sv[isv].setno = MD_MIN2SET(mnum);
451 			sv[isv++].key = mdc->un_key;
452 		}
453 	}
454 
455 	recids[rid++] = un->c.un_record_id;
456 	recids[rid] = 0;	/* filled in below */
457 
458 	/*
459 	 * Decrement the HSP reference count and
460 	 * remove the knowledge of the HSP from the unit struct.
461 	 * This is done atomically to remove a window.
462 	 */
463 	if (un->un_hsp_id != -1) {
464 		(void) md_hot_spare_ifc(HSP_DECREF, un->un_hsp_id, 0, 0,
465 		    &recids[rid++], NULL, NULL, NULL);
466 		un->un_hsp_id = -1;
467 	}
468 
469 	/* set end marker and commit records */
470 	recids[rid] = 0;
471 	mddb_commitrecs_wrapper(recids);
472 
473 	vtoc_id = un->c.un_vtoc_id;
474 
475 	/*
476 	 * Remove self from the namespace
477 	 */
478 	if (un->c.un_revision & MD_FN_META_DEV) {
479 		(void) md_rem_selfname(un->c.un_self_id);
480 	}
481 
482 	/* Remove the unit structure */
483 	mddb_deleterec_wrapper(un->c.un_record_id);
484 
485 	/* Remove the vtoc, if present */
486 	if (vtoc_id)
487 		mddb_deleterec_wrapper(vtoc_id);
488 
489 	SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_DELETE, SVM_TAG_METADEVICE,
490 	    MD_MIN2SET(mnum), MD_MIN2UNIT(mnum));
491 	md_rem_names(sv, nsv);
492 	kmem_free(sv, sizeof (sv_dev_t) * nsv);
493 	kmem_free(recids, sizeof (mddb_recid_t) * (nsv + 3));
494 }
495 
496 static void
497 stripe_error(md_sps_t *ps)
498 {
499 	struct buf	*pb = ps->ps_bp;
500 	mdi_unit_t	*ui = ps->ps_ui;
501 	md_dev64_t	dev = ps->ps_errcomp->un_dev;
502 	md_dev64_t	md_dev = md_expldev(pb->b_edev);
503 	char		*str;
504 
505 	if (pb->b_flags & B_READ) {
506 		ps->ps_errcomp->un_mirror.ms_flags |= MDM_S_READERR;
507 		str = "read";
508 	} else {
509 		ps->ps_errcomp->un_mirror.ms_flags |= MDM_S_WRTERR;
510 		str = "write";
511 	}
512 	if (!(ps->ps_flags & MD_SPS_DONTFREE)) {
513 		if (MUTEX_HELD(&ps->ps_mx)) {
514 			mutex_exit(&ps->ps_mx);
515 		}
516 	} else {
517 		ASSERT(panicstr);
518 	}
519 	SPS_FREE(stripe_parent_cache, ps);
520 	pb->b_flags |= B_ERROR;
521 
522 	md_kstat_done(ui, pb, 0);
523 	md_unit_readerexit(ui);
524 	md_biodone(pb);
525 
526 	cmn_err(CE_WARN, "md: %s: %s error on %s",
527 	    md_shortname(md_getminor(md_dev)), str,
528 	    md_devname(MD_DEV2SET(md_dev), dev, NULL, 0));
529 }
530 
531 static int
532 stripe_done(struct buf *cb)
533 {
534 	struct buf	*pb;
535 	mdi_unit_t	*ui;
536 	md_sps_t	*ps;
537 	md_scs_t	*cs;
538 
539 	/*LINTED*/
540 	cs = (md_scs_t *)((caddr_t)cb - md_stripe_mcs_buf_off);
541 	ps = cs->cs_ps;
542 	pb = ps->ps_bp;
543 
544 	mutex_enter(&ps->ps_mx);
545 	if (cb->b_flags & B_ERROR) {
546 		ps->ps_flags |= MD_SPS_ERROR;
547 		pb->b_error = cb->b_error;
548 		ps->ps_errcomp = cs->cs_comp;
549 	}
550 
551 	if (cb->b_flags & B_REMAPPED)
552 		bp_mapout(cb);
553 
554 	ps->ps_frags--;
555 	if (ps->ps_frags != 0) {
556 		mutex_exit(&ps->ps_mx);
557 		kmem_cache_free(stripe_child_cache, cs);
558 		return (1);
559 	}
560 	kmem_cache_free(stripe_child_cache, cs);
561 	if (ps->ps_flags & MD_SPS_ERROR) {
562 		stripe_error(ps);
563 		return (1);
564 	}
565 	ui = ps->ps_ui;
566 	if (!(ps->ps_flags & MD_SPS_DONTFREE)) {
567 		mutex_exit(&ps->ps_mx);
568 	} else {
569 		ASSERT(panicstr);
570 	}
571 	SPS_FREE(stripe_parent_cache, ps);
572 	md_kstat_done(ui, pb, 0);
573 	md_unit_readerexit(ui);
574 	md_biodone(pb);
575 	return (0);
576 }
577 
578 
579 /*
580  * This routine does the mapping from virtual (dev, blkno) of a metapartition
581  * to the real (dev, blkno) of a real disk partition.
582  * It goes to the md_conf[] table to find out the correct real partition
583  * dev and block number for this buffer.
584  *
585  * A single buf request can not go across real disk partition boundary.
586  * When the virtual request specified by (dev, blkno) spans more than one
587  * real partition, md_mapbuf will return 1. Then the caller should prepare
588  * another real buf and continue calling md_mapbuf to do the mapping until
589  * it returns 0.
590  *
591  */
592 
593 static int
594 md_mapbuf(
595 	ms_unit_t	*un,
596 	diskaddr_t	blkno,
597 	u_longlong_t	bcount,
598 	buf_t		*bp,	/* if bp==NULL, skip bp updates */
599 	ms_comp_t	**mdc)	/* if bp==NULL, skip mdc update */
600 {
601 	struct ms_row	*mdr;
602 	struct ms_comp	*mdcomp;
603 	diskaddr_t	stripe_blk;
604 	diskaddr_t	fragment, blk_in_row, endblk;
605 	offset_t	interlace;
606 	size_t		dev_index;
607 	int		row_index, more;
608 	extern unsigned md_maxphys;
609 	/* Work var's when bp==NULL */
610 	u_longlong_t	wb_bcount;
611 	diskaddr_t	wb_blkno;
612 	md_dev64_t	wb_edev;
613 	ms_comp_t	*wmdc;
614 
615 	/*
616 	 * Do a real calculation to derive the minor device of the
617 	 * Virtual Disk, which in turn will let us derive the
618 	 * device/minor of the underlying real device.
619 	 */
620 
621 
622 	for (row_index = 0; row_index < un->un_nrows; row_index++) {
623 		mdr = &un->un_row[row_index];
624 		if (blkno < mdr->un_cum_blocks)
625 			break;
626 	}
627 	ASSERT(row_index != un->un_nrows);
628 
629 	mdcomp = (struct ms_comp *)((void *)&((char *)un)[un->un_ocomp]);
630 
631 	blk_in_row = blkno - mdr->un_cum_blocks + mdr->un_blocks;
632 	endblk = (diskaddr_t)(blkno + howmany(bcount, DEV_BSIZE));
633 	if (mdr->un_ncomp == 1) { /* No striping */
634 		if (endblk > mdr->un_cum_blocks) {
635 			wb_bcount = ldbtob(mdr->un_cum_blocks - blkno);
636 			if ((row_index + 1) == un->un_nrows)
637 				more = 0;
638 			else
639 				more = 1;
640 		} else {
641 			wb_bcount = bcount;
642 			more = 0;
643 		}
644 		wmdc = &mdcomp[mdr->un_icomp];
645 		wb_blkno = blk_in_row;
646 	} else { /* Have striping */
647 		interlace = mdr->un_interlace;
648 		fragment = blk_in_row % interlace;
649 		if (bcount > ldbtob(interlace - fragment)) {
650 			more = 1;
651 			wb_bcount = ldbtob(interlace - fragment);
652 		} else {
653 			more = 0;
654 			wb_bcount = bcount;
655 		}
656 
657 		stripe_blk = blk_in_row / interlace;
658 		dev_index = (size_t)(stripe_blk % mdr->un_ncomp);
659 		wmdc = &mdcomp[mdr->un_icomp + dev_index];
660 		wb_blkno = (diskaddr_t)(((stripe_blk / mdr->un_ncomp)
661 			* interlace) + fragment);
662 	}
663 
664 	wb_blkno += wmdc->un_start_block;
665 	wb_edev = wmdc->un_dev;
666 
667 	/* only break up the I/O if we're not built on another metadevice */
668 	if ((md_getmajor(wb_edev) != md_major) && (wb_bcount > md_maxphys)) {
669 		wb_bcount = md_maxphys;
670 		more = 1;
671 	}
672 	if (bp != (buf_t *)NULL) {
673 		/*
674 		 * wb_bcount is limited by md_maxphys which is 'int'
675 		 */
676 		bp->b_bcount = (size_t)wb_bcount;
677 		bp->b_lblkno = wb_blkno;
678 		bp->b_edev = md_dev64_to_dev(wb_edev);
679 		*mdc = wmdc;
680 	}
681 	return (more);
682 }
683 
684 static void
685 md_stripe_strategy(buf_t *pb, int flag, void *private)
686 {
687 	md_sps_t	*ps;
688 	md_scs_t	*cs;
689 	int		doing_writes;
690 	int		more;
691 	ms_unit_t	*un;
692 	mdi_unit_t	*ui;
693 	size_t		current_count;
694 	diskaddr_t	current_blkno;
695 	off_t		current_offset;
696 	buf_t		*cb;		/* child buf pointer */
697 	set_t		setno;
698 
699 	setno = MD_MIN2SET(getminor(pb->b_edev));
700 
701 	/*
702 	 * When doing IO to a multi owner meta device, check if set is halted.
703 	 * We do this check without the needed lock held, for performance
704 	 * reasons.
705 	 * If an IO just slips through while the set is locked via an
706 	 * MD_MN_SUSPEND_SET, we don't care about it.
707 	 * Only check for a suspended set if we are a top-level i/o request
708 	 * (MD_STR_NOTTOP is cleared in 'flag').
709 	 */
710 	if ((md_set[setno].s_status & (MD_SET_HALTED | MD_SET_MNSET)) ==
711 	    (MD_SET_HALTED | MD_SET_MNSET)) {
712 		if ((flag & MD_STR_NOTTOP) == 0) {
713 			mutex_enter(&md_mx);
714 			/* Here we loop until the set is no longer halted */
715 			while (md_set[setno].s_status & MD_SET_HALTED) {
716 				cv_wait(&md_cv, &md_mx);
717 			}
718 			mutex_exit(&md_mx);
719 		}
720 	}
721 
722 	ui = MDI_UNIT(getminor(pb->b_edev));
723 
724 	md_kstat_waitq_enter(ui);
725 
726 	un = (ms_unit_t *)md_unit_readerlock(ui);
727 
728 	if ((flag & MD_NOBLOCK) == 0) {
729 		if (md_inc_iocount(setno) != 0) {
730 			pb->b_flags |= B_ERROR;
731 			pb->b_error = ENXIO;
732 			pb->b_resid = pb->b_bcount;
733 			md_kstat_waitq_exit(ui);
734 			md_unit_readerexit(ui);
735 			biodone(pb);
736 			return;
737 		}
738 	} else {
739 		md_inc_iocount_noblock(setno);
740 	}
741 
742 	if (!(flag & MD_STR_NOTTOP)) {
743 		if (md_checkbuf(ui, (md_unit_t *)un, pb) != 0) {
744 			md_kstat_waitq_exit(ui);
745 			return;
746 		}
747 	}
748 
749 	ps = kmem_cache_alloc(stripe_parent_cache, MD_ALLOCFLAGS);
750 	stripe_parent_init(ps);
751 
752 	/*
753 	 * Save essential information from the original buffhdr
754 	 * in the md_save structure.
755 	 */
756 	ps->ps_un = un;
757 	ps->ps_ui = ui;
758 	ps->ps_bp = pb;
759 	ps->ps_addr = pb->b_un.b_addr;
760 
761 	if ((pb->b_flags & B_READ) == 0)
762 		doing_writes = 1;
763 	else
764 		doing_writes = 0;
765 
766 
767 	current_count = pb->b_bcount;
768 	current_blkno = pb->b_lblkno;
769 	current_offset  = 0;
770 
771 	if (!(flag & MD_STR_NOTTOP) && panicstr)
772 		ps->ps_flags |= MD_SPS_DONTFREE;
773 
774 	md_kstat_waitq_to_runq(ui);
775 
776 	ps->ps_frags++;
777 	do {
778 		cs = kmem_cache_alloc(stripe_child_cache, MD_ALLOCFLAGS);
779 		stripe_child_init(cs);
780 		cb = &cs->cs_buf;
781 		cs->cs_ps = ps;
782 		more = md_mapbuf(un, current_blkno, current_count, cb,
783 			&cs->cs_comp);
784 
785 		cb = md_bioclone(pb, current_offset, cb->b_bcount, cb->b_edev,
786 				cb->b_lblkno, stripe_done, cb, KM_NOSLEEP);
787 		/*
788 		 * Do these calculations now,
789 		 *  so that we pickup a valid b_bcount from the chld_bp.
790 		 */
791 		current_offset += cb->b_bcount;
792 		current_count -=  cb->b_bcount;
793 		current_blkno +=  (diskaddr_t)(lbtodb(cb->b_bcount));
794 
795 		if (more) {
796 			mutex_enter(&ps->ps_mx);
797 			ps->ps_frags++;
798 			mutex_exit(&ps->ps_mx);
799 		}
800 
801 		if (doing_writes &&
802 		    cs->cs_comp->un_mirror.ms_flags & MDM_S_NOWRITE) {
803 			(void) stripe_done(cb);
804 			continue;
805 		}
806 		md_call_strategy(cb, flag, private);
807 	} while (more);
808 
809 	if (!(flag & MD_STR_NOTTOP) && panicstr) {
810 		while (!(ps->ps_flags & MD_SPS_DONE)) {
811 			md_daemon(1, &md_done_daemon);
812 			drv_usecwait(10);
813 		}
814 		kmem_cache_free(stripe_parent_cache, ps);
815 	}
816 }
817 
818 static int
819 stripe_snarf(md_snarfcmd_t cmd, set_t setno)
820 {
821 	ms_unit_t	*un;
822 	mddb_recid_t	recid;
823 	int		gotsomething;
824 	int		all_stripes_gotten;
825 	mddb_type_t	typ1;
826 	mddb_de_ic_t	*dep;
827 	mddb_rb32_t	*rbp;
828 	size_t		newreqsize;
829 	ms_unit_t	*big_un;
830 	ms_unit32_od_t	*small_un;
831 
832 
833 	if (cmd == MD_SNARF_CLEANUP)
834 		return (0);
835 
836 	all_stripes_gotten = 1;
837 	gotsomething = 0;
838 
839 	typ1 = (mddb_type_t)md_getshared_key(setno,
840 	    stripe_md_ops.md_driver.md_drivername);
841 	recid = mddb_makerecid(setno, 0);
842 
843 	while ((recid = mddb_getnextrec(recid, typ1, 0)) > 0) {
844 		if (mddb_getrecprivate(recid) & MD_PRV_GOTIT)
845 			continue;
846 
847 		dep = mddb_getrecdep(recid);
848 		dep->de_flags = MDDB_F_STRIPE;
849 		rbp = dep->de_rb;
850 
851 		switch (rbp->rb_revision) {
852 		case MDDB_REV_RB:
853 		case MDDB_REV_RBFN:
854 			if ((rbp->rb_private & MD_PRV_CONVD) == 0) {
855 				/*
856 				 * This means, we have an old and small record
857 				 * and this record hasn't already been
858 				 * converted.  Before we create an incore
859 				 * metadevice from this we have to convert it to
860 				 * a big record.
861 				 */
862 				small_un =
863 				    (ms_unit32_od_t *)mddb_getrecaddr(recid);
864 				newreqsize = get_big_stripe_req_size(small_un,
865 						COMPLETE_STRUCTURE);
866 				big_un = (ms_unit_t *)kmem_zalloc(newreqsize,
867 					KM_SLEEP);
868 				stripe_convert((caddr_t)small_un,
869 					(caddr_t)big_un, SMALL_2_BIG);
870 				kmem_free(small_un, dep->de_reqsize);
871 				dep->de_rb_userdata = big_un;
872 				dep->de_reqsize = newreqsize;
873 				un = big_un;
874 				rbp->rb_private |= MD_PRV_CONVD;
875 			} else {
876 				/* Small device had already been converted */
877 				un = (ms_unit_t *)mddb_getrecaddr(recid);
878 			}
879 			un->c.un_revision &= ~MD_64BIT_META_DEV;
880 			break;
881 		case MDDB_REV_RB64:
882 		case MDDB_REV_RB64FN:
883 			/* Big device */
884 			un = (ms_unit_t *)mddb_getrecaddr(recid);
885 			un->c.un_revision |= MD_64BIT_META_DEV;
886 			un->c.un_flag |= MD_EFILABEL;
887 			break;
888 		}
889 		MDDB_NOTE_FN(rbp->rb_revision, un->c.un_revision);
890 
891 		/* Create minor node for snarfed unit. */
892 		(void) md_create_minor_node(MD_MIN2SET(MD_SID(un)), MD_SID(un));
893 
894 		if (MD_UNIT(MD_SID(un)) != NULL) {
895 			mddb_setrecprivate(recid, MD_PRV_PENDDEL);
896 			continue;
897 		}
898 		all_stripes_gotten = 0;
899 		if (stripe_build_incore((void *)un, 1) == 0) {
900 			mddb_setrecprivate(recid, MD_PRV_GOTIT);
901 			md_create_unit_incore(MD_SID(un), &stripe_md_ops, 0);
902 			gotsomething = 1;
903 		}
904 	}
905 
906 	if (!all_stripes_gotten)
907 		return (gotsomething);
908 
909 	recid = mddb_makerecid(setno, 0);
910 	while ((recid = mddb_getnextrec(recid, typ1, 0)) > 0)
911 		if (!(mddb_getrecprivate(recid) & MD_PRV_GOTIT))
912 			mddb_setrecprivate(recid, MD_PRV_PENDDEL);
913 
914 	return (0);
915 }
916 
917 static int
918 stripe_halt(md_haltcmd_t cmd, set_t setno)
919 {
920 	int		i;
921 	mdi_unit_t	*ui;
922 	minor_t		mnum;
923 
924 	if (cmd == MD_HALT_CLOSE)
925 		return (0);
926 
927 	if (cmd == MD_HALT_OPEN)
928 		return (0);
929 
930 	if (cmd == MD_HALT_UNLOAD)
931 		return (0);
932 
933 	if (cmd == MD_HALT_CHECK) {
934 		for (i = 0; i < md_nunits; i++) {
935 			mnum = MD_MKMIN(setno, i);
936 			if ((ui = MDI_UNIT(mnum)) == NULL)
937 				continue;
938 			if (ui->ui_opsindex != stripe_md_ops.md_selfindex)
939 				continue;
940 			if (md_unit_isopen(ui))
941 				return (1);
942 		}
943 		return (0);
944 	}
945 
946 	if (cmd != MD_HALT_DOIT)
947 		return (1);
948 
949 	for (i = 0; i < md_nunits; i++) {
950 		mnum = MD_MKMIN(setno, i);
951 		if ((ui = MDI_UNIT(mnum)) == NULL)
952 			continue;
953 		if (ui->ui_opsindex != stripe_md_ops.md_selfindex)
954 			continue;
955 		reset_stripe((ms_unit_t *)MD_UNIT(mnum), mnum, 0);
956 	}
957 
958 	return (0);
959 }
960 
961 /*ARGSUSED3*/
962 static int
963 stripe_open(dev_t *dev, int flag, int otyp, cred_t *cred_p, int md_oflags)
964 {
965 	minor_t		mnum = getminor(*dev);
966 	mdi_unit_t	*ui = MDI_UNIT(mnum);
967 	ms_unit_t	*un;
968 	int		err = 0;
969 	set_t		setno;
970 
971 	/*
972 	 * When doing an open of a multi owner metadevice, check to see if this
973 	 * node is a starting node and if a reconfig cycle is underway.
974 	 * If so, the system isn't sufficiently set up enough to handle the
975 	 * open (which involves I/O during sp_validate), so fail with ENXIO.
976 	 */
977 	setno = MD_MIN2SET(mnum);
978 	if ((md_set[setno].s_status & (MD_SET_MNSET | MD_SET_MN_START_RC)) ==
979 	    (MD_SET_MNSET | MD_SET_MN_START_RC)) {
980 			return (ENXIO);
981 	}
982 
983 	/* single thread */
984 	un = (ms_unit_t *)md_unit_openclose_enter(ui);
985 
986 	/* open devices, if necessary */
987 	if (! md_unit_isopen(ui) || (md_oflags & MD_OFLG_PROBEDEV)) {
988 		if ((err = stripe_open_all_devs(un, md_oflags)) != 0) {
989 			goto out;
990 		}
991 	}
992 
993 	/* count open */
994 	if ((err = md_unit_incopen(mnum, flag, otyp)) != 0)
995 		goto out;
996 
997 	/* unlock, return success */
998 out:
999 	md_unit_openclose_exit(ui);
1000 	return (err);
1001 }
1002 
1003 /*ARGSUSED1*/
1004 static int
1005 stripe_close(
1006 	dev_t		dev,
1007 	int		flag,
1008 	int		otyp,
1009 	cred_t		*cred_p,
1010 	int		md_cflags
1011 )
1012 {
1013 	minor_t		mnum = getminor(dev);
1014 	mdi_unit_t	*ui = MDI_UNIT(mnum);
1015 	ms_unit_t	*un;
1016 	int		err = 0;
1017 
1018 	/* single thread */
1019 	un = (ms_unit_t *)md_unit_openclose_enter(ui);
1020 
1021 	/* count closed */
1022 	if ((err = md_unit_decopen(mnum, otyp)) != 0)
1023 		goto out;
1024 
1025 	/* close devices, if necessary */
1026 	if (! md_unit_isopen(ui) || (md_cflags & MD_OFLG_PROBEDEV)) {
1027 		stripe_close_all_devs(un, md_cflags);
1028 	}
1029 
1030 	/* unlock, return success */
1031 out:
1032 	md_unit_openclose_exit(ui);
1033 	return (err);
1034 }
1035 
1036 
1037 static struct buf dumpbuf;
1038 
1039 /*
1040  * This routine dumps memory to the disk.  It assumes that the memory has
1041  * already been mapped into mainbus space.  It is called at disk interrupt
1042  * priority when the system is in trouble.
1043  *
1044  */
1045 static int
1046 stripe_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk)
1047 {
1048 	ms_unit_t	*un;
1049 	buf_t		*bp;
1050 	ms_comp_t	*mdc;
1051 	u_longlong_t	nb;
1052 	diskaddr_t	mapblk;
1053 	int		result;
1054 	int		more;
1055 	int		saveresult = 0;
1056 
1057 	/*
1058 	 * Don't need to grab the unit lock.
1059 	 * Cause nothing else is suppose to be happenning.
1060 	 * Also dump is not suppose to sleep.
1061 	 */
1062 	un = (ms_unit_t *)MD_UNIT(getminor(dev));
1063 
1064 	if ((diskaddr_t)blkno >= un->c.un_total_blocks)
1065 		return (EINVAL);
1066 
1067 	if ((diskaddr_t)blkno + nblk > un->c.un_total_blocks)
1068 		return (EINVAL);
1069 
1070 	bp = &dumpbuf;
1071 	nb = ldbtob(nblk);
1072 	do {
1073 		bzero((caddr_t)bp, sizeof (*bp));
1074 		more = md_mapbuf(un, (diskaddr_t)blkno, nb, bp, &mdc);
1075 		nblk = btodb(bp->b_bcount);
1076 		mapblk = bp->b_lblkno;
1077 		if (!(mdc->un_mirror.ms_flags & MDM_S_NOWRITE)) {
1078 			/*
1079 			 * bdev_dump() is currently only able to take
1080 			 * 32 bit wide blkno's.
1081 			 */
1082 			result = bdev_dump(bp->b_edev, addr, (daddr_t)mapblk,
1083 						nblk);
1084 			if (result)
1085 				saveresult = result;
1086 		}
1087 
1088 		nb -= bp->b_bcount;
1089 		addr += bp->b_bcount;
1090 		blkno += nblk;
1091 	} while (more);
1092 
1093 	return (saveresult);
1094 }
1095 
1096 /*ARGSUSED*/
1097 static intptr_t
1098 stripe_shared_by_blk(
1099 	md_dev64_t dev,
1100 	void *junk,
1101 	diskaddr_t blkno,
1102 	u_longlong_t *cnt)
1103 {
1104 	ms_unit_t	*un;
1105 	buf_t		bp;
1106 	ms_comp_t	*comp;
1107 
1108 	un = MD_UNIT(md_getminor(dev));
1109 	(void) md_mapbuf(un, blkno, ldbtob(*cnt), &bp, &comp);
1110 	*cnt = (u_longlong_t)lbtodb(bp.b_bcount);
1111 	return ((intptr_t)&comp->un_mirror);
1112 }
1113 
1114 /*
1115  * stripe_block_count_skip_size() returns the following values
1116  *	so that the logical to physical block mappings can
1117  *	be calculated without intimate knowledge of the underpinnings.
1118  *
1119  *	block - first logical block number of the device.
1120  *		block = [ # of blocks before THE row ] +
1121  *			[ # of blocks in THE row before the component ]
1122  *	count - # of segments (interlaced size).
1123  *	skip  - # of logical blocks between segments, or delta to
1124  *		  get to next segment
1125  *	size  - interlace size used for the block, count, skip.
1126  */
1127 /*ARGSUSED*/
1128 static intptr_t
1129 stripe_block_count_skip_size(
1130 	md_dev64_t	 dev,
1131 	void		*junk,
1132 	int		ci,
1133 	diskaddr_t	*block,
1134 	size_t		*count,
1135 	u_longlong_t	*skip,
1136 	u_longlong_t	*size)
1137 {
1138 	ms_unit_t	*un;
1139 	int		row;
1140 	struct ms_row	*mdr;
1141 	int		cmpcount = 0;
1142 
1143 	un = MD_UNIT(md_getminor(dev));
1144 
1145 	for (row = 0; row < un->un_nrows; row++) {
1146 		mdr = &un->un_row[row];
1147 		if ((mdr->un_ncomp + cmpcount) > ci)
1148 			break;
1149 		cmpcount += mdr->un_ncomp;
1150 	}
1151 	ASSERT(row != un->un_nrows);
1152 
1153 	/*
1154 	 * Concatenations are always contiguous blocks,
1155 	 * you cannot depend on the interlace being a usable
1156 	 * value (except for stripes).
1157 	 */
1158 	if (mdr->un_ncomp == 1) {	/* Concats */
1159 		*block = mdr->un_cum_blocks - mdr->un_blocks;
1160 		*count = 1;
1161 		*skip = 0;
1162 		*size = mdr->un_blocks;
1163 	} else {			/* Stripes */
1164 		*block = (mdr->un_cum_blocks - mdr->un_blocks) +
1165 		    ((ci - cmpcount) * mdr->un_interlace);
1166 		*count	= (size_t)(mdr->un_blocks / (mdr->un_interlace
1167 			* mdr->un_ncomp));
1168 		*skip = (mdr->un_interlace * mdr->un_ncomp) - mdr->un_interlace;
1169 		*size = mdr->un_interlace;
1170 	}
1171 
1172 	return (0);
1173 }
1174 
1175 /*ARGSUSED*/
1176 static intptr_t
1177 stripe_shared_by_indx(md_dev64_t dev, void *junk, int indx)
1178 {
1179 	ms_unit_t	*un;
1180 	ms_comp_t	*comp;
1181 
1182 	un = MD_UNIT(md_getminor(dev));
1183 	comp = (struct ms_comp *)((void *)&((char *)un)[un->un_ocomp]);
1184 	comp += indx;
1185 	return ((intptr_t)&comp->un_mirror);
1186 }
1187 
1188 /*ARGSUSED*/
1189 intptr_t
1190 stripe_component_count(md_dev64_t dev, void *junk)
1191 {
1192 	/*
1193 	 * See comments for stripe_get_dev
1194 	 */
1195 
1196 	ms_unit_t	*un;
1197 	int		count = 0;
1198 	int		row;
1199 
1200 	un = MD_UNIT(md_getminor(dev));
1201 	for (row = 0; row < un->un_nrows; row++)
1202 		count += un->un_row[row].un_ncomp;
1203 	return (count);
1204 }
1205 
1206 /*ARGSUSED*/
1207 intptr_t
1208 stripe_get_dev(md_dev64_t dev, void *junk, int indx, ms_cd_info_t *cd)
1209 {
1210 	/*
1211 	 * It should be noted that stripe_replace in stripe_ioctl.c calls this
1212 	 * routine using makedevice(0, minor) for the first argument.
1213 	 *
1214 	 * If this routine at some point in the future needs to use the major
1215 	 * number stripe_replace must be changed.
1216 	 */
1217 
1218 	ms_unit_t	*un;
1219 	ms_comp_t	*comp;
1220 	md_dev64_t	tmpdev;
1221 
1222 	un = MD_UNIT(md_getminor(dev));
1223 	comp = (struct ms_comp *)((void *)&((char *)un)[un->un_ocomp]);
1224 	comp += indx;
1225 	tmpdev = comp->un_dev;
1226 	/*
1227 	 * Try to resolve devt again if NODEV64
1228 	 * Check if this comp is hotspared and if it is
1229 	 * then use key for hotspare
1230 	 */
1231 	if (tmpdev == NODEV64) {
1232 		tmpdev = md_resolve_bydevid(md_getminor(dev), tmpdev,
1233 			comp->un_mirror.ms_hs_id ?
1234 			comp->un_mirror.ms_hs_key :
1235 			comp->un_key);
1236 		comp->un_dev = tmpdev;
1237 	}
1238 
1239 	cd->cd_dev = comp->un_dev;
1240 	cd->cd_orig_dev = comp->un_mirror.ms_orig_dev;
1241 	return (0);
1242 }
1243 
1244 /*ARGSUSED*/
1245 void
1246 stripe_replace_done(md_dev64_t dev, sv_dev_t *sv)
1247 {
1248 	/*
1249 	 * See comments for stripe_get_dev
1250 	 */
1251 
1252 	minor_t		mnum = md_getminor(dev);
1253 
1254 	if (sv != NULL) {
1255 		md_rem_names(sv, 1);
1256 		kmem_free(sv, sizeof (sv_dev_t));
1257 	}
1258 
1259 	md_unit_writerexit(MDI_UNIT(mnum));
1260 }
1261 
1262 /*ARGSUSED*/
1263 intptr_t
1264 stripe_replace_dev(md_dev64_t dev, void *junk, int ci, ms_new_dev_t *nd,
1265     mddb_recid_t *recids, int nrecids, void (**replace_done)(),
1266     void **replace_data)
1267 {
1268 	minor_t		mnum;
1269 	ms_unit_t	*un;
1270 	mdi_unit_t	*ui;
1271 	ms_comp_t	*comp;
1272 	diskaddr_t	dev_size;
1273 	int		row;
1274 	int		ncomps = 0;
1275 	int		cmpcount = 0;
1276 	int		rid = 0;
1277 	struct ms_row	*mdr;
1278 	sv_dev_t	*sv = NULL;
1279 	mddb_recid_t	hs_id = 0;
1280 	set_t		setno;
1281 	side_t		side;
1282 	md_dev64_t	this_dev;
1283 
1284 	mnum = md_getminor(dev);
1285 	ui = MDI_UNIT(mnum);
1286 	setno = MD_MIN2SET(mnum);
1287 	side = mddb_getsidenum(setno);
1288 
1289 	un = md_unit_writerlock(ui);
1290 
1291 	*replace_data = NULL;
1292 	comp = (struct ms_comp *)((void *)&((char *)un)[un->un_ocomp]);
1293 
1294 	comp += ci;
1295 
1296 	/*
1297 	 * Count the number of components
1298 	 */
1299 	for (row = 0; row < un->un_nrows; row++) {
1300 		struct ms_row *mdr = &un->un_row[row];
1301 		ncomps += mdr->un_ncomp;
1302 	}
1303 
1304 	recids[0] = 0;
1305 	/*
1306 	 * No need of checking size of new device,
1307 	 * when hotsparing (it has already been done), or
1308 	 * when enabling the device.
1309 	 */
1310 	if ((nd != NULL) && (nd->nd_hs_id == 0)) {
1311 		for (row = 0; row < un->un_nrows; row++) {
1312 			mdr = &un->un_row[row];
1313 			if ((mdr->un_ncomp + cmpcount) > ci)
1314 				break;
1315 			cmpcount += mdr->un_ncomp;
1316 		}
1317 		ASSERT(row != un->un_nrows);
1318 
1319 		/* Concatenations have a ncomp = 1 */
1320 		dev_size = mdr->un_blocks / mdr->un_ncomp;
1321 
1322 		/*
1323 		 * now check to see if new comp can be used in
1324 		 * place of old comp
1325 		 */
1326 		if ((un->c.un_flag & MD_LABELED) && (ci == 0) &&
1327 		    nd->nd_labeled)
1328 			nd->nd_start_blk = 0;
1329 		else
1330 			nd->nd_nblks -= nd->nd_start_blk;
1331 
1332 		if (dev_size > nd->nd_nblks) {
1333 			md_unit_writerexit(ui);
1334 			return (MDE_COMP_TOO_SMALL);
1335 		}
1336 
1337 		sv = (sv_dev_t *)kmem_alloc(sizeof (sv_dev_t), KM_SLEEP);
1338 		sv->setno = MD_MIN2SET(mnum);
1339 		sv->key = comp->un_key;
1340 	}
1341 
1342 	/*
1343 	 * Close this component.
1344 	 */
1345 	if (comp->un_mirror.ms_flags & MDM_S_ISOPEN) {
1346 		md_layered_close(comp->un_dev, MD_OFLG_NULL);
1347 		comp->un_mirror.ms_flags &= ~MDM_S_ISOPEN;
1348 	}
1349 
1350 	/*
1351 	 * If the component is hotspared, return to the pool.
1352 	 */
1353 	if (comp->un_mirror.ms_hs_id != 0) {
1354 		hs_cmds_t	cmd;
1355 		mdkey_t		hs_key;
1356 
1357 		hs_key = comp->un_mirror.ms_hs_key;
1358 		comp->un_dev = comp->un_mirror.ms_orig_dev;
1359 		comp->un_start_block = comp->un_mirror.ms_orig_blk;
1360 		comp->un_mirror.ms_hs_key = 0;
1361 		comp->un_mirror.ms_hs_id = 0;
1362 		comp->un_mirror.ms_orig_dev = 0;
1363 
1364 		cmd = HS_FREE;
1365 		if ((comp->un_mirror.ms_state != CS_OKAY) &&
1366 		    (comp->un_mirror.ms_state != CS_RESYNC))
1367 			cmd = HS_BAD;
1368 		(void) md_hot_spare_ifc(cmd, un->un_hsp_id, 0, 0, &hs_id,
1369 		    &hs_key, NULL, NULL);
1370 	}
1371 
1372 	/*
1373 	 * Open by device id; for enable (indicated by a NULL
1374 	 * nd pointer), use the existing component info.  For
1375 	 * replace, use the new device.
1376 	 */
1377 	if (nd == NULL) {
1378 		this_dev = md_resolve_bydevid(mnum, comp->un_dev, comp->un_key);
1379 		/*
1380 		 * If someone replaced a new disk in the same slot
1381 		 * we get NODEV64 since old device id cannot be
1382 		 * resolved. The new devt is obtained from the
1383 		 * mddb since devt is going to be unchanged for the
1384 		 * enable case. No need to check for multiple
1385 		 * keys here because the caller (comp_replace)
1386 		 * has already sanity checked it for us.
1387 		 */
1388 		if (this_dev == NODEV64) {
1389 			this_dev = md_getdevnum(setno, side, comp->un_key,
1390 			    MD_TRUST_DEVT);
1391 		}
1392 	} else {
1393 		/*
1394 		 * If this is a hotspare, save the original dev_t for later
1395 		 * use. If this has occured during boot then the value of
1396 		 * comp->un_dev will be NODEV64 because of the failure to look
1397 		 * up the devid of the device.
1398 		 */
1399 		if (nd->nd_hs_id != 0)
1400 			comp->un_mirror.ms_orig_dev = comp->un_dev;
1401 		this_dev = md_resolve_bydevid(mnum, nd->nd_dev, nd->nd_key);
1402 	}
1403 
1404 	comp->un_dev = this_dev;
1405 
1406 	/*
1407 	 * Now open the new device if required. Note for a single component
1408 	 * stripe it will not be open - leave this for the mirror driver to
1409 	 * deal with.
1410 	 */
1411 	if (md_unit_isopen(ui)) {
1412 		if (md_layered_open(mnum, &this_dev, MD_OFLG_NULL)) {
1413 			mddb_recid_t	ids[3];
1414 
1415 			ids[0] = un->c.un_record_id;
1416 			ids[1] = hs_id;
1417 			ids[2] = 0;
1418 			mddb_commitrecs_wrapper(ids);
1419 			if ((nd != NULL) && (nd->nd_hs_id != 0)) {
1420 				/*
1421 				 * Revert back to the original device.
1422 				 */
1423 				comp->un_dev = comp->un_mirror.ms_orig_dev;
1424 
1425 				cmn_err(CE_WARN,
1426 				    "md: %s: open error of hotspare %s",
1427 				    md_shortname(mnum),
1428 				    md_devname(MD_MIN2SET(mnum), nd->nd_dev,
1429 				    NULL, 0));
1430 				SE_NOTIFY(EC_SVM_STATE, ESC_SVM_OPEN_FAIL,
1431 				    SVM_TAG_HS, MD_MIN2SET(mnum), nd->nd_dev);
1432 			}
1433 			md_unit_writerexit(ui);
1434 			return (MDE_COMP_OPEN_ERR);
1435 		}
1436 		if (nd != NULL)
1437 			nd->nd_dev = this_dev;
1438 
1439 		comp->un_mirror.ms_flags |= MDM_S_ISOPEN;
1440 	}
1441 
1442 	if (nd == NULL) {
1443 		recids[0] = un->c.un_record_id;
1444 		recids[1] = hs_id;
1445 		recids[2] = 0;
1446 		*replace_done = stripe_replace_done;
1447 		return (0);
1448 	}
1449 
1450 	/* if hot sparing this device */
1451 	if (nd->nd_hs_id != 0) {
1452 		char	devname[MD_MAX_CTDLEN];
1453 		char	hs_devname[MD_MAX_CTDLEN];
1454 		set_t	setno;
1455 
1456 		comp->un_mirror.ms_hs_id = nd->nd_hs_id;
1457 		comp->un_mirror.ms_hs_key = nd->nd_key;
1458 
1459 		comp->un_mirror.ms_orig_blk = comp->un_start_block;
1460 
1461 		setno = MD_MIN2SET(mnum);
1462 
1463 		(void) md_devname(setno, comp->un_mirror.ms_orig_dev, devname,
1464 					sizeof (devname));
1465 		(void) md_devname(setno, nd->nd_dev, hs_devname,
1466 		    sizeof (hs_devname));
1467 
1468 		cmn_err(CE_NOTE, "md: %s: hotspared device %s with %s",
1469 		    md_shortname(mnum), devname, hs_devname);
1470 
1471 	} else {	/* replacing the device */
1472 		comp->un_key = nd->nd_key;
1473 		*replace_data = (void *)sv;
1474 
1475 		/*
1476 		 * For the old device, make sure to reset the parent
1477 		 * if it's a  metadevice.
1478 		 */
1479 		if (md_getmajor(comp->un_dev) == md_major) {
1480 			minor_t	  comp_mnum = md_getminor(comp->un_dev);
1481 			md_unit_t *comp_un = MD_UNIT(comp_mnum);
1482 
1483 			md_reset_parent(comp->un_dev);
1484 			recids[rid++] = MD_RECID(comp_un);
1485 		}
1486 	}
1487 
1488 	comp->un_dev = nd->nd_dev;
1489 	comp->un_start_block = nd->nd_start_blk;
1490 
1491 	/*
1492 	 * For the new device, make sure to set the parent if it's a
1493 	 * metadevice.
1494 	 *
1495 	 * If we ever support using metadevices as hot spares, this
1496 	 * will need to be tested, and possibly moved into the
1497 	 * preceding "else" clause, immediately following the parent
1498 	 * reset block.  For now, it's convenient to leave it here and
1499 	 * only compress nd->nd_dev once.
1500 	 */
1501 	if (md_getmajor(comp->un_dev) == md_major) {
1502 		minor_t		comp_mnum = md_getminor(comp->un_dev);
1503 		md_unit_t	*comp_un = MD_UNIT(comp_mnum);
1504 
1505 		md_set_parent(comp->un_dev, MD_SID(un));
1506 		recids[rid++] = MD_RECID(comp_un);
1507 	}
1508 
1509 	recids[rid++] = un->c.un_record_id;
1510 	recids[rid++] = hs_id;
1511 	recids[rid] = 0;
1512 	*replace_done = stripe_replace_done;
1513 	return (0);
1514 }
1515 
1516 /*ARGSUSED*/
1517 static intptr_t
1518 stripe_hotspare_dev(
1519 	md_dev64_t	dev,
1520 	void		*junk,
1521 	int		ci,
1522 	mddb_recid_t	*recids,
1523 	int		nrecids,
1524 	void		(**replace_done)(),
1525 	void		**replace_data)
1526 {
1527 	ms_unit_t	*un;
1528 	mdi_unit_t	*ui;
1529 	ms_comp_t	*comp;
1530 	int		row;
1531 	struct ms_row	*mdr;
1532 	ms_new_dev_t	nd;
1533 	int		err;
1534 	int		i;
1535 	minor_t		mnum;
1536 	set_t		setno;
1537 	int		cmpcount = 0;
1538 
1539 	mnum = md_getminor(dev);
1540 	ui = MDI_UNIT(mnum);
1541 	un = MD_UNIT(mnum);
1542 	setno = MD_MIN2SET(mnum);
1543 
1544 	if (md_get_setstatus(setno) & MD_SET_STALE)
1545 		return (1);
1546 
1547 	if (un->un_hsp_id == -1)
1548 		return (1);
1549 
1550 	for (row = 0; row < un->un_nrows; row++) {
1551 		mdr = &un->un_row[row];
1552 		if ((mdr->un_ncomp + cmpcount) > ci)
1553 			break;
1554 		cmpcount += mdr->un_ncomp;
1555 	}
1556 	ASSERT(row != un->un_nrows);
1557 
1558 	comp = (struct ms_comp *)((void *)&((char *)un)[un->un_ocomp]);
1559 	comp += ci;
1560 	/* Concatenations have a ncomp = 1 */
1561 	nd.nd_nblks = mdr->un_blocks / mdr->un_ncomp;
1562 
1563 	if ((un->c.un_flag & MD_LABELED) && (ci == 0))
1564 		nd.nd_labeled = 1;
1565 	else
1566 		nd.nd_labeled = 0;
1567 
1568 again:
1569 	err = md_hot_spare_ifc(HS_GET, un->un_hsp_id, nd.nd_nblks,
1570 	    nd.nd_labeled, &nd.nd_hs_id, &nd.nd_key, &nd.nd_dev,
1571 	    &nd.nd_start_blk);
1572 
1573 	if (err) {
1574 		if (!stripe_replace_dev(dev, junk, ci, NULL, recids, nrecids,
1575 		    replace_done, replace_data)) {
1576 			mddb_commitrecs_wrapper(recids);
1577 			md_unit_writerexit(ui);
1578 		}
1579 		recids[0] = 0;
1580 		return (1);
1581 	}
1582 
1583 	if (stripe_replace_dev(dev, junk, ci, &nd, recids, nrecids,
1584 		replace_done, replace_data)) {
1585 
1586 		(void) md_hot_spare_ifc(HS_BAD, un->un_hsp_id, 0, 0,
1587 		    &nd.nd_hs_id, &nd.nd_key, NULL, NULL);
1588 		mddb_commitrec_wrapper(nd.nd_hs_id);
1589 		goto again;
1590 	}
1591 
1592 	/* Leave a slot for the null recid */
1593 	for (i = 0; i < (nrecids - 1); i++) {
1594 		if (recids[i] == 0) {
1595 			recids[i++] = nd.nd_hs_id;
1596 			recids[i] = 0;
1597 		}
1598 	}
1599 	return (0);
1600 }
1601 
1602 static int
1603 stripe_imp_set(
1604 	set_t	setno
1605 )
1606 {
1607 
1608 	mddb_recid_t	recid;
1609 	int		i, row, c, gotsomething;
1610 	mddb_type_t	typ1;
1611 	mddb_de_ic_t	*dep;
1612 	mddb_rb32_t	*rbp;
1613 	ms_unit32_od_t	*un32;
1614 	ms_unit_t	*un64;
1615 	md_dev64_t	self_devt;
1616 	minor_t		*self_id;	/* minor needs to be updated */
1617 	md_parent_t	*parent_id;	/* parent needs to be updated */
1618 	mddb_recid_t	*record_id;	/* record id needs to be updated */
1619 	mddb_recid_t	*hsp_id;
1620 	ms_comp32_od_t	*comp32;
1621 	ms_comp_t	*comp64;
1622 
1623 
1624 	gotsomething = 0;
1625 
1626 	typ1 = (mddb_type_t)md_getshared_key(setno,
1627 	    stripe_md_ops.md_driver.md_drivername);
1628 	recid = mddb_makerecid(setno, 0);
1629 
1630 	while ((recid = mddb_getnextrec(recid, typ1, 0)) > 0) {
1631 		if (mddb_getrecprivate(recid) & MD_PRV_GOTIT)
1632 			continue;
1633 
1634 		dep = mddb_getrecdep(recid);
1635 		rbp = dep->de_rb;
1636 
1637 		switch (rbp->rb_revision) {
1638 		case MDDB_REV_RB:
1639 		case MDDB_REV_RBFN:
1640 			/*
1641 			 * Small device
1642 			 */
1643 			un32 = (ms_unit32_od_t *)mddb_getrecaddr(recid);
1644 			self_id = &(un32->c.un_self_id);
1645 			parent_id = &(un32->c.un_parent);
1646 			record_id = &(un32->c.un_record_id);
1647 			hsp_id = &(un32->un_hsp_id);
1648 
1649 			comp32 = (ms_comp32_od_t *)((void *)&((char *)un32)
1650 				[un32->un_ocomp]);
1651 			for (row = 0; row < un32->un_nrows; row++) {
1652 			    struct ms_row32_od *mdr = &un32->un_row[row];
1653 			    for (i = 0, c = mdr->un_icomp;
1654 				i < mdr->un_ncomp; i++) {
1655 				ms_comp32_od_t *mdc;
1656 				mdc = &comp32[c++];
1657 
1658 				if (!md_update_minor(setno, mddb_getsidenum
1659 				    (setno), mdc->un_key))
1660 					goto out;
1661 
1662 				if (mdc->un_mirror.ms_hs_id != 0)
1663 				    mdc->un_mirror.ms_hs_id = MAKERECID(
1664 				    setno, mdc->un_mirror.ms_hs_id);
1665 			    }
1666 			}
1667 			break;
1668 		case MDDB_REV_RB64:
1669 		case MDDB_REV_RB64FN:
1670 			un64 = (ms_unit_t *)mddb_getrecaddr(recid);
1671 			self_id = &(un64->c.un_self_id);
1672 			parent_id = &(un64->c.un_parent);
1673 			record_id = &(un64->c.un_record_id);
1674 			hsp_id = &(un64->un_hsp_id);
1675 
1676 			comp64 = (ms_comp_t *)((void *)&((char *)un64)
1677 				[un64->un_ocomp]);
1678 			for (row = 0; row < un64->un_nrows; row++) {
1679 			    struct ms_row *mdr = &un64->un_row[row];
1680 			    for (i = 0, c = mdr->un_icomp;
1681 				i < mdr->un_ncomp; i++) {
1682 				ms_comp_t *mdc;
1683 				mdc = &comp64[c++];
1684 
1685 				if (!md_update_minor(setno, mddb_getsidenum
1686 				    (setno), mdc->un_key))
1687 					goto out;
1688 
1689 				if (mdc->un_mirror.ms_hs_id != 0)
1690 				    mdc->un_mirror.ms_hs_id = MAKERECID(
1691 				    setno, mdc->un_mirror.ms_hs_id);
1692 			    }
1693 			}
1694 			break;
1695 		}
1696 
1697 		/*
1698 		 * If this is a top level and a friendly name metadevice,
1699 		 * update its minor in the namespace.
1700 		 */
1701 		if ((*parent_id == MD_NO_PARENT) &&
1702 		    ((rbp->rb_revision == MDDB_REV_RBFN) ||
1703 		    (rbp->rb_revision == MDDB_REV_RB64FN))) {
1704 
1705 			self_devt = md_makedevice(md_major, *self_id);
1706 			if (!md_update_top_device_minor(setno,
1707 			    mddb_getsidenum(setno), self_devt))
1708 				goto out;
1709 		}
1710 
1711 		/*
1712 		 * Update unit with the imported setno
1713 		 *
1714 		 */
1715 		mddb_setrecprivate(recid, MD_PRV_GOTIT);
1716 
1717 		*self_id = MD_MKMIN(setno, MD_MIN2UNIT(*self_id));
1718 
1719 		if (*hsp_id != -1)
1720 			*hsp_id = MAKERECID(setno, DBID(*hsp_id));
1721 
1722 		if (*parent_id != MD_NO_PARENT)
1723 			*parent_id = MD_MKMIN(setno, MD_MIN2UNIT(*parent_id));
1724 		*record_id = MAKERECID(setno, DBID(*record_id));
1725 
1726 		gotsomething = 1;
1727 	}
1728 
1729 out:
1730 	return (gotsomething);
1731 }
1732 
1733 static md_named_services_t stripe_named_services[] = {
1734 	{stripe_shared_by_blk,			"shared by blk"		    },
1735 	{stripe_shared_by_indx,			"shared by indx"	    },
1736 	{stripe_component_count,		"get component count"	    },
1737 	{stripe_block_count_skip_size,		"get block count skip size" },
1738 	{stripe_get_dev,			"get device"		    },
1739 	{stripe_replace_dev,			"replace device"	    },
1740 	{stripe_hotspare_dev,			"hotspare device"	    },
1741 	{stripe_rename_check,			MDRNM_CHECK		    },
1742 	{NULL,					0}
1743 };
1744 
1745 md_ops_t stripe_md_ops = {
1746 	stripe_open,		/* open */
1747 	stripe_close,		/* close */
1748 	md_stripe_strategy,	/* strategy */
1749 	NULL,			/* print */
1750 	stripe_dump,		/* dump */
1751 	NULL,			/* read */
1752 	NULL,			/* write */
1753 	md_stripe_ioctl,	/* stripe_ioctl, */
1754 	stripe_snarf,		/* stripe_snarf */
1755 	stripe_halt,		/* stripe_halt */
1756 	NULL,			/* aread */
1757 	NULL,			/* awrite */
1758 	stripe_imp_set,		/* import set */
1759 	stripe_named_services
1760 };
1761 
1762 static void
1763 init_init()
1764 {
1765 	md_stripe_mcs_buf_off = sizeof (md_scs_t) - sizeof (buf_t);
1766 
1767 	stripe_parent_cache = kmem_cache_create("md_stripe_parent",
1768 	    sizeof (md_sps_t), 0, stripe_parent_constructor,
1769 	    stripe_parent_destructor, stripe_run_queue, NULL, NULL,
1770 	    0);
1771 	stripe_child_cache = kmem_cache_create("md_stripe_child",
1772 	    sizeof (md_scs_t) - sizeof (buf_t) + biosize(), 0,
1773 	    stripe_child_constructor, stripe_child_destructor,
1774 	    stripe_run_queue, NULL, NULL, 0);
1775 }
1776 
1777 static void
1778 fini_uninit()
1779 {
1780 	kmem_cache_destroy(stripe_parent_cache);
1781 	kmem_cache_destroy(stripe_child_cache);
1782 	stripe_parent_cache = stripe_child_cache = NULL;
1783 }
1784 
/*
 * Define the module linkage.  The macro is given the module description
 * string together with init_init() and fini_uninit(), the setup and
 * teardown routines defined above.  (%I% is presumably an SCCS keyword
 * that is expanded to the file revision - confirm.)
 */
MD_PLUGIN_MISC_MODULE("stripes module %I%", init_init(), fini_uninit())
1787