xref: /titanic_41/usr/src/uts/common/io/lvm/md/md.c (revision 66310b5b6be388602e7aa8639a404efd2ddb5687)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 /*
29  * Md - is the meta-disk driver.   It sits below the UFS file system
30  * but above the 'real' disk drivers, xy, id, sd etc.
31  *
32  * To the UFS software, md looks like a normal driver, since it has
33  * the normal kinds of entries in the bdevsw and cdevsw arrays. So
34  * UFS accesses md in the usual ways.  In particular, the strategy
35  * routine, mdstrategy(), gets called by fbiwrite(), ufs_getapage(),
36  * and ufs_writelbn().
37  *
38  * Md maintains an array of minor devices (meta-partitions).   Each
39  * meta partition stands for a matrix of real partitions, in rows
40  * which are not necessarily of equal length.	Md maintains a table,
41  * with one entry for each meta-partition,  which lists the rows and
42  * columns of actual partitions, and the job of the strategy routine
43  * is to translate from the meta-partition device and block numbers
44  * known to UFS into the actual partitions' device and block numbers.
45  *
46  * See below, in mdstrategy(), mdreal(), and mddone() for details of
47  * this translation.
48  */
49 
50 /*
51  * Driver for Virtual Disk.
52  */
53 
54 #include <sys/user.h>
55 #include <sys/sysmacros.h>
56 #include <sys/conf.h>
57 #include <sys/stat.h>
58 #include <sys/errno.h>
59 #include <sys/param.h>
60 #include <sys/systm.h>
61 #include <sys/file.h>
62 #include <sys/open.h>
63 #include <sys/dkio.h>
64 #include <sys/vtoc.h>
65 #include <sys/cmn_err.h>
66 #include <sys/ddi.h>
67 #include <sys/sunddi.h>
68 #include <sys/debug.h>
69 #include <sys/utsname.h>
70 #include <sys/lvm/mdvar.h>
71 #include <sys/lvm/md_names.h>
72 #include <sys/lvm/md_mddb.h>
73 #include <sys/lvm/md_sp.h>
74 #include <sys/types.h>
75 #include <sys/kmem.h>
76 #include <sys/cladm.h>
77 #include <sys/priv_names.h>
78 
79 #ifndef	lint
80 char 		_depends_on[] = "strmod/rpcmod";
81 #endif	/* lint */
82 int		md_init_debug	= 0;	/* module binding debug */
83 
84 /*
85  * Tunable to turn off the failfast behavior.
86  */
87 int		md_ff_disable = 0;
88 
89 /*
90  * dynamically allocated list of non FF driver names - needs to
91  * be freed when md is detached.
92  */
93 char	**non_ff_drivers = NULL;
94 
95 md_krwlock_t	md_unit_array_rw;	/* protects all unit arrays */
96 md_krwlock_t	nm_lock;		/* protects all the name spaces */
97 
98 md_resync_t	md_cpr_resync;
99 
100 extern char	svm_bootpath[];
101 #define	SVM_PSEUDO_STR	"/pseudo/md@0:"
102 
103 #define		VERSION_LENGTH	6
104 #define		VERSION		"1.0"
105 
106 /*
107  * Keep track of possible 'orphan' entries in the name space
108  */
109 int		*md_nm_snarfed = NULL;
110 
111 /*
112  * Global tunable giving the percentage of free space left in replica during
113  * conversion of non-devid style replica to devid style replica.
114  */
115 int		md_conv_perc = MDDB_DEVID_CONV_PERC;
116 
117 #ifdef	DEBUG
118 /* debug code to verify framework exclusion guarantees */
119 int		md_in;
120 kmutex_t	md_in_mx;			/* used to md global stuff */
121 #define	IN_INIT		0x01
122 #define	IN_FINI		0x02
123 #define	IN_ATTACH	0x04
124 #define	IN_DETACH	0x08
125 #define	IN_OPEN		0x10
/*
 * MD_SET_IN(x): DEBUG-only bookkeeping that marks framework entry point(s)
 * 'x' (a mask of IN_* bits) as active in md_in, under md_in_mx.  Drops into
 * the debugger if any entry point is already marked active or if 'x' itself
 * is already set.  The argument is parenthesized at every use so that a
 * compound mask such as (IN_ATTACH | IN_DETACH) is evaluated as one operand.
 */
#define	MD_SET_IN(x) {						\
	mutex_enter(&md_in_mx);					\
	if (md_in)						\
		debug_enter("MD_SET_IN exclusion lost");	\
	if (md_in & (x))					\
		debug_enter("MD_SET_IN already set");		\
	md_in |= (x);						\
	mutex_exit(&md_in_mx);					\
}
135 
/*
 * MD_CLR_IN(x): DEBUG-only counterpart of MD_SET_IN.  Clears the IN_* bit(s)
 * 'x' from md_in under md_in_mx.  Drops into the debugger if any bit other
 * than 'x' is set, or if 'x' is not currently set.  The argument is
 * parenthesized at every use so that a compound mask is cleared as a whole.
 */
#define	MD_CLR_IN(x) {						\
	mutex_enter(&md_in_mx);					\
	if (md_in & ~(x))					\
		debug_enter("MD_CLR_IN exclusion lost");	\
	if (!(md_in & (x)))					\
		debug_enter("MD_CLR_IN already clr");		\
	md_in &= ~(x);						\
	mutex_exit(&md_in_mx);					\
}
145 #else	/* DEBUG */
146 #define	MD_SET_IN(x)
147 #define	MD_CLR_IN(x)
148 #endif	/* DEBUG */
149 hrtime_t savetime1, savetime2;
150 
151 
152 /*
153  * list things protected by md_mx even if they aren't
154  * used in this file.
155  */
156 kmutex_t	md_mx;			/* used to md global stuff */
157 kcondvar_t	md_cv;			/* md_status events */
158 int		md_status = 0;		/* global status for the meta-driver */
159 int		md_num_daemons = 0;
160 int		md_ioctl_cnt = 0;
161 int		md_mtioctl_cnt = 0;	/* multithreaded ioctl cnt */
162 uint_t		md_mdelay = 10;		/* variable so can be patched */
163 
164 int		(*mdv_strategy_tstpnt)(buf_t *, int, void*);
165 
166 major_t		md_major, md_major_targ;
167 
168 unit_t		md_nunits = MD_MAXUNITS;
169 set_t		md_nsets = MD_MAXSETS;
170 int		md_nmedh = 0;
171 char		*md_med_trans_lst = NULL;
172 md_set_t	md_set[MD_MAXSETS];
173 md_set_io_t	md_set_io[MD_MAXSETS];
174 
175 md_krwlock_t	hsp_rwlp;		/* protects hot_spare_interface */
176 md_krwlock_t	ni_rwlp;		/* protects notify_interface */
177 md_ops_t	**md_ops = NULL;
178 ddi_modhandle_t	*md_mods = NULL;
179 md_ops_t	*md_opslist;
180 clock_t		md_hz;
181 md_event_queue_t	*md_event_queue = NULL;
182 
183 int		md_in_upgrade;
184 int		md_keep_repl_state;
185 int		md_devid_destroy;
186 
187 /* for sending messages thru a door to userland */
188 door_handle_t	mdmn_door_handle = NULL;
189 int		mdmn_door_did = -1;
190 
191 dev_info_t		*md_devinfo = NULL;
192 
193 md_mn_nodeid_t	md_mn_mynode_id = ~0u;	/* My node id (for multi-node sets) */
194 
195 static	uint_t		md_ocnt[OTYPCNT];
196 
197 static int		mdinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
198 static int		mdattach(dev_info_t *, ddi_attach_cmd_t);
199 static int		mddetach(dev_info_t *, ddi_detach_cmd_t);
200 static int		mdopen(dev_t *, int, int, cred_t *);
201 static int		mdclose(dev_t, int, int, cred_t *);
202 static int		mddump(dev_t, caddr_t, daddr_t, int);
203 static int		mdread(dev_t, struct uio *, cred_t *);
204 static int		mdwrite(dev_t, struct uio *, cred_t *);
205 static int		mdaread(dev_t, struct aio_req *, cred_t *);
206 static int		mdawrite(dev_t, struct aio_req *, cred_t *);
207 static int		mdioctl(dev_t, int, intptr_t, int, cred_t *, int *);
208 static int		mdprop_op(dev_t, dev_info_t *,
209 				ddi_prop_op_t, int, char *, caddr_t, int *);
210 
/*
 * Character/block entry point table for the md driver.  It is referenced
 * from md_devops.  Entry points md does not provide are filled with the
 * nulldev/nodev/nochpoll stubs, cast to the expected signature where needed.
 */
211 static struct cb_ops md_cb_ops = {
212 	mdopen,			/* open */
213 	mdclose,		/* close */
214 	mdstrategy,		/* strategy */
215 				/* print: no routine yet, stubbed with nulldev */
216 	(int(*)(dev_t, char *))nulldev,
217 	mddump,			/* dump */
218 	mdread,			/* read */
219 	mdwrite,		/* write */
220 	mdioctl,		/* ioctl */
221 				/* devmap: stubbed with nodev */
222 	(int(*)(dev_t, devmap_cookie_t, offset_t, size_t, size_t *,
223 			uint_t))nodev,
224 				/* mmap: stubbed with nodev */
225 	(int(*)(dev_t, off_t, int))nodev,
226 				/* segmap: stubbed with nodev */
227 	(int(*)(dev_t, off_t, struct as *, caddr_t *, off_t, unsigned,
228 		unsigned, unsigned, cred_t *))nodev,
229 	nochpoll,		/* poll */
230 	mdprop_op,		/* prop_op */
231 	0,			/* streamtab: none */
232 	(D_64BIT|D_MP|D_NEW),	/* driver compatibility flag */
233 	CB_REV,			/* cb_ops version */
234 	mdaread,		/* aread */
235 	mdawrite,		/* awrite */
236 };
237 
/*
 * Device operations table for the md driver.  It wires up the info, attach
 * and detach routines defined in this file plus the md_cb_ops entry point
 * table; identify and probe are nulldev stubs, reset and power management
 * are nodev stubs, and there are no bus operations.
 */
238 static struct dev_ops md_devops = {
239 	DEVO_REV,		/* dev_ops version */
240 	0,			/* device reference count */
241 	mdinfo,			/* info routine */
242 	nulldev,		/* identify routine */
243 	nulldev,		/* probe - not defined */
244 	mdattach,		/* attach routine */
245 	mddetach,		/* detach routine */
246 	nodev,			/* reset - not defined */
247 	&md_cb_ops,		/* driver operations */
248 	NULL,			/* bus operations */
249 	nodev			/* power management */
250 };
251 
252 /*
253  * loadable module wrapper
254  */
255 #include <sys/modctl.h>
256 
/*
 * Driver linkage record: ties the module name string to md_devops.  It is
 * the single entry listed in modlinkage.
 */
257 static struct modldrv modldrv = {
258 	&mod_driverops,			/* type of module -- a pseudodriver */
259 	"Solaris Volume Manager base module", /* name of the module */
260 	&md_devops,			/* driver ops */
261 };
262 
/*
 * Module linkage handed to mod_install(), mod_remove() and mod_info() by
 * _init(), _fini() and _info().  The list holds modldrv and is terminated
 * by NULL.
 */
263 static struct modlinkage modlinkage = {
264 	MODREV_1,
265 	(void *)&modldrv,
266 	NULL
267 };
268 
269 
270 /* md_medd.c */
271 extern	void	med_init(void);
272 extern	void	med_fini(void);
273 extern  void	md_devid_cleanup(set_t, uint_t);
274 
275 /* md_names.c */
276 extern void			*lookup_entry(struct nm_next_hdr *, set_t,
277 					side_t, mdkey_t, md_dev64_t, int);
278 extern struct nm_next_hdr	*get_first_record(set_t, int, int);
279 extern int			remove_entry(struct nm_next_hdr *,
280 					side_t, mdkey_t, int);
281 
282 int		md_maxphys	= 0;	/* maximum io size in bytes */
283 #define		MD_MAXBCOUNT	(1024 * 1024)
284 unsigned	md_maxbcount	= 0;	/* maximum physio size in bytes */
285 
286 /* allocate/free dynamic space associated with driver globals */
287 void
288 md_global_alloc_free(int alloc)
289 {
290 	set_t	s;
291 
292 	if (alloc) {
293 		/* initialize driver global locks */
294 		cv_init(&md_cv, NULL, CV_DEFAULT, NULL);
295 		mutex_init(&md_mx, NULL, MUTEX_DEFAULT, NULL);
296 		rw_init(&md_unit_array_rw.lock, NULL, RW_DEFAULT, NULL);
297 		rw_init(&nm_lock.lock, NULL, RW_DEFAULT, NULL);
298 		rw_init(&ni_rwlp.lock, NULL, RW_DRIVER, NULL);
299 		rw_init(&hsp_rwlp.lock, NULL, RW_DRIVER, NULL);
300 		mutex_init(&md_cpr_resync.md_resync_mutex, NULL,
301 			MUTEX_DEFAULT, NULL);
302 
303 		/* initialize per set driver global locks */
304 		for (s = 0; s < MD_MAXSETS; s++) {
305 			/* initialize per set driver globals locks */
306 			mutex_init(&md_set[s].s_dbmx,
307 			    NULL, MUTEX_DEFAULT, NULL);
308 			mutex_init(&md_set_io[s].md_io_mx,
309 			    NULL, MUTEX_DEFAULT, NULL);
310 			cv_init(&md_set_io[s].md_io_cv,
311 			    NULL, CV_DEFAULT, NULL);
312 		}
313 	} else {
314 		/* destroy per set driver global locks */
315 		for (s = 0; s < MD_MAXSETS; s++) {
316 			cv_destroy(&md_set_io[s].md_io_cv);
317 			mutex_destroy(&md_set_io[s].md_io_mx);
318 			mutex_destroy(&md_set[s].s_dbmx);
319 		}
320 
321 		/* destroy driver global locks */
322 		mutex_destroy(&md_cpr_resync.md_resync_mutex);
323 		rw_destroy(&hsp_rwlp.lock);
324 		rw_destroy(&ni_rwlp.lock);
325 		rw_destroy(&nm_lock.lock);
326 		rw_destroy(&md_unit_array_rw.lock);
327 		mutex_destroy(&md_mx);
328 		cv_destroy(&md_cv);
329 	}
330 }
331 
332 int
333 _init(void)
334 {
335 	set_t	s;
336 	int	err;
337 
338 	MD_SET_IN(IN_INIT);
339 
340 	/* allocate dynamic space associated with driver globals */
341 	md_global_alloc_free(1);
342 
343 	/* initialize driver globals */
344 	md_major = ddi_name_to_major("md");
345 	md_hz = drv_usectohz(NUM_USEC_IN_SEC);
346 
347 	/* initialize tunable globals */
348 	if (md_maxphys == 0)		/* maximum io size in bytes */
349 		md_maxphys = maxphys;
350 	if (md_maxbcount == 0)		/* maximum physio size in bytes */
351 		md_maxbcount = MD_MAXBCOUNT;
352 
353 	/* initialize per set driver globals */
354 	for (s = 0; s < MD_MAXSETS; s++)
355 		md_set_io[s].io_state = MD_SET_ACTIVE;
356 
357 	/*
358 	 * NOTE: the framework does not currently guarantee exclusion
359 	 * between _init and attach after calling mod_install.
360 	 */
361 	MD_CLR_IN(IN_INIT);
362 	if ((err = mod_install(&modlinkage))) {
363 		MD_SET_IN(IN_INIT);
364 		md_global_alloc_free(0);	/* free dynamic space */
365 		MD_CLR_IN(IN_INIT);
366 	}
367 	return (err);
368 }
369 
370 int
371 _fini(void)
372 {
373 	int	err;
374 
375 	/*
376 	 * NOTE: the framework currently does not guarantee exclusion
377 	 * with attach until after mod_remove returns 0.
378 	 */
379 	if ((err = mod_remove(&modlinkage)))
380 		return (err);
381 
382 	MD_SET_IN(IN_FINI);
383 	md_global_alloc_free(0);	/* free dynamic space */
384 	MD_CLR_IN(IN_FINI);
385 	return (err);
386 }
387 
388 int
389 _info(struct modinfo *modinfop)
390 {
391 	return (mod_info(&modlinkage, modinfop));
392 }
393 
394 /* ARGSUSED */
395 static int
396 mdattach(dev_info_t *dip, ddi_attach_cmd_t cmd)
397 {
398 	int	len;
399 	unit_t	i;
400 	size_t	sz;
401 	char	ver[VERSION_LENGTH];
402 	char	**maj_str_array;
403 	char	*str, *str2;
404 
405 	MD_SET_IN(IN_ATTACH);
406 	md_in_upgrade = 0;
407 	md_keep_repl_state = 0;
408 	md_devid_destroy = 0;
409 
410 	if (cmd != DDI_ATTACH) {
411 		MD_CLR_IN(IN_ATTACH);
412 		return (DDI_FAILURE);
413 	}
414 
415 	if (md_devinfo != NULL) {
416 		MD_CLR_IN(IN_ATTACH);
417 		return (DDI_FAILURE);
418 	}
419 
420 	mddb_init();
421 
422 	if (md_start_daemons(TRUE)) {
423 		MD_CLR_IN(IN_ATTACH);
424 		mddb_unload();		/* undo mddb_init() allocations */
425 		return (DDI_FAILURE);
426 	}
427 
428 	/* clear the halted state */
429 	md_clr_status(MD_GBL_HALTED);
430 
431 	/* see if the diagnostic switch is on */
432 	if (ddi_prop_get_int(DDI_DEV_T_ANY, dip,
433 	    DDI_PROP_DONTPASS, "md_init_debug", 0))
434 		md_init_debug++;
435 
436 	/* see if the failfast disable switch is on */
437 	if (ddi_prop_get_int(DDI_DEV_T_ANY, dip,
438 	    DDI_PROP_DONTPASS, "md_ff_disable", 0))
439 		md_ff_disable++;
440 
441 	/* try and get the md_nmedh property */
442 	md_nmedh = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
443 	    DDI_PROP_DONTPASS, "md_nmedh", MED_DEF_HOSTS);
444 	if ((md_nmedh <= 0) || (md_nmedh > MED_MAX_HOSTS))
445 		md_nmedh = MED_DEF_HOSTS;
446 
447 	/* try and get the md_med_trans_lst property */
448 	len = 0;
449 	if (ddi_prop_op(DDI_DEV_T_ANY, dip, PROP_LEN,
450 	    0, "md_med_trans_lst", NULL, &len) != DDI_PROP_SUCCESS ||
451 	    len == 0) {
452 		md_med_trans_lst = md_strdup("tcp");
453 	} else {
454 		md_med_trans_lst = kmem_zalloc((size_t)len, KM_SLEEP);
455 		if (ddi_prop_op(DDI_DEV_T_ANY, dip, PROP_LEN_AND_VAL_BUF,
456 		    0, "md_med_trans_lst", md_med_trans_lst, &len) !=
457 		    DDI_PROP_SUCCESS) {
458 			kmem_free(md_med_trans_lst, (size_t)len);
459 			md_med_trans_lst = md_strdup("tcp");
460 		}
461 	}
462 
463 	/*
464 	 * Must initialize the internal data structures before the
465 	 * any possible calls to 'goto attach_failure' as _fini
466 	 * routine references them.
467 	 */
468 	med_init();
469 
470 	md_ops = (md_ops_t **)kmem_zalloc(
471 	    sizeof (md_ops_t *) * MD_NOPS, KM_SLEEP);
472 	md_mods = (ddi_modhandle_t *)kmem_zalloc(
473 	    sizeof (ddi_modhandle_t) * MD_NOPS, KM_SLEEP);
474 
475 	/* try and get the md_xlate property */
476 	/* Should we only do this if upgrade? */
477 	len = sizeof (char) * 5;
478 	if (ddi_prop_op(DDI_DEV_T_ANY, dip, PROP_LEN_AND_VAL_BUF,
479 	    0, "md_xlate_ver", ver, &len) == DDI_PROP_SUCCESS) {
480 		if (strcmp(ver, VERSION) == 0) {
481 			len = 0;
482 			if (ddi_prop_op(DDI_DEV_T_ANY, dip,
483 			    PROP_LEN_AND_VAL_ALLOC, 0, "md_xlate",
484 			    (caddr_t)&md_tuple_table, &len) !=
485 			    DDI_PROP_SUCCESS) {
486 				if (md_init_debug)
487 					cmn_err(CE_WARN,
488 					    "md_xlate ddi_prop_op failed");
489 				goto attach_failure;
490 			} else {
491 				md_tuple_length =
492 				    len/(2 * ((int)sizeof (dev32_t)));
493 				md_in_upgrade = 1;
494 			}
495 
496 			/* Get target's name to major table */
497 			if (ddi_prop_lookup_string_array(DDI_DEV_T_ANY,
498 			    dip, DDI_PROP_DONTPASS,
499 			    "md_targ_nm_table", &maj_str_array,
500 			    &md_majortab_len) != DDI_PROP_SUCCESS) {
501 				md_majortab_len = 0;
502 				if (md_init_debug)
503 				    cmn_err(CE_WARN, "md_targ_nm_table "
504 				    "ddi_prop_lookup_string_array failed");
505 				goto attach_failure;
506 			}
507 
508 			md_major_tuple_table =
509 			    (struct md_xlate_major_table *)
510 			    kmem_zalloc(md_majortab_len *
511 			    sizeof (struct md_xlate_major_table), KM_SLEEP);
512 
513 			for (i = 0; i < md_majortab_len; i++) {
514 				/* Getting major name */
515 				str = strchr(maj_str_array[i], ' ');
516 				if (str == NULL)
517 					continue;
518 				*str = '\0';
519 				md_major_tuple_table[i].drv_name =
520 				    md_strdup(maj_str_array[i]);
521 
522 				/* Simplified atoi to get major number */
523 				str2 = str + 1;
524 				md_major_tuple_table[i].targ_maj = 0;
525 				while ((*str2 >= '0') && (*str2 <= '9')) {
526 				    md_major_tuple_table[i].targ_maj *= 10;
527 				    md_major_tuple_table[i].targ_maj +=
528 					*str2++ - '0';
529 				}
530 				*str = ' ';
531 			}
532 			ddi_prop_free((void *)maj_str_array);
533 		} else {
534 			if (md_init_debug)
535 				cmn_err(CE_WARN, "md_xlate_ver is incorrect");
536 			goto attach_failure;
537 		}
538 	}
539 
540 	/*
541 	 * Check for properties:
542 	 * 	md_keep_repl_state and md_devid_destroy
543 	 * and set globals if these exist.
544 	 */
545 	md_keep_repl_state = ddi_getprop(DDI_DEV_T_ANY, dip,
546 				    0, "md_keep_repl_state", 0);
547 
548 	md_devid_destroy = ddi_getprop(DDI_DEV_T_ANY, dip,
549 				    0, "md_devid_destroy", 0);
550 
551 	if (MD_UPGRADE)
552 		md_major_targ = md_targ_name_to_major("md");
553 	else
554 		md_major_targ = 0;
555 
556 	/* allocate admin device node */
557 	if (ddi_create_priv_minor_node(dip, "admin", S_IFCHR,
558 	    MD_ADM_MINOR, DDI_PSEUDO, 0, NULL, PRIV_SYS_CONFIG, 0640))
559 		goto attach_failure;
560 
561 	if (ddi_prop_create(DDI_DEV_T_NONE, dip, DDI_PROP_CANSLEEP,
562 	    DDI_KERNEL_IOCTL, NULL, 0) != DDI_SUCCESS)
563 		goto attach_failure;
564 
565 	if (ddi_prop_update_int(DDI_DEV_T_NONE, dip,
566 	    "ddi-abrwrite-supported", 1) != DDI_SUCCESS)
567 		goto attach_failure;
568 
569 	/* these could have been cleared by a detach */
570 	md_nunits = MD_MAXUNITS;
571 	md_nsets = MD_MAXSETS;
572 
573 	sz = sizeof (void *) * MD_MAXUNITS;
574 	if (md_set[0].s_un == NULL)
575 		md_set[0].s_un = kmem_zalloc(sz, KM_SLEEP);
576 	if (md_set[0].s_ui == NULL)
577 		md_set[0].s_ui = kmem_zalloc(sz, KM_SLEEP);
578 
579 	md_devinfo = dip;
580 
581 	/*
582 	 * Only allocate device node for root mirror metadevice.
583 	 * Don't pre-allocate unnecessary device nodes (thus slowing down a
584 	 * boot when we attach).
585 	 * We can't read the mddbs in attach.  The mddbs will be read
586 	 * by metainit during the boot process when it is doing the
587 	 * auto-take processing and any other minor nodes will be
588 	 * allocated at that point.
589 	 *
590 	 * There are two scenarios to be aware of here:
591 	 * 1) when we are booting from a mirrored root we need the root
592 	 *    metadevice to exist very early (during vfs_mountroot processing)
593 	 * 2) we need all of the nodes to be created so that any mnttab entries
594 	 *    will succeed (handled by metainit reading the mddb during boot).
595 	 */
596 	if (strncmp(SVM_PSEUDO_STR, svm_bootpath, sizeof (SVM_PSEUDO_STR) - 1)
597 	    == 0) {
598 		char *p;
599 		int mnum = 0;
600 
601 		/*
602 		 * The svm_bootpath string looks something like
603 		 * /pseudo/md@0:0,150,blk where 150 is the minor number
604 		 * in this example so we need to set the pointer p onto
605 		 * the first digit of the minor number and convert it
606 		 * from ascii.
607 		 */
608 		for (p = svm_bootpath + sizeof (SVM_PSEUDO_STR) + 1;
609 		    *p >= '0' && *p <= '9'; p++) {
610 			mnum *= 10;
611 			mnum += *p - '0';
612 		}
613 
614 		if (md_create_minor_node(0, mnum)) {
615 			kmem_free(md_set[0].s_un, sz);
616 			kmem_free(md_set[0].s_ui, sz);
617 			goto attach_failure;
618 		}
619 	}
620 
621 	MD_CLR_IN(IN_ATTACH);
622 	return (DDI_SUCCESS);
623 
624 attach_failure:
625 	/*
626 	 * Use our own detach routine to toss any stuff we allocated above.
627 	 * NOTE: detach will call md_halt to free the mddb_init allocations.
628 	 */
629 	MD_CLR_IN(IN_ATTACH);
630 	if (mddetach(dip, DDI_DETACH) != DDI_SUCCESS)
631 		cmn_err(CE_WARN, "detach from attach failed");
632 	return (DDI_FAILURE);
633 }
634 
635 /* ARGSUSED */
636 static int
637 mddetach(dev_info_t *dip, ddi_detach_cmd_t cmd)
638 {
639 	extern int	check_active_locators();
640 	set_t		s;
641 	size_t		sz;
642 	int		len;
643 
644 	MD_SET_IN(IN_DETACH);
645 
646 	/* check command */
647 	if (cmd != DDI_DETACH) {
648 		MD_CLR_IN(IN_DETACH);
649 		return (DDI_FAILURE);
650 	}
651 
652 	/*
653 	 * if we have not already halted yet we have no active config
654 	 * then automatically initiate a halt so we can detach.
655 	 */
656 	if (!(md_get_status() & MD_GBL_HALTED)) {
657 		if (check_active_locators() == 0) {
658 			/*
659 			 * NOTE: a successful md_halt will have done the
660 			 * mddb_unload to free allocations done in mddb_init
661 			 */
662 			if (md_halt(MD_NO_GBL_LOCKS_HELD)) {
663 				cmn_err(CE_NOTE, "md:detach: "
664 				    "Could not halt Solaris Volume Manager");
665 				MD_CLR_IN(IN_DETACH);
666 				return (DDI_FAILURE);
667 			}
668 		}
669 
670 		/* fail detach if we have not halted */
671 		if (!(md_get_status() & MD_GBL_HALTED)) {
672 			MD_CLR_IN(IN_DETACH);
673 			return (DDI_FAILURE);
674 		}
675 	}
676 
677 	/* must be in halted state, this will be cleared on next attach */
678 	ASSERT(md_get_status() & MD_GBL_HALTED);
679 
680 	/* cleanup attach allocations and initializations */
681 	md_major_targ = 0;
682 
683 	sz = sizeof (void *) * md_nunits;
684 	for (s = 0; s < md_nsets; s++) {
685 		if (md_set[s].s_un != NULL) {
686 			kmem_free(md_set[s].s_un, sz);
687 			md_set[s].s_un = NULL;
688 		}
689 
690 		if (md_set[s].s_ui != NULL) {
691 			kmem_free(md_set[s].s_ui, sz);
692 			md_set[s].s_ui = NULL;
693 		}
694 	}
695 	md_nunits = 0;
696 	md_nsets = 0;
697 	md_nmedh = 0;
698 
699 	if (non_ff_drivers != NULL) {
700 		int	i;
701 
702 		for (i = 0; non_ff_drivers[i] != NULL; i++)
703 		    kmem_free(non_ff_drivers[i], strlen(non_ff_drivers[i]) + 1);
704 
705 		/* free i+1 entries because there is a null entry at list end */
706 		kmem_free(non_ff_drivers, (i + 1) * sizeof (char *));
707 		non_ff_drivers = NULL;
708 	}
709 
710 	if (md_med_trans_lst != NULL) {
711 		kmem_free(md_med_trans_lst, strlen(md_med_trans_lst) + 1);
712 		md_med_trans_lst = NULL;
713 	}
714 
715 	if (md_mods != NULL) {
716 		kmem_free(md_mods, sizeof (ddi_modhandle_t) * MD_NOPS);
717 		md_mods = NULL;
718 	}
719 
720 	if (md_ops != NULL) {
721 		kmem_free(md_ops, sizeof (md_ops_t *) * MD_NOPS);
722 		md_ops = NULL;
723 	}
724 
725 	if (MD_UPGRADE) {
726 		len = md_tuple_length * (2 * ((int)sizeof (dev32_t)));
727 		md_in_upgrade = 0;
728 		md_xlate_free(len);
729 		md_majortab_free();
730 	}
731 
732 	/*
733 	 * Undo what we did in mdattach, freeing resources
734 	 * and removing things we installed.  The system
735 	 * framework guarantees we are not active with this devinfo
736 	 * node in any other entry points at this time.
737 	 */
738 	ddi_prop_remove_all(dip);
739 	ddi_remove_minor_node(dip, NULL);
740 
741 	med_fini();
742 	md_devinfo = NULL;
743 
744 	MD_CLR_IN(IN_DETACH);
745 	return (DDI_SUCCESS);
746 }
747 
748 
749 /*
750  * Given the device number return the devinfo pointer
751  * given to md via md_attach
752  */
753 /*ARGSUSED*/
754 static int
755 mdinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
756 {
757 	int		error = DDI_FAILURE;
758 
759 	switch (infocmd) {
760 	case DDI_INFO_DEVT2DEVINFO:
761 		if (md_devinfo) {
762 			*result = (void *)md_devinfo;
763 			error = DDI_SUCCESS;
764 		}
765 		break;
766 
767 	case DDI_INFO_DEVT2INSTANCE:
768 		*result = (void *)0;
769 		error = DDI_SUCCESS;
770 		break;
771 	}
772 	return (error);
773 }
774 
775 /*
776  * property operation routine.  return the number of blocks for the partition
777  * in question or forward the request to the property facilities.
778  */
779 static int
780 mdprop_op(
781 	dev_t dev,		/* device number associated with device */
782 	dev_info_t *dip,	/* device info struct for this device */
783 	ddi_prop_op_t prop_op,	/* property operator */
784 	int mod_flags,		/* property flags */
785 	char *name,		/* name of property */
786 	caddr_t valuep,		/* where to put property value */
787 	int *lengthp)		/* put length of property here */
788 {
789 	minor_t		mnum;
790 	set_t		setno;
791 	md_unit_t	*un;
792 	mdi_unit_t	*ui;
793 	uint64_t	nblocks64;
794 
795 	/*
796 	 * Our dynamic properties are all device specific and size oriented.
797 	 * Requests issued under conditions where size is valid are passed
798 	 * to ddi_prop_op_nblocks with the size information, otherwise the
799 	 * request is passed to ddi_prop_op. Make sure that the minor device
800 	 * is a valid part of the Virtual Disk subsystem.
801 	 */
802 	mnum = getminor(dev);
803 	setno = MD_MIN2SET(mnum);
804 	if ((dev == DDI_DEV_T_ANY) || (mnum == MD_ADM_MINOR) ||
805 	    (setno >= md_nsets) || (MD_MIN2UNIT(mnum) >= md_nunits)) {
806 pass:		return (ddi_prop_op(dev, dip, prop_op, mod_flags,
807 		    name, valuep, lengthp));
808 	} else {
809 		rw_enter(&md_unit_array_rw.lock, RW_READER);
810 		if (((md_get_setstatus(setno) & MD_SET_SNARFED) == 0) ||
811 		    ((ui = MDI_UNIT(mnum)) == NULL)) {
812 			rw_exit(&md_unit_array_rw.lock);
813 			goto pass;
814 		}
815 
816 		/* get nblocks value */
817 		un = (md_unit_t *)md_unit_readerlock(ui);
818 		nblocks64 = un->c.un_total_blocks;
819 		md_unit_readerexit(ui);
820 		rw_exit(&md_unit_array_rw.lock);
821 
822 		return (ddi_prop_op_nblocks(dev, dip, prop_op, mod_flags,
823 		    name, valuep, lengthp, nblocks64));
824 	}
825 
826 }
827 
828 static void
829 snarf_user_data(set_t setno)
830 {
831 	mddb_recid_t		recid;
832 	mddb_recstatus_t	status;
833 
834 	recid = mddb_makerecid(setno, 0);
835 	while ((recid = mddb_getnextrec(recid, MDDB_USER, 0)) > 0) {
836 		if (mddb_getrecprivate(recid) & MD_PRV_GOTIT)
837 			continue;
838 
839 		status = mddb_getrecstatus(recid);
840 		if (status == MDDB_STALE)
841 			continue;
842 
843 		if (status == MDDB_NODATA) {
844 			mddb_setrecprivate(recid, MD_PRV_PENDDEL);
845 			continue;
846 		}
847 
848 		ASSERT(status == MDDB_OK);
849 
850 		mddb_setrecprivate(recid, MD_PRV_GOTIT);
851 	}
852 }
853 
854 static void
855 md_print_block_usage(mddb_set_t *s, uint_t blks)
856 {
857 	uint_t		ib;
858 	int		li;
859 	mddb_mb_ic_t	*mbip;
860 	uint_t		max_blk_needed;
861 	mddb_lb_t	*lbp;
862 	mddb_sidelocator_t	*slp;
863 	int		drv_index;
864 	md_splitname	sn;
865 	char		*name;
866 	char		*suffix;
867 	size_t		prefixlen;
868 	size_t		suffixlen;
869 	int		alloc_sz;
870 
871 
872 	max_blk_needed = s->s_totalblkcnt - s->s_freeblkcnt + blks;
873 
874 
875 	cmn_err(CE_WARN, "Blocks in Metadevice State Database: %d\n"
876 		"            Additional Blocks Needed:            %d\n\n"
877 		"            Increase size of following replicas for\n"
878 		"            device relocatability by deleting listed\n"
879 		"            replica and re-adding replica with\n"
880 		"            increased size (see metadb(1M)):\n"
881 		"                Replica                   Increase By",
882 		s->s_totalblkcnt, (blks - s->s_freeblkcnt));
883 
884 	lbp = s->s_lbp;
885 
886 	for (li = 0; li < lbp->lb_loccnt; li++) {
887 		if (lbp->lb_locators[li].l_flags & MDDB_F_DELETED)
888 			continue;
889 		ib = 0;
890 		for (mbip = s->s_mbiarray[li]; mbip != NULL;
891 		    mbip = mbip->mbi_next) {
892 			ib += (uint_t)mbip->mbi_mddb_mb.mb_blkcnt;
893 		}
894 		if (ib == 0)
895 			continue;
896 		if (ib < max_blk_needed) {
897 			slp = &lbp->lb_sidelocators[s->s_sideno][li];
898 			drv_index = slp->l_drvnm_index;
899 			mddb_locatorblock2splitname(s->s_lnp, li, s->s_sideno,
900 				&sn);
901 			prefixlen = SPN_PREFIX(&sn).pre_len;
902 			suffixlen = SPN_SUFFIX(&sn).suf_len;
903 			alloc_sz = (int)(prefixlen + suffixlen + 2);
904 			name = (char *)kmem_alloc(alloc_sz, KM_SLEEP);
905 			(void) strncpy(name, SPN_PREFIX(&sn).pre_data,
906 			    prefixlen);
907 			name[prefixlen] = '/';
908 			suffix = name + (prefixlen + 1);
909 			(void) strncpy(suffix, SPN_SUFFIX(&sn).suf_data,
910 			    suffixlen);
911 			name[prefixlen + suffixlen + 1] = '\0';
912 			cmn_err(CE_WARN,
913 				"  %s (%s:%d:%d)   %d blocks",
914 				name, lbp->lb_drvnm[drv_index].dn_data,
915 				slp->l_mnum, lbp->lb_locators[li].l_blkno,
916 				(max_blk_needed - ib));
917 			kmem_free(name, alloc_sz);
918 		}
919 	}
920 }
921 
922 /*
923  * md_create_minor_node:
924  *	Create the minor device for the given set and un_self_id.
925  *
926  * Input:
927  *	setno	- set number
928  *	mnum	- selfID of unit
929  *
930  * Output:
931  *	None.
932  *
933  * Returns 0 for success, 1 for failure.
934  *
935  * Side-effects:
936  *	None.
937  */
938 int
939 md_create_minor_node(set_t setno, minor_t mnum)
940 {
941 	char		name[20];
942 
943 	/* Check for valid arguments */
944 	if (setno >= MD_MAXSETS || MD_MIN2UNIT(mnum) >= MD_MAXUNITS)
945 		return (1);
946 
947 	(void) snprintf(name, 20, "%u,%u,blk",
948 		(unsigned)setno, (unsigned)MD_MIN2UNIT(mnum));
949 
950 	if (ddi_create_minor_node(md_devinfo, name, S_IFBLK,
951 	    MD_MKMIN(setno, mnum), DDI_PSEUDO, 0))
952 		return (1);
953 
954 	(void) snprintf(name, 20, "%u,%u,raw",
955 		(unsigned)setno, (unsigned)MD_MIN2UNIT(mnum));
956 
957 	if (ddi_create_minor_node(md_devinfo, name, S_IFCHR,
958 	    MD_MKMIN(setno, mnum), DDI_PSEUDO, 0))
959 		return (1);
960 
961 	return (0);
962 }
963 
964 /*
965  * For a given key check if it is an orphaned record.
966  * The following conditions are used to determine an orphan.
967  * 1. The device associated with that key is not a metadevice.
968  * 2. If DEVID_STYLE then the physical device does not have a device Id
969  * associated with it.
970  *
971  * If a key does not have an entry in the devid namespace it could be
972  * a device that does not support device ids. Hence the record is not
973  * deleted.
974  */
975 
976 static int
977 md_verify_orphaned_record(set_t setno, mdkey_t key)
978 {
979 	md_dev64_t	odev; /* orphaned dev */
980 	mddb_set_t	*s;
981 	side_t		side = 0;
982 	struct nm_next_hdr	*did_nh = NULL;
983 
984 	s = (mddb_set_t *)md_set[setno].s_db;
985 	if ((did_nh = get_first_record(setno, 1,  (NM_DEVID | NM_NOTSHARED)))
986 	    == NULL)
987 		return (0);
988 	/*
989 	 * If devid style is set then get the dev_t using MD_NOTRUST_DEVT
990 	 */
991 	if (s->s_lbp->lb_flags & MDDB_DEVID_STYLE) {
992 		odev = md_getdevnum(setno, side, key, MD_NOTRUST_DEVT);
993 		if ((odev == NODEV64) || (md_getmajor(odev) == md_major))
994 			return (0);
995 		if (lookup_entry(did_nh, setno, side, key, odev, NM_DEVID) ==
996 									NULL)
997 			return (1);
998 	}
999 	return (0);
1000 }
1001 
1002 int
1003 md_snarf_db_set(set_t setno, md_error_t *ep)
1004 {
1005 	int			err = 0;
1006 	int			i;
1007 	mddb_recid_t		recid;
1008 	mddb_type_t		drvrid;
1009 	mddb_recstatus_t	status;
1010 	md_ops_t		*ops;
1011 	uint_t			privat;
1012 	mddb_set_t		*s;
1013 	uint_t			cvt_blks;
1014 	struct nm_next_hdr	*nh;
1015 	mdkey_t			key = MD_KEYWILD;
1016 	side_t			side = 0;
1017 	int			size;
1018 	int			devid_flag;
1019 	int			retval;
1020 	uint_t			un;
1021 	int			un_next_set = 0;
1022 
1023 	md_haltsnarf_enter(setno);
1024 
1025 	mutex_enter(&md_mx);
1026 	if (md_set[setno].s_status & MD_SET_SNARFED) {
1027 		mutex_exit(&md_mx);
1028 		md_haltsnarf_exit(setno);
1029 		return (0);
1030 	}
1031 	mutex_exit(&md_mx);
1032 
1033 	if (! (md_get_status() & MD_GBL_DAEMONS_LIVE)) {
1034 		if (md_start_daemons(TRUE)) {
1035 			if (ep != NULL)
1036 				(void) mdsyserror(ep, ENXIO);
1037 			err = -1;
1038 			goto out;
1039 		}
1040 	}
1041 
1042 
1043 	/*
1044 	 * Load the devid name space if it exists
1045 	 */
1046 	(void) md_load_namespace(setno, NULL, NM_DEVID);
1047 	if (!md_load_namespace(setno, ep, 0L)) {
1048 		/*
1049 		 * Unload the devid namespace
1050 		 */
1051 		(void) md_unload_namespace(setno, NM_DEVID);
1052 		err = -1;
1053 		goto out;
1054 	}
1055 
1056 	/*
1057 	 * If replica is in non-devid state, convert if:
1058 	 * 	- not in probe during upgrade (md_keep_repl_state = 0)
1059 	 * 	- enough space available in replica
1060 	 *	- local set
1061 	 *	- not a multi-node diskset
1062 	 *	- clustering is not present (for non-local set)
1063 	 */
1064 	s = (mddb_set_t *)md_set[setno].s_db;
1065 	devid_flag = 0;
1066 	if (!(s->s_lbp->lb_flags & MDDB_DEVID_STYLE) && !md_keep_repl_state)
1067 		devid_flag = 1;
1068 	if (cluster_bootflags & CLUSTER_CONFIGURED)
1069 		if (setno != MD_LOCAL_SET)
1070 			devid_flag = 0;
1071 	if (MD_MNSET_SETNO(setno))
1072 		devid_flag = 0;
1073 	if ((md_devid_destroy == 1) && (md_keep_repl_state == 1))
1074 		devid_flag = 0;
1075 
1076 	/*
1077 	 * if we weren't devid style before and md_keep_repl_state=1
1078 	 * we need to stay non-devid
1079 	 */
1080 	if ((md_keep_repl_state == 1) &&
1081 	    ((s->s_lbp->lb_flags & MDDB_DEVID_STYLE) == 0))
1082 		devid_flag = 0;
1083 	if (devid_flag) {
1084 		/*
1085 		 * Determine number of free blocks needed to convert
1086 		 * entire replica to device id format - locator blocks
1087 		 * and namespace.
1088 		 */
1089 		cvt_blks = 0;
1090 		if (mddb_lb_did_convert(s, 0, &cvt_blks) != 0) {
1091 			if (ep != NULL)
1092 				(void) mdsyserror(ep, EIO);
1093 			err = -1;
1094 			goto out;
1095 
1096 		}
1097 		cvt_blks += md_nm_did_chkspace(setno);
1098 
1099 		/* add MDDB_DEVID_CONV_PERC% */
1100 		if ((md_conv_perc > 0) && (md_conv_perc <= 100)) {
1101 			cvt_blks = cvt_blks * (100 + md_conv_perc) / 100;
1102 		}
1103 
1104 		if (cvt_blks <= s->s_freeblkcnt) {
1105 			if (mddb_lb_did_convert(s, 1, &cvt_blks) != 0) {
1106 				if (ep != NULL)
1107 					(void) mdsyserror(ep, EIO);
1108 				err = -1;
1109 				goto out;
1110 			}
1111 
1112 		} else {
1113 			/*
1114 			 * Print message that replica can't be converted for
1115 			 * lack of space.   No failure - just continue to
1116 			 * run without device ids.
1117 			 */
1118 			cmn_err(CE_WARN,
1119 			    "Unable to add Solaris Volume Manager device "
1120 			    "relocation data.\n"
1121 			    "          To use device relocation feature:\n"
1122 			    "          - Increase size of listed replicas\n"
1123 			    "          - Reboot");
1124 			md_print_block_usage(s, cvt_blks);
1125 			cmn_err(CE_WARN,
1126 			    "Loading set without device relocation data.\n"
1127 			    "          Solaris Volume Manager disk movement "
1128 			    "not tracked in local set.");
1129 		}
1130 	}
1131 
1132 	/*
1133 	 * go through and load any modules referenced in
1134 	 * data base
1135 	 */
1136 	recid = mddb_makerecid(setno, 0);
1137 	while ((recid = mddb_getnextrec(recid, MDDB_ALL, 0)) > 0) {
1138 		status = mddb_getrecstatus(recid);
1139 		if (status == MDDB_STALE) {
1140 			if (! (md_get_setstatus(setno) & MD_SET_STALE)) {
1141 				md_set_setstatus(setno, MD_SET_STALE);
1142 				cmn_err(CE_WARN,
1143 				    "md: state database is stale");
1144 			}
1145 		} else if (status == MDDB_NODATA) {
1146 			mddb_setrecprivate(recid, MD_PRV_PENDDEL);
1147 			continue;
1148 		}
1149 		drvrid = mddb_getrectype1(recid);
1150 		if (drvrid < MDDB_FIRST_MODID)
1151 			continue;
1152 		if (md_loadsubmod(setno, md_getshared_name(setno, drvrid),
1153 		    drvrid) < 0) {
1154 			cmn_err(CE_NOTE, "md: could not load misc/%s",
1155 				md_getshared_name(setno, drvrid));
1156 		}
1157 	}
1158 
1159 	if (recid < 0)
1160 		goto out;
1161 
1162 	snarf_user_data(setno);
1163 
1164 	/*
1165 	 * Initialize the md_nm_snarfed array
1166 	 * this array is indexed by the key and
1167 	 * is set by md_getdevnum during the snarf time
1168 	 */
1169 	if ((nh = get_first_record(setno, 0, NM_NOTSHARED)) != NULL) {
1170 		size = (int)((((struct nm_rec_hdr *)nh->nmn_record)->
1171 		    r_next_key) * (sizeof (int)));
1172 		md_nm_snarfed = (int *)kmem_zalloc(size, KM_SLEEP);
1173 	}
1174 
1175 	/*
1176 	 * go through and snarf until nothing gets added
1177 	 */
1178 	do {
1179 		i = 0;
1180 		for (ops = md_opslist; ops != NULL; ops = ops->md_next) {
1181 			if (ops->md_snarf != NULL) {
1182 				retval = ops->md_snarf(MD_SNARF_DOIT, setno);
1183 				if (retval == -1) {
1184 					err = -1;
1185 					/* Don't know the failed unit */
1186 					(void) mdmderror(ep, MDE_RR_ALLOC_ERROR,
1187 					    0);
1188 					(void) md_halt_set(setno, MD_HALT_ALL);
1189 					(void) mddb_unload_set(setno);
1190 					md_haltsnarf_exit(setno);
1191 					return (err);
1192 				} else {
1193 					i += retval;
1194 				}
1195 			}
1196 		}
1197 	} while (i);
1198 
1199 	/*
1200 	 * Set the first available slot and availability
1201 	 */
1202 	md_set[setno].s_un_avail = 0;
1203 	for (un = 0; un < MD_MAXUNITS; un++) {
1204 		if (md_set[setno].s_un[un] != NULL) {
1205 			continue;
1206 		} else {
1207 			if (!un_next_set) {
1208 				md_set[setno].s_un_next = un;
1209 				un_next_set = 1;
1210 			}
1211 			md_set[setno].s_un_avail++;
1212 		}
1213 	}
1214 
1215 	md_set_setstatus(setno, MD_SET_SNARFED);
1216 
1217 	recid = mddb_makerecid(setno, 0);
1218 	while ((recid = mddb_getnextrec(recid, MDDB_ALL, 0)) > 0) {
1219 		privat = mddb_getrecprivate(recid);
1220 		if (privat & MD_PRV_COMMIT) {
1221 			if (mddb_commitrec(recid)) {
1222 				if (!(md_get_setstatus(setno) & MD_SET_STALE)) {
1223 					md_set_setstatus(setno, MD_SET_STALE);
1224 					cmn_err(CE_WARN,
1225 					    "md: state database is stale");
1226 				}
1227 			}
1228 			mddb_setrecprivate(recid, MD_PRV_GOTIT);
1229 		}
1230 	}
1231 
1232 	/* Deletes must happen after all the commits */
1233 	recid = mddb_makerecid(setno, 0);
1234 	while ((recid = mddb_getnextrec(recid, MDDB_ALL, 0)) > 0) {
1235 		privat = mddb_getrecprivate(recid);
1236 		if (privat & MD_PRV_DELETE) {
1237 			if (mddb_deleterec(recid)) {
1238 				if (!(md_get_setstatus(setno) & MD_SET_STALE)) {
1239 					md_set_setstatus(setno, MD_SET_STALE);
1240 					cmn_err(CE_WARN,
1241 					    "md: state database is stale");
1242 				}
1243 				mddb_setrecprivate(recid, MD_PRV_GOTIT);
1244 			}
1245 			recid = mddb_makerecid(setno, 0);
1246 		}
1247 	}
1248 
1249 	/*
1250 	 * go through and clean up records until nothing gets cleaned up.
1251 	 */
1252 	do {
1253 		i = 0;
1254 		for (ops = md_opslist; ops != NULL; ops = ops->md_next)
1255 			if (ops->md_snarf != NULL)
1256 				i += ops->md_snarf(MD_SNARF_CLEANUP, setno);
1257 	} while (i);
1258 
1259 	if (md_nm_snarfed != NULL &&
1260 	    !(md_get_setstatus(setno) & MD_SET_STALE)) {
1261 		/*
1262 		 * go thru and cleanup the namespace and the device id
1263 		 * name space
1264 		 */
1265 		for (key = 1;
1266 		    key < ((struct nm_rec_hdr *)nh->nmn_record)->r_next_key;
1267 		    key++) {
1268 			/*
1269 			 * Is the entry an 'orphan'?
1270 			 */
1271 			if (lookup_entry(nh, setno, side, key, NODEV64, 0L) !=
1272 			    NULL) {
1273 				/*
1274 				 * If the value is not set then apparently
1275 				 * it is not part of the current configuration,
1276 				 * remove it this can happen when system panic
1277 				 * between the primary name space update and
1278 				 * the device id name space update
1279 				 */
1280 				if (md_nm_snarfed[key] == 0) {
1281 					if (md_verify_orphaned_record(setno,
1282 					    key) == 1)
1283 						(void) remove_entry(nh,
1284 						    side, key, 0L);
1285 				}
1286 			}
1287 		}
1288 	}
1289 
1290 	if (md_nm_snarfed != NULL) {
1291 		/*
1292 		 * Done and free the memory
1293 		 */
1294 		kmem_free(md_nm_snarfed, size);
1295 		md_nm_snarfed = NULL;
1296 	}
1297 
1298 	if (s->s_lbp->lb_flags & MDDB_DEVID_STYLE &&
1299 	    !(md_get_setstatus(setno) & MD_SET_STALE)) {
1300 		/*
1301 		 * if the destroy flag has been set and
1302 		 * the MD_SET_DIDCLUP bit is not set in
1303 		 * the set's status field, cleanup the
1304 		 * entire device id namespace
1305 		 */
1306 		if (md_devid_destroy &&
1307 		    !(md_get_setstatus(setno) & MD_SET_DIDCLUP)) {
1308 			(void) md_devid_cleanup(setno, 1);
1309 			md_set_setstatus(setno, MD_SET_DIDCLUP);
1310 		} else
1311 			(void) md_devid_cleanup(setno, 0);
1312 	}
1313 
1314 	/*
1315 	 * clear single threading on snarf, return success or error
1316 	 */
1317 out:
1318 	md_haltsnarf_exit(setno);
1319 	return (err);
1320 }
1321 
1322 void
1323 get_minfo(struct dk_minfo *info, minor_t mnum)
1324 {
1325 	md_unit_t	*un;
1326 	mdi_unit_t	*ui;
1327 
1328 	info->dki_capacity = 0;
1329 	info->dki_lbsize = 0;
1330 	info->dki_media_type = 0;
1331 
1332 	if ((ui = MDI_UNIT(mnum)) == NULL) {
1333 		return;
1334 	}
1335 	un = (md_unit_t *)md_unit_readerlock(ui);
1336 	info->dki_capacity = un->c.un_total_blocks;
1337 	md_unit_readerexit(ui);
1338 	info->dki_lbsize = DEV_BSIZE;
1339 	info->dki_media_type = DK_UNKNOWN;
1340 }
1341 
1342 
1343 void
1344 get_info(struct dk_cinfo *info, minor_t mnum)
1345 {
1346 	/*
1347 	 * Controller Information
1348 	 */
1349 	info->dki_ctype = DKC_MD;
1350 	info->dki_cnum = ddi_get_instance(ddi_get_parent(md_devinfo));
1351 	(void) strcpy(info->dki_cname,
1352 	    ddi_get_name(ddi_get_parent(md_devinfo)));
1353 	/*
1354 	 * Unit Information
1355 	 */
1356 	info->dki_unit = mnum;
1357 	info->dki_slave = 0;
1358 	(void) strcpy(info->dki_dname, ddi_driver_name(md_devinfo));
1359 	info->dki_flags = 0;
1360 	info->dki_partition = 0;
1361 	info->dki_maxtransfer = (ushort_t)(md_maxphys / DEV_BSIZE);
1362 
1363 	/*
1364 	 * We can't get from here to there yet
1365 	 */
1366 	info->dki_addr = 0;
1367 	info->dki_space = 0;
1368 	info->dki_prio = 0;
1369 	info->dki_vec = 0;
1370 }
1371 
1372 /*
1373  * open admin device
1374  */
1375 static int
1376 mdadminopen(
1377 	int	flag,
1378 	int	otyp)
1379 {
1380 	int	err = 0;
1381 
1382 	/* single thread */
1383 	mutex_enter(&md_mx);
1384 
1385 	/* check type and flags */
1386 	if ((otyp != OTYP_CHR) && (otyp != OTYP_LYR)) {
1387 		err = EINVAL;
1388 		goto out;
1389 	}
1390 	if (((flag & FEXCL) && (md_status & MD_GBL_OPEN)) ||
1391 	    (md_status & MD_GBL_EXCL)) {
1392 		err = EBUSY;
1393 		goto out;
1394 	}
1395 
1396 	/* count and flag open */
1397 	md_ocnt[otyp]++;
1398 	md_status |= MD_GBL_OPEN;
1399 	if (flag & FEXCL)
1400 		md_status |= MD_GBL_EXCL;
1401 
1402 	/* unlock return success */
1403 out:
1404 	mutex_exit(&md_mx);
1405 	return (err);
1406 }
1407 
1408 /*
1409  * open entry point
1410  */
1411 static int
1412 mdopen(
1413 	dev_t		*dev,
1414 	int		flag,
1415 	int		otyp,
1416 	cred_t		*cred_p)
1417 {
1418 	minor_t		mnum = getminor(*dev);
1419 	unit_t		unit = MD_MIN2UNIT(mnum);
1420 	set_t		setno = MD_MIN2SET(mnum);
1421 	mdi_unit_t	*ui = NULL;
1422 	int		err = 0;
1423 	md_parent_t	parent;
1424 
1425 	/* dispatch admin device opens */
1426 	if (mnum == MD_ADM_MINOR)
1427 		return (mdadminopen(flag, otyp));
1428 
1429 	/* lock, check status */
1430 	rw_enter(&md_unit_array_rw.lock, RW_READER);
1431 
1432 tryagain:
1433 	if (md_get_status() & MD_GBL_HALTED)  {
1434 		err = ENODEV;
1435 		goto out;
1436 	}
1437 
1438 	/* check minor */
1439 	if ((setno >= md_nsets) || (unit >= md_nunits)) {
1440 		err = ENXIO;
1441 		goto out;
1442 	}
1443 
1444 	/* make sure we're snarfed */
1445 	if ((md_get_setstatus(MD_LOCAL_SET) & MD_SET_SNARFED) == 0) {
1446 		if (md_snarf_db_set(MD_LOCAL_SET, NULL) != 0) {
1447 			err = ENODEV;
1448 			goto out;
1449 		}
1450 	}
1451 	if ((md_get_setstatus(setno) & MD_SET_SNARFED) == 0) {
1452 		err = ENODEV;
1453 		goto out;
1454 	}
1455 
1456 	/* check unit */
1457 	if ((ui = MDI_UNIT(mnum)) == NULL) {
1458 		err = ENXIO;
1459 		goto out;
1460 	}
1461 
1462 	/*
1463 	 * The softpart open routine may do an I/O during the open, in
1464 	 * which case the open routine will set the OPENINPROGRESS flag
1465 	 * and drop all locks during the I/O.  If this thread sees
1466 	 * the OPENINPROGRESS flag set, if should wait until the flag
1467 	 * is reset before calling the driver's open routine.  It must
1468 	 * also revalidate the world after it grabs the unit_array lock
1469 	 * since the set may have been released or the metadevice cleared
1470 	 * during the sleep.
1471 	 */
1472 	if (MD_MNSET_SETNO(setno)) {
1473 		mutex_enter(&ui->ui_mx);
1474 		if (ui->ui_lock & MD_UL_OPENINPROGRESS) {
1475 			rw_exit(&md_unit_array_rw.lock);
1476 			cv_wait(&ui->ui_cv, &ui->ui_mx);
1477 			rw_enter(&md_unit_array_rw.lock, RW_READER);
1478 			mutex_exit(&ui->ui_mx);
1479 			goto tryagain;
1480 		}
1481 		mutex_exit(&ui->ui_mx);
1482 	}
1483 
1484 	/* Test if device is openable */
1485 	if ((ui->ui_tstate & MD_NOTOPENABLE) != 0) {
1486 		err = ENXIO;
1487 		goto out;
1488 	}
1489 
1490 	/* don't allow opens w/WRITE flag if stale */
1491 	if ((flag & FWRITE) && (md_get_setstatus(setno) & MD_SET_STALE)) {
1492 		err = EROFS;
1493 		goto out;
1494 	}
1495 
1496 	/* don't allow writes to subdevices */
1497 	parent = md_get_parent(md_expldev(*dev));
1498 	if ((flag & FWRITE) && MD_HAS_PARENT(parent)) {
1499 		err = EROFS;
1500 		goto out;
1501 	}
1502 
1503 	/* open underlying driver */
1504 	if (md_ops[ui->ui_opsindex]->md_open != NULL) {
1505 		if ((err = (*md_ops[ui->ui_opsindex]->md_open)
1506 		    (dev, flag, otyp, cred_p, 0)) != 0)
1507 			goto out;
1508 	}
1509 
1510 	/* or do it ourselves */
1511 	else {
1512 		/* single thread */
1513 		(void) md_unit_openclose_enter(ui);
1514 		err = md_unit_incopen(mnum, flag, otyp);
1515 		md_unit_openclose_exit(ui);
1516 		if (err != 0)
1517 			goto out;
1518 	}
1519 
1520 	/* unlock, return status */
1521 out:
1522 	rw_exit(&md_unit_array_rw.lock);
1523 	return (err);
1524 }
1525 
1526 /*
1527  * close admin device
1528  */
1529 static int
1530 mdadminclose(
1531 	int	otyp)
1532 {
1533 	int	i;
1534 	int	err = 0;
1535 
1536 	/* single thread */
1537 	mutex_enter(&md_mx);
1538 
1539 	/* check type and flags */
1540 	if ((otyp < 0) || (otyp >= OTYPCNT)) {
1541 		err = EINVAL;
1542 		goto out;
1543 	} else if (md_ocnt[otyp] == 0) {
1544 		err = ENXIO;
1545 		goto out;
1546 	}
1547 
1548 	/* count and flag closed */
1549 	if (otyp == OTYP_LYR)
1550 		md_ocnt[otyp]--;
1551 	else
1552 		md_ocnt[otyp] = 0;
1553 	md_status &= ~MD_GBL_OPEN;
1554 	for (i = 0; (i < OTYPCNT); ++i)
1555 		if (md_ocnt[i] != 0)
1556 			md_status |= MD_GBL_OPEN;
1557 	if (! (md_status & MD_GBL_OPEN))
1558 		md_status &= ~MD_GBL_EXCL;
1559 
1560 	/* unlock return success */
1561 out:
1562 	mutex_exit(&md_mx);
1563 	return (err);
1564 }
1565 
1566 /*
1567  * close entry point
1568  */
1569 static int
1570 mdclose(
1571 	dev_t		dev,
1572 	int		flag,
1573 	int		otyp,
1574 	cred_t		*cred_p)
1575 {
1576 	minor_t		mnum = getminor(dev);
1577 	set_t		setno = MD_MIN2SET(mnum);
1578 	unit_t		unit = MD_MIN2UNIT(mnum);
1579 	mdi_unit_t	*ui = NULL;
1580 	int		err = 0;
1581 
1582 	/* dispatch admin device closes */
1583 	if (mnum == MD_ADM_MINOR)
1584 		return (mdadminclose(otyp));
1585 
1586 	/* check minor */
1587 	if ((setno >= md_nsets) || (unit >= md_nunits) ||
1588 	    ((ui = MDI_UNIT(mnum)) == NULL)) {
1589 		err = ENXIO;
1590 		goto out;
1591 	}
1592 
1593 	/* close underlying driver */
1594 	if (md_ops[ui->ui_opsindex]->md_close != NULL) {
1595 		if ((err = (*md_ops[ui->ui_opsindex]->md_close)
1596 		    (dev, flag, otyp, cred_p, 0)) != 0)
1597 			goto out;
1598 	}
1599 
1600 	/* or do it ourselves */
1601 	else {
1602 		/* single thread */
1603 		(void) md_unit_openclose_enter(ui);
1604 		err = md_unit_decopen(mnum, otyp);
1605 		md_unit_openclose_exit(ui);
1606 		if (err != 0)
1607 			goto out;
1608 	}
1609 
1610 	/* return success */
1611 out:
1612 	return (err);
1613 }
1614 
1615 
1616 /*
1617  * This routine performs raw read operations.  It is called from the
1618  * device switch at normal priority.
1619  *
1620  * The main catch is that the *uio struct which is passed to us may
1621  * specify a read which spans two buffers, which would be contiguous
1622  * on a single partition,  but not on a striped partition. This will
1623  * be handled by mdstrategy.
1624  */
1625 /*ARGSUSED*/
1626 static int
1627 mdread(dev_t dev, struct uio *uio, cred_t *credp)
1628 {
1629 	minor_t		mnum;
1630 	mdi_unit_t	*ui;
1631 	int		error;
1632 
1633 	if (((mnum = getminor(dev)) == MD_ADM_MINOR) ||
1634 	    (MD_MIN2SET(mnum) >= md_nsets) ||
1635 	    (MD_MIN2UNIT(mnum) >= md_nunits) ||
1636 	    ((ui = MDI_UNIT(mnum)) == NULL))
1637 		return (ENXIO);
1638 
1639 	if (md_ops[ui->ui_opsindex]->md_read  != NULL)
1640 		return ((*md_ops[ui->ui_opsindex]->md_read)
1641 		    (dev, uio, credp));
1642 
1643 	if ((error = md_chk_uio(uio)) != 0)
1644 		return (error);
1645 
1646 	return (physio(mdstrategy, NULL, dev, B_READ, md_minphys, uio));
1647 }
1648 
1649 /*
1650  * This routine performs async raw read operations.  It is called from the
1651  * device switch at normal priority.
1652  *
1653  * The main catch is that the *aio struct which is passed to us may
1654  * specify a read which spans two buffers, which would be contiguous
1655  * on a single partition,  but not on a striped partition. This will
1656  * be handled by mdstrategy.
1657  */
1658 /*ARGSUSED*/
1659 static int
1660 mdaread(dev_t dev, struct aio_req *aio, cred_t *credp)
1661 {
1662 	minor_t		mnum;
1663 	mdi_unit_t	*ui;
1664 	int		error;
1665 
1666 
1667 	if (((mnum = getminor(dev)) == MD_ADM_MINOR) ||
1668 	    (MD_MIN2SET(mnum) >= md_nsets) ||
1669 	    (MD_MIN2UNIT(mnum) >= md_nunits) ||
1670 	    ((ui = MDI_UNIT(mnum)) == NULL))
1671 		return (ENXIO);
1672 
1673 	if (md_ops[ui->ui_opsindex]->md_aread  != NULL)
1674 		return ((*md_ops[ui->ui_opsindex]->md_aread)
1675 		    (dev, aio, credp));
1676 
1677 	if ((error = md_chk_uio(aio->aio_uio)) != 0)
1678 		return (error);
1679 
1680 	return (aphysio(mdstrategy, anocancel, dev, B_READ, md_minphys, aio));
1681 }
1682 
1683 /*
1684  * This routine performs raw write operations.	It is called from the
1685  * device switch at normal priority.
1686  *
1687  * The main catch is that the *uio struct which is passed to us may
1688  * specify a write which spans two buffers, which would be contiguous
1689  * on a single partition,  but not on a striped partition. This is
1690  * handled by mdstrategy.
1691  *
1692  */
1693 /*ARGSUSED*/
1694 static int
1695 mdwrite(dev_t dev, struct uio *uio, cred_t *credp)
1696 {
1697 	minor_t		mnum;
1698 	mdi_unit_t	*ui;
1699 	int		error;
1700 
1701 	if (((mnum = getminor(dev)) == MD_ADM_MINOR) ||
1702 	    (MD_MIN2SET(mnum) >= md_nsets) ||
1703 	    (MD_MIN2UNIT(mnum) >= md_nunits) ||
1704 	    ((ui = MDI_UNIT(mnum)) == NULL))
1705 		return (ENXIO);
1706 
1707 	if (md_ops[ui->ui_opsindex]->md_write  != NULL)
1708 		return ((*md_ops[ui->ui_opsindex]->md_write)
1709 		    (dev, uio, credp));
1710 
1711 	if ((error = md_chk_uio(uio)) != 0)
1712 		return (error);
1713 
1714 	return (physio(mdstrategy, NULL, dev, B_WRITE, md_minphys, uio));
1715 }
1716 
1717 /*
1718  * This routine performs async raw write operations.  It is called from the
1719  * device switch at normal priority.
1720  *
1721  * The main catch is that the *aio struct which is passed to us may
1722  * specify a write which spans two buffers, which would be contiguous
1723  * on a single partition,  but not on a striped partition. This is
1724  * handled by mdstrategy.
1725  *
1726  */
1727 /*ARGSUSED*/
1728 static int
1729 mdawrite(dev_t dev, struct aio_req *aio, cred_t *credp)
1730 {
1731 	minor_t		mnum;
1732 	mdi_unit_t	*ui;
1733 	int		error;
1734 
1735 
1736 	if (((mnum = getminor(dev)) == MD_ADM_MINOR) ||
1737 	    (MD_MIN2SET(mnum) >= md_nsets) ||
1738 	    (MD_MIN2UNIT(mnum) >= md_nunits) ||
1739 	    ((ui = MDI_UNIT(mnum)) == NULL))
1740 		return (ENXIO);
1741 
1742 	if (md_ops[ui->ui_opsindex]->md_awrite  != NULL)
1743 		return ((*md_ops[ui->ui_opsindex]->md_awrite)
1744 		    (dev, aio, credp));
1745 
1746 	if ((error = md_chk_uio(aio->aio_uio)) != 0)
1747 		return (error);
1748 
1749 	return (aphysio(mdstrategy, anocancel, dev, B_WRITE, md_minphys, aio));
1750 }
1751 
1752 int
1753 mdstrategy(struct buf *bp)
1754 {
1755 	minor_t		mnum;
1756 	mdi_unit_t	*ui;
1757 
1758 	ASSERT((bp->b_flags & B_DONE) == 0);
1759 
1760 	if (panicstr)
1761 		md_clr_status(MD_GBL_DAEMONS_LIVE);
1762 
1763 	if (((mnum = getminor(bp->b_edev)) == MD_ADM_MINOR) ||
1764 	    (MD_MIN2SET(mnum) >= md_nsets) ||
1765 	    (MD_MIN2UNIT(mnum) >= md_nunits) ||
1766 	    ((ui = MDI_UNIT(mnum)) == NULL)) {
1767 		bp->b_flags |= B_ERROR;
1768 		bp->b_error = ENXIO;
1769 		bp->b_resid = bp->b_bcount;
1770 		biodone(bp);
1771 		return (0);
1772 	}
1773 
1774 	bp->b_flags &= ~(B_ERROR | B_DONE);
1775 	if (md_ops[ui->ui_opsindex]->md_strategy  != NULL) {
1776 		(*md_ops[ui->ui_opsindex]->md_strategy) (bp, 0, NULL);
1777 	} else {
1778 		(void) errdone(ui, bp, ENXIO);
1779 	}
1780 	return (0);
1781 }
1782 
1783 /*
1784  * Return true if the ioctl is allowed to be multithreaded.
1785  * All the ioctls with MN are sent only from the message handlers through
1786  * rpc.mdcommd, which (via it's own locking mechanism) takes care that not two
1787  * ioctl for the same metadevice are issued at the same time.
1788  * So we are safe here.
1789  * The other ioctls do not mess with any metadevice structures and therefor
1790  * are harmless too, if called multiple times at the same time.
1791  */
1792 static boolean_t
1793 is_mt_ioctl(int cmd) {
1794 
1795 	switch (cmd) {
1796 	case MD_IOCGUNIQMSGID:
1797 	case MD_IOCGVERSION:
1798 	case MD_IOCISOPEN:
1799 	case MD_MN_SET_MM_OWNER:
1800 	case MD_MN_SET_STATE:
1801 	case MD_MN_SUSPEND_WRITES:
1802 	case MD_MN_ALLOCATE_HOTSPARE:
1803 	case MD_MN_SET_SETFLAGS:
1804 	case MD_MN_GET_SETFLAGS:
1805 	case MD_MN_MDDB_OPTRECFIX:
1806 	case MD_MN_MDDB_PARSE:
1807 	case MD_MN_MDDB_BLOCK:
1808 	case MD_MN_DB_USERREQ:
1809 	case MD_IOC_SPSTATUS:
1810 	case MD_MN_COMMD_ERR:
1811 	case MD_MN_SET_COMMD_RUNNING:
1812 	case MD_MN_RESYNC:
1813 	case MD_MN_SETSYNC:
1814 	case MD_MN_POKE_HOTSPARES:
1815 		return (1);
1816 	default:
1817 		return (0);
1818 	}
1819 }
1820 
1821 /*
1822  * This routine implements the ioctl calls for the Virtual Disk System.
1823  * It is called from the device switch at normal priority.
1824  */
1825 /* ARGSUSED */
1826 static int
1827 mdioctl(dev_t dev, int cmd, intptr_t data, int mode, cred_t *cred_p,
1828 	int *rval_p)
1829 {
1830 	minor_t		mnum = getminor(dev);
1831 	mdi_unit_t	*ui;
1832 	IOLOCK		lock;
1833 	int		err;
1834 
1835 	/*
1836 	 * For multinode disksets  number of ioctls are allowed to be
1837 	 * multithreaded.
1838 	 * A fundamental assumption made in this implementation is that
1839 	 * ioctls either do not interact with other md structures  or the
1840 	 * ioctl to the admin device can only occur if the metadevice
1841 	 * device is open. i.e. avoid a race between metaclear and the
1842 	 * progress of a multithreaded ioctl.
1843 	 */
1844 
1845 	if (!is_mt_ioctl(cmd) && md_ioctl_lock_enter() == EINTR) {
1846 		return (EINTR);
1847 	}
1848 
1849 	/*
1850 	 * initialize lock tracker
1851 	 */
1852 	IOLOCK_INIT(&lock);
1853 
1854 	/* Flag to indicate that MD_GBL_IOCTL_LOCK is not acquired */
1855 
1856 	if (is_mt_ioctl(cmd)) {
1857 		/* increment the md_mtioctl_cnt */
1858 		mutex_enter(&md_mx);
1859 		md_mtioctl_cnt++;
1860 		mutex_exit(&md_mx);
1861 		lock.l_flags |= MD_MT_IOCTL;
1862 	}
1863 
1864 	/*
1865 	 * this has been added to prevent notification from re-snarfing
1866 	 * so metaunload will work.  It may interfere with other modules
1867 	 * halt process.
1868 	 */
1869 	if (md_get_status() & (MD_GBL_HALTED | MD_GBL_DAEMONS_DIE))
1870 		return (IOLOCK_RETURN(ENXIO, &lock));
1871 
1872 	/*
1873 	 * admin device ioctls
1874 	 */
1875 	if (mnum == MD_ADM_MINOR) {
1876 		err = md_admin_ioctl(md_expldev(dev), cmd, (void *) data,
1877 					mode, &lock);
1878 	}
1879 
1880 	/*
1881 	 * metadevice ioctls
1882 	 */
1883 	else if ((MD_MIN2SET(mnum) >= md_nsets) ||
1884 	    (MD_MIN2UNIT(mnum) >= md_nunits) ||
1885 	    ((ui = MDI_UNIT(mnum)) == NULL)) {
1886 		err = ENXIO;
1887 	} else if (md_ops[ui->ui_opsindex]->md_ioctl == NULL) {
1888 		err = ENOTTY;
1889 	} else {
1890 		err = (*md_ops[ui->ui_opsindex]->md_ioctl)
1891 		    (dev, cmd, (void *) data, mode, &lock);
1892 	}
1893 
1894 	/*
1895 	 * drop any locks we grabbed
1896 	 */
1897 	return (IOLOCK_RETURN_IOCTLEND(err, &lock));
1898 }
1899 
1900 static int
1901 mddump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk)
1902 {
1903 	minor_t		mnum;
1904 	set_t		setno;
1905 	mdi_unit_t	*ui;
1906 
1907 	if ((mnum = getminor(dev)) == MD_ADM_MINOR)
1908 		return (ENXIO);
1909 
1910 	setno = MD_MIN2SET(mnum);
1911 
1912 	if ((setno >= md_nsets) || (MD_MIN2UNIT(mnum) >= md_nunits) ||
1913 	    ((ui = MDI_UNIT(mnum)) == NULL))
1914 		return (ENXIO);
1915 
1916 
1917 	if ((md_get_setstatus(setno) & MD_SET_SNARFED) == 0)
1918 		return (ENXIO);
1919 
1920 	if (md_ops[ui->ui_opsindex]->md_dump  != NULL)
1921 		return ((*md_ops[ui->ui_opsindex]->md_dump)
1922 		    (dev, addr, blkno, nblk));
1923 
1924 	return (ENXIO);
1925 }
1926 
1927 /*
1928  * Metadevice unit number dispatcher
1929  * When this routine is called it will scan the
1930  * incore unit array and return the avail slot
1931  * hence the unit number to the caller
1932  *
1933  * Return -1 if there is nothing available
1934  */
1935 unit_t
1936 md_get_nextunit(set_t setno)
1937 {
1938 	unit_t	un, start;
1939 
1940 	/*
1941 	 * If nothing available
1942 	 */
1943 	if (md_set[setno].s_un_avail == 0) {
1944 		return (MD_UNITBAD);
1945 	}
1946 
1947 	mutex_enter(&md_mx);
1948 	start = un = md_set[setno].s_un_next;
1949 
1950 	/* LINTED: E_CONSTANT_CONDITION */
1951 	while (1) {
1952 		if (md_set[setno].s_un[un] == NULL) {
1953 			/*
1954 			 * Advance the starting index for the next
1955 			 * md_get_nextunit call
1956 			 */
1957 			if (un == MD_MAXUNITS - 1) {
1958 				md_set[setno].s_un_next = 0;
1959 			} else {
1960 				md_set[setno].s_un_next = un + 1;
1961 			}
1962 			break;
1963 		}
1964 
1965 		un = ((un == MD_MAXUNITS - 1) ? 0 : un + 1);
1966 
1967 		if (un == start) {
1968 			un = MD_UNITBAD;
1969 			break;
1970 		}
1971 
1972 	}
1973 
1974 	mutex_exit(&md_mx);
1975 	return (un);
1976 }
1977