xref: /titanic_41/usr/src/uts/common/io/lvm/softpart/sp.c (revision 354d1447ce995f3923a8f53d41c49fd3e6543282)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 /*
30  * Soft partitioning metadevice driver (md_sp).
31  *
32  * This file contains the primary operations of the soft partitioning
33  * metadevice driver.  This includes all routines for normal operation
34  * (open/close/read/write).  Please see mdvar.h for a definition of
35  * metadevice operations vector (md_ops_t).  This driver is loosely
36  * based on the stripe driver (md_stripe).
37  *
38  * All metadevice administration is done through the use of ioctl's.
39  * As such, all administrative routines appear in sp_ioctl.c.
40  *
41  * Soft partitions are represented both in-core and in the metadb with a
42  * unit structure.  The soft partition-specific information in the unit
43  * structure includes the following information:
44  *	- Device information (md_dev64_t & md key) about the device on which
45  *	  the soft partition is built.
46  *	- Soft partition status information.
47  *	- The size of the soft partition and number of extents used to
48  *	  make up that size.
49  *	- An array of extents which define virtual/physical offset
50  *	  mappings and lengths for each extent.
51  *
52  * Typical soft partition operation proceeds as follows:
53  *	- The unit structure is fetched from the metadb and placed into
54  *	  an in-core array (as with other metadevices).  This operation
55  *	  is performed via sp_build_incore( ) and takes place during
56  *	  "snarfing" (when all metadevices are brought in-core at
57  *	  once) and when a new soft partition is created.
58  *	- A soft partition is opened via sp_open( ).  At open time the
59  *	  soft partition unit structure is verified with the soft
60  *	  partition on-disk structures.  Additionally, the soft partition
61  *	  status is checked (only soft partitions in the OK state may be
62  *	  opened).
63  *	- Soft partition I/O is performed via sp_strategy( ) which relies on
64  *	  a support routine, sp_mapbuf( ), to do most of the work.
65  *	  sp_mapbuf( ) maps a buffer to a particular extent via a binary
66  *	  search of the extent array in the soft partition unit structure.
67  *	  Once a translation has been performed, the I/O is passed down
68  *	  to the next layer, which may be another metadevice or a physical
69  *	  disk.  Since a soft partition may contain multiple, non-contiguous
70  *	  extents, a single I/O may have to be fragmented.
71  *	- Soft partitions are closed using sp_close.
72  *
73  */
74 
75 #include <sys/param.h>
76 #include <sys/systm.h>
77 #include <sys/conf.h>
78 #include <sys/file.h>
79 #include <sys/user.h>
80 #include <sys/uio.h>
81 #include <sys/t_lock.h>
82 #include <sys/buf.h>
83 #include <sys/dkio.h>
84 #include <sys/vtoc.h>
85 #include <sys/kmem.h>
86 #include <vm/page.h>
87 #include <sys/cmn_err.h>
88 #include <sys/sysmacros.h>
89 #include <sys/types.h>
90 #include <sys/mkdev.h>
91 #include <sys/stat.h>
92 #include <sys/open.h>
93 #include <sys/lvm/mdvar.h>
94 #include <sys/lvm/md_sp.h>
95 #include <sys/lvm/md_convert.h>
96 #include <sys/lvm/md_notify.h>
97 #include <sys/lvm/md_crc.h>
98 #include <sys/modctl.h>
99 #include <sys/ddi.h>
100 #include <sys/sunddi.h>
101 #include <sys/debug.h>
102 
103 #include <sys/sysevent/eventdefs.h>
104 #include <sys/sysevent/svm.h>
105 
/* Metadevice operations vector for the soft partition driver. */
md_ops_t		sp_md_ops;
#ifndef	lint
/* NOTE(review): presumably declares the module dependency on drv/md. */
static char		_depends_on[] = "drv/md";
md_ops_t		*md_interface_ops = &sp_md_ops;
#endif

/* Global md state defined outside this file. */
extern unit_t		md_nunits;
extern set_t		md_nsets;
extern md_set_t		md_set[];

extern int		md_status;
extern major_t		md_major;
extern mdq_anchor_t	md_done_daemon;
/* daemon queue used for the soft partition status messages */
extern mdq_anchor_t	md_sp_daemon;
/* md_mx/md_cv are used by md_sp_strategy() to wait out a halted set */
extern kmutex_t		md_mx;
extern kcondvar_t	md_cv;
extern md_krwlock_t	md_unit_array_rw;

/* kmem caches backing the parent (md_spps_t) and child (md_spcs_t) saves */
static kmem_cache_t	*sp_parent_cache = NULL;
static kmem_cache_t	*sp_child_cache = NULL;
/* forward declarations; both are used by sp_validate() before definition */
static void		sp_send_stat_ok(mp_unit_t *);
static void		sp_send_stat_err(mp_unit_t *);
128 
129 /*
130  * FUNCTION:	sp_parent_constructor()
131  * INPUT:	none.
132  * OUTPUT:	ps	- parent save structure initialized.
133  * RETURNS:	void *	- ptr to initialized parent save structure.
134  * PURPOSE:	initialize parent save structure.
135  */
136 /*ARGSUSED1*/
137 static int
138 sp_parent_constructor(void *p, void *d1, int d2)
139 {
140 	mutex_init(&((md_spps_t *)p)->ps_mx,
141 	    NULL, MUTEX_DEFAULT, NULL);
142 	return (0);
143 }
144 
145 static void
146 sp_parent_init(md_spps_t *ps)
147 {
148 	bzero(ps, offsetof(md_spps_t, ps_mx));
149 }
150 
151 /*ARGSUSED1*/
152 static void
153 sp_parent_destructor(void *p, void *d)
154 {
155 	mutex_destroy(&((md_spps_t *)p)->ps_mx);
156 }
157 
158 /*
159  * FUNCTION:	sp_child_constructor()
160  * INPUT:	none.
161  * OUTPUT:	cs	- child save structure initialized.
162  * RETURNS:	void *	- ptr to initialized child save structure.
163  * PURPOSE:	initialize child save structure.
164  */
165 /*ARGSUSED1*/
166 static int
167 sp_child_constructor(void *p, void *d1, int d2)
168 {
169 	bioinit(&((md_spcs_t *)p)->cs_buf);
170 	return (0);
171 }
172 
173 static void
174 sp_child_init(md_spcs_t *cs)
175 {
176 	cs->cs_mdunit = 0;
177 	cs->cs_ps = NULL;
178 	md_bioreset(&cs->cs_buf);
179 }
180 
181 /*ARGSUSED1*/
182 static void
183 sp_child_destructor(void *p, void *d)
184 {
185 	biofini(&((md_spcs_t *)p)->cs_buf);
186 }
187 
188 /*
189  * FUNCTION:	sp_run_queue()
190  * INPUT:	none.
191  * OUTPUT:	none.
192  * RETURNS:	void.
193  * PURPOSE:	run the md_daemon to clean up memory pool.
194  */
195 /*ARGSUSED*/
196 static void
197 sp_run_queue(void *d)
198 {
199 	if (!(md_status & MD_GBL_DAEMONS_LIVE))
200 		md_daemon(1, &md_done_daemon);
201 }
202 
203 
204 /*
205  * FUNCTION:	sp_build_incore()
206  * INPUT:	p		- ptr to unit structure.
207  *		snarfing	- flag to tell us we are snarfing.
208  * OUTPUT:	non.
209  * RETURNS:	int	- 0 (always).
210  * PURPOSE:	place unit structure into in-core unit array (keyed from
211  *		minor number).
212  */
213 int
214 sp_build_incore(void *p, int snarfing)
215 {
216 	mp_unit_t	*un = (mp_unit_t *)p;
217 	minor_t		mnum;
218 	set_t		setno;
219 	md_dev64_t	tmpdev;
220 
221 	mnum = MD_SID(un);
222 
223 	if (MD_UNIT(mnum) != NULL)
224 		return (0);
225 
226 	MD_STATUS(un) = 0;
227 
228 	if (snarfing) {
229 		/*
230 		 * if we are snarfing, we get the device information
231 		 * from the metadb record (using the metadb key for
232 		 * that device).
233 		 */
234 		setno = MD_MIN2SET(mnum);
235 
236 		tmpdev = md_getdevnum(setno, mddb_getsidenum(setno),
237 		    un->un_key, MD_NOTRUST_DEVT);
238 		un->un_dev = tmpdev;
239 	}
240 
241 	/* place unit in in-core array */
242 	MD_UNIT(mnum) = un;
243 	return (0);
244 }
245 
246 /*
247  * FUNCTION:	reset_sp()
248  * INPUT:	un		- unit structure to be reset/removed.
249  *		mnum		- minor number to be reset/removed.
250  *		removing	- flag to tell us if we are removing
251  *				  permanently or just reseting in-core
252  *				  structures.
253  * OUTPUT:	none.
254  * RETURNS:	void.
255  * PURPOSE:	used to either simply reset in-core structures or to
256  *		permanently remove metadevices from the metadb.
257  */
258 void
259 reset_sp(mp_unit_t *un, minor_t mnum, int removing)
260 {
261 	sv_dev_t	*sv;
262 	mddb_recid_t	vtoc_id;
263 
264 	/* clean up in-core structures */
265 	md_destroy_unit_incore(mnum, &sp_md_ops);
266 
267 	MD_UNIT(mnum) = NULL;
268 
269 	if (!removing)
270 		return;
271 
272 	/* we are removing the soft partition from the metadb */
273 
274 	/*
275 	 * Save off device information so we can get to
276 	 * it after we do the mddb_deleterec().
277 	 */
278 	sv = (sv_dev_t *)kmem_alloc(sizeof (sv_dev_t), KM_SLEEP);
279 	sv->setno = MD_MIN2SET(mnum);
280 	sv->key = un->un_key;
281 	vtoc_id = un->c.un_vtoc_id;
282 
283 	/* Remove the unit structure */
284 	mddb_deleterec_wrapper(un->c.un_record_id);
285 
286 	if (vtoc_id)
287 		mddb_deleterec_wrapper(vtoc_id);
288 
289 	SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_DELETE, TAG_METADEVICE,
290 	    MD_MIN2SET(mnum), MD_MIN2UNIT(mnum));
291 
292 	/*
293 	 * remove the underlying device name from the metadb.  if other
294 	 * soft partitions are built on this device, this will simply
295 	 * decrease the reference count for this device.  otherwise the
296 	 * name record for this device will be removed from the metadb.
297 	 */
298 	md_rem_names(sv, 1);
299 	kmem_free(sv, sizeof (sv_dev_t));
300 }
301 
302 /*
303  * FUNCTION:	sp_send_stat_msg
304  * INPUT:	un	- unit reference
305  *		status	- status to be sent to master node
306  *			MD_SP_OK - soft-partition is now OK
307  *			MD_SP_ERR	"	"	 errored
308  * OUTPUT:	none.
309  * RETURNS:	void.
310  * PURPOSE:	send a soft-partition status change to the master node. If the
311  *		message succeeds we simply return. If it fails we panic as the
312  *		cluster-wide view of the metadevices is now inconsistent.
313  * CALLING CONTEXT:
314  *	Blockable. No locks can be held.
315  */
316 static void
317 sp_send_stat_msg(mp_unit_t *un, sp_status_t status)
318 {
319 	md_mn_msg_sp_setstat_t	sp_msg;
320 	md_mn_kresult_t	*kres;
321 	set_t		setno = MD_UN2SET(un);
322 	int		rval;
323 	const char	*str = (status == MD_SP_ERR) ? "MD_SP_ERR" : "MD_SP_OK";
324 
325 	sp_msg.sp_setstat_mnum = MD_SID(un);
326 	sp_msg.sp_setstat_status = status;
327 
328 	kres = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
329 
330 	rval = mdmn_ksend_message(setno, MD_MN_MSG_SP_SETSTAT2, MD_MSGF_NO_LOG,
331 	    (char *)&sp_msg, sizeof (sp_msg), kres);
332 
333 	if (!MDMN_KSEND_MSG_OK(rval, kres)) {
334 		mdmn_ksend_show_error(rval, kres, "MD_MN_MSG_SP_SETSTAT2");
335 
336 		/*
337 		 * Panic as we are now in an inconsistent state.
338 		 */
339 
340 		cmn_err(CE_PANIC, "md: %s: %s could not be set on all nodes\n",
341 		    md_shortname(MD_SID(un)), str);
342 	}
343 
344 	kmem_free(kres, sizeof (md_mn_kresult_t));
345 }
346 
347 /*
348  * FUNCTION:	sp_finish_error
349  * INPUT:	ps	- parent save structure for error-ed I/O.
350  *		lock_held	- set if the unit readerlock is held
351  * OUTPUT:	none.
352  * RETURNS:	void.
353  * PURPOSE:	report a driver error
354  */
355 static void
356 sp_finish_error(md_spps_t *ps, int lock_held)
357 {
358 	struct buf	*pb = ps->ps_bp;
359 	mdi_unit_t	*ui = ps->ps_ui;
360 	md_dev64_t	un_dev;			/* underlying device */
361 	md_dev64_t	md_dev = md_expldev(pb->b_edev); /* metadev in error */
362 	char		*str;
363 
364 	un_dev = md_expldev(ps->ps_un->un_dev);
365 	/* set error type */
366 	if (pb->b_flags & B_READ) {
367 		str = "read";
368 	} else {
369 		str = "write";
370 	}
371 
372 
373 	SPPS_FREE(sp_parent_cache, ps);
374 	pb->b_flags |= B_ERROR;
375 
376 	md_kstat_done(ui, pb, 0);
377 
378 	if (lock_held) {
379 		md_unit_readerexit(ui);
380 	}
381 	md_biodone(pb);
382 
383 	cmn_err(CE_WARN, "md: %s: %s error on %s",
384 	    md_shortname(md_getminor(md_dev)), str,
385 	    md_devname(MD_DEV2SET(md_dev), un_dev, NULL, 0));
386 }
387 
388 
389 /*
390  * FUNCTION:	sp_xmit_ok
391  * INPUT:	dq	- daemon queue referencing failing ps structure
392  * OUTPUT:	none.
393  * RETURNS:	void.
394  * PURPOSE:	send a message to the master node in a multi-owner diskset to
395  *		update all attached nodes view of the soft-part to be MD_SP_OK.
396  * CALLING CONTEXT:
397  *	Blockable. No unit lock held.
398  */
399 static void
400 sp_xmit_ok(daemon_queue_t *dq)
401 {
402 	md_spps_t	*ps = (md_spps_t *)dq;
403 
404 	/* Send a MD_MN_MSG_SP_SETSTAT to the master */
405 	sp_send_stat_msg(ps->ps_un, MD_SP_OK);
406 
407 	/*
408 	 * Successfully transmitted error state to all nodes, now release this
409 	 * parent structure.
410 	 */
411 	SPPS_FREE(sp_parent_cache, ps);
412 }
413 
414 /*
415  * FUNCTION:	sp_xmit_error
416  * INPUT:	dq	- daemon queue referencing failing ps structure
417  * OUTPUT:	none.
418  * RETURNS:	void.
419  * PURPOSE:	send a message to the master node in a multi-owner diskset to
420  *		update all attached nodes view of the soft-part to be MD_SP_ERR.
421  * CALLING CONTEXT:
422  *	Blockable. No unit lock held.
423  */
424 static void
425 sp_xmit_error(daemon_queue_t *dq)
426 {
427 	md_spps_t	*ps = (md_spps_t *)dq;
428 
429 	/* Send a MD_MN_MSG_SP_SETSTAT to the master */
430 	sp_send_stat_msg(ps->ps_un, MD_SP_ERR);
431 
432 	/*
433 	 * Successfully transmitted error state to all nodes, now release this
434 	 * parent structure.
435 	 */
436 	SPPS_FREE(sp_parent_cache, ps);
437 }
438 static void
439 sp_send_stat_ok(mp_unit_t *un)
440 {
441 	minor_t		mnum = MD_SID(un);
442 	md_spps_t	*ps;
443 
444 	ps = kmem_cache_alloc(sp_parent_cache, MD_ALLOCFLAGS);
445 	sp_parent_init(ps);
446 	ps->ps_un = un;
447 	ps->ps_ui = MDI_UNIT(mnum);
448 
449 	daemon_request(&md_sp_daemon, sp_xmit_ok, (daemon_queue_t *)ps,
450 	REQ_OLD);
451 }
452 
453 static void
454 sp_send_stat_err(mp_unit_t *un)
455 {
456 	minor_t		mnum = MD_SID(un);
457 	md_spps_t	*ps;
458 
459 	ps = kmem_cache_alloc(sp_parent_cache, MD_ALLOCFLAGS);
460 	sp_parent_init(ps);
461 	ps->ps_un = un;
462 	ps->ps_ui = MDI_UNIT(mnum);
463 
464 	daemon_request(&md_sp_daemon, sp_xmit_error, (daemon_queue_t *)ps,
465 	REQ_OLD);
466 }
467 
468 
469 /*
470  * FUNCTION:	sp_error()
471  * INPUT:	ps	- parent save structure for error-ed I/O.
472  * OUTPUT:	none.
473  * RETURNS:	void.
474  * PURPOSE:	report a driver error.
475  * CALLING CONTEXT:
476  *	Interrupt - non-blockable
477  */
478 static void
479 sp_error(md_spps_t *ps)
480 {
481 	set_t		setno = MD_UN2SET(ps->ps_un);
482 
483 	/*
484 	 * Drop the mutex associated with this request before (potentially)
485 	 * enqueuing the free onto a separate thread. We have to release the
486 	 * mutex before destroying the parent structure.
487 	 */
488 	if (!(ps->ps_flags & MD_SPPS_DONTFREE)) {
489 		if (MUTEX_HELD(&ps->ps_mx)) {
490 			mutex_exit(&ps->ps_mx);
491 		}
492 	} else {
493 		/*
494 		 * this should only ever happen if we are panicking,
495 		 * since DONTFREE is only set on the parent if panicstr
496 		 * is non-NULL.
497 		 */
498 		ASSERT(panicstr);
499 	}
500 
501 	/*
502 	 * For a multi-owner set we need to send a message to the master so that
503 	 * all nodes get the errored status when we first encounter it. To avoid
504 	 * deadlocking when multiple soft-partitions encounter an error on one
505 	 * physical unit we drop the unit readerlock before enqueueing the
506 	 * request. That way we can service any messages that require a
507 	 * writerlock to be held. Additionally, to avoid deadlocking when at
508 	 * the bottom of a metadevice stack and a higher level mirror has
509 	 * multiple requests outstanding on this soft-part, we clone the ps
510 	 * that failed and pass the error back up the stack to release the
511 	 * reference that this i/o may have in the higher-level metadevice.
512 	 * The other nodes in the cluster just have to modify the soft-part
513 	 * status and we do not need to block the i/o completion for this.
514 	 */
515 	if (MD_MNSET_SETNO(setno)) {
516 		md_spps_t	*err_ps;
517 		err_ps = kmem_cache_alloc(sp_parent_cache, MD_ALLOCFLAGS);
518 		sp_parent_init(err_ps);
519 
520 		err_ps->ps_un = ps->ps_un;
521 		err_ps->ps_ui = ps->ps_ui;
522 
523 		md_unit_readerexit(ps->ps_ui);
524 
525 		daemon_request(&md_sp_daemon, sp_xmit_error,
526 		    (daemon_queue_t *)err_ps, REQ_OLD);
527 
528 		sp_finish_error(ps, 0);
529 
530 		return;
531 	} else {
532 		ps->ps_un->un_status = MD_SP_ERR;
533 	}
534 
535 	/* Flag the error */
536 	sp_finish_error(ps, 1);
537 
538 }
539 
540 /*
541  * FUNCTION:	sp_mapbuf()
542  * INPUT:	un	- unit structure for soft partition we are doing
543  *			  I/O on.
544  *		voff	- virtual offset in soft partition to map.
545  *		bcount	- # of blocks in the I/O.
546  * OUTPUT:	bp	- translated buffer to be passed down to next layer.
547  * RETURNS:	1	- request must be fragmented, more work to do,
548  *		0	- request satisified, no more work to do
549  *		-1	- error
550  * PURPOSE:	Map the the virtual offset in the soft partition (passed
551  *		in via voff) to the "physical" offset on whatever the soft
552  *		partition is built on top of.  We do this by doing a binary
553  *		search of the extent array in the soft partition unit
554  *		structure.  Once the current extent is found, we do the
555  *		translation, determine if the I/O will cross extent
556  *		boundaries (if so, we have to fragment the I/O), then
557  *		fill in the buf structure to be passed down to the next layer.
558  */
559 static int
560 sp_mapbuf(
561 	mp_unit_t	*un,
562 	sp_ext_offset_t	voff,
563 	sp_ext_length_t	bcount,
564 	buf_t		*bp
565 )
566 {
567 	int		lo, mid, hi, found, more;
568 	size_t		new_bcount;
569 	sp_ext_offset_t new_blkno;
570 	sp_ext_offset_t	new_offset;
571 	sp_ext_offset_t	ext_endblk;
572 	md_dev64_t	new_edev;
573 	extern unsigned	md_maxphys;
574 
575 	found = 0;
576 	lo = 0;
577 	hi = un->un_numexts - 1;
578 
579 	/*
580 	 * do a binary search to find the extent that contains the
581 	 * starting offset.  after this loop, mid contains the index
582 	 * of the correct extent.
583 	 */
584 	while (lo <= hi && !found) {
585 		mid = (lo + hi) / 2;
586 		/* is the starting offset contained within the mid-ext? */
587 		if (voff >= un->un_ext[mid].un_voff &&
588 		    voff < un->un_ext[mid].un_voff + un->un_ext[mid].un_len)
589 			found = 1;
590 		else if (voff < un->un_ext[mid].un_voff)
591 			hi = mid - 1;
592 		else /* voff > un->un_ext[mid].un_voff + un->un_ext[mid].len */
593 			lo = mid + 1;
594 	}
595 
596 	if (!found) {
597 		cmn_err(CE_WARN, "sp_mapbuf: invalid offset %llu.\n", voff);
598 		return (-1);
599 	}
600 
601 	/* translate to underlying physical offset/device */
602 	new_offset = voff - un->un_ext[mid].un_voff;
603 	new_blkno = un->un_ext[mid].un_poff + new_offset;
604 	new_edev = un->un_dev;
605 
606 	/* determine if we need to break the I/O into fragments */
607 	ext_endblk = un->un_ext[mid].un_voff + un->un_ext[mid].un_len;
608 	if (voff + btodb(bcount) > ext_endblk) {
609 		new_bcount = dbtob(ext_endblk - voff);
610 		more = 1;
611 	} else {
612 		new_bcount = bcount;
613 		more = 0;
614 	}
615 
616 	/* only break up the I/O if we're not built on another metadevice */
617 	if ((md_getmajor(new_edev) != md_major) && (new_bcount > md_maxphys)) {
618 		new_bcount = md_maxphys;
619 		more = 1;
620 	}
621 	if (bp != (buf_t *)NULL) {
622 		/* do bp updates */
623 		bp->b_bcount = new_bcount;
624 		bp->b_lblkno = new_blkno;
625 		bp->b_edev = md_dev64_to_dev(new_edev);
626 	}
627 	return (more);
628 }
629 
630 /*
631  * FUNCTION:	sp_validate()
632  * INPUT:	un	- unit structure to be validated.
633  * OUTPUT:	none.
634  * RETURNS:	0	- soft partition ok.
635  *		-1	- error.
636  * PURPOSE:	called on open to sanity check the soft partition.  In
637  *		order to open a soft partition:
638  *		- it must have at least one extent
639  *		- the extent info in core and on disk must match
640  *		- it may not be in an intermediate state (which would
641  *		  imply that a two-phase commit was interrupted)
642  *
643  *		If the extent checking fails (B_ERROR returned from the read
644  *		strategy call) _and_ we're a multi-owner diskset, we send a
645  *		message to the master so that all nodes inherit the same view
646  *		of the soft partition.
647  *		If we are checking a soft-part that is marked as in error, and
648  *		we can actually read and validate the watermarks we send a
649  *		message to clear the error to the master node.
650  */
651 static int
652 sp_validate(mp_unit_t *un)
653 {
654 	uint_t		ext;
655 	struct buf	*buf;
656 	sp_ext_length_t	len;
657 	mp_watermark_t	*wm;
658 	set_t		setno;
659 	int		reset_error = 0;
660 
661 	setno = MD_UN2SET(un);
662 
663 	/* sanity check unit structure components ?? */
664 	if (un->un_status != MD_SP_OK) {
665 		if (un->un_status != MD_SP_ERR) {
666 			cmn_err(CE_WARN, "md: %s: open failed, soft partition "
667 			    "status is %u.",
668 			    md_shortname(MD_SID(un)),
669 			    un->un_status);
670 			return (-1);
671 		} else {
672 			cmn_err(CE_WARN, "md: %s: open of soft partition "
673 			    "in Errored state.",
674 			    md_shortname(MD_SID(un)));
675 			reset_error = 1;
676 		}
677 	}
678 
679 	if (un->un_numexts == 0) {
680 		cmn_err(CE_WARN, "md: %s: open failed, soft partition does "
681 		    "not have any extents.", md_shortname(MD_SID(un)));
682 		return (-1);
683 	}
684 
685 	len = 0LL;
686 	for (ext = 0; ext < un->un_numexts; ext++) {
687 
688 		/* tally extent lengths to check total size */
689 		len += un->un_ext[ext].un_len;
690 
691 		/* allocate buffer for watermark */
692 		buf = getrbuf(KM_SLEEP);
693 
694 		/* read watermark */
695 		buf->b_flags = B_READ;
696 		buf->b_edev = md_dev64_to_dev(un->un_dev);
697 		buf->b_iodone = NULL;
698 		buf->b_proc = NULL;
699 		buf->b_bcount = sizeof (mp_watermark_t);
700 		buf->b_lblkno = un->un_ext[ext].un_poff - 1;
701 		buf->b_bufsize = sizeof (mp_watermark_t);
702 		buf->b_un.b_addr = kmem_alloc(sizeof (mp_watermark_t),
703 		    KM_SLEEP);
704 
705 		/*
706 		 * make the call non-blocking so that it is not affected
707 		 * by a set take.
708 		 */
709 		md_call_strategy(buf, MD_STR_MAPPED|MD_NOBLOCK, NULL);
710 		(void) biowait(buf);
711 
712 		if (buf->b_flags & B_ERROR) {
713 			cmn_err(CE_WARN, "md: %s: open failed, could not "
714 			    "read watermark at block %llu for extent %u, "
715 			    "error %d.", md_shortname(MD_SID(un)),
716 			    buf->b_lblkno, ext, buf->b_error);
717 			kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t));
718 			freerbuf(buf);
719 
720 			/*
721 			 * If we're a multi-owner diskset we send a message
722 			 * indicating that this soft-part has an invalid
723 			 * extent to the master node. This ensures a consistent
724 			 * view of the soft-part across the cluster.
725 			 */
726 			if (MD_MNSET_SETNO(setno)) {
727 				sp_send_stat_err(un);
728 			}
729 			return (-1);
730 		}
731 
732 		wm = (mp_watermark_t *)buf->b_un.b_addr;
733 
734 		/* make sure the checksum is correct first */
735 		if (crcchk((uchar_t *)wm, (uint_t *)&wm->wm_checksum,
736 		    (uint_t)sizeof (mp_watermark_t), (uchar_t *)NULL)) {
737 			cmn_err(CE_WARN, "md: %s: open failed, watermark "
738 			    "at block %llu for extent %u does not have a "
739 			    "valid checksum 0x%08x.", md_shortname(MD_SID(un)),
740 			    buf->b_lblkno, ext, wm->wm_checksum);
741 			kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t));
742 			freerbuf(buf);
743 			return (-1);
744 		}
745 
746 		if (wm->wm_magic != MD_SP_MAGIC) {
747 			cmn_err(CE_WARN, "md: %s: open failed, watermark "
748 			    "at block %llu for extent %u does not have a "
749 			    "valid watermark magic number, expected 0x%x, "
750 			    "found 0x%x.", md_shortname(MD_SID(un)),
751 			    buf->b_lblkno, ext, MD_SP_MAGIC, wm->wm_magic);
752 			kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t));
753 			freerbuf(buf);
754 			return (-1);
755 		}
756 
757 		/* make sure sequence number matches the current extent */
758 		if (wm->wm_seq != ext) {
759 			cmn_err(CE_WARN, "md: %s: open failed, watermark "
760 			    "at block %llu for extent %u has invalid "
761 			    "sequence number %u.", md_shortname(MD_SID(un)),
762 			    buf->b_lblkno, ext, wm->wm_seq);
763 			kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t));
764 			freerbuf(buf);
765 			return (-1);
766 		}
767 
768 		/* make sure watermark length matches unit structure */
769 		if (wm->wm_length != un->un_ext[ext].un_len) {
770 			cmn_err(CE_WARN, "md: %s: open failed, watermark "
771 			    "at block %llu for extent %u has inconsistent "
772 			    "length, expected %llu, found %llu.",
773 			    md_shortname(MD_SID(un)), buf->b_lblkno,
774 			    ext, un->un_ext[ext].un_len,
775 			    (u_longlong_t)wm->wm_length);
776 			kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t));
777 			freerbuf(buf);
778 			return (-1);
779 		}
780 
781 		/*
782 		 * make sure the type is a valid soft partition and not
783 		 * a free extent or the end.
784 		 */
785 		if (wm->wm_type != EXTTYP_ALLOC) {
786 			cmn_err(CE_WARN, "md: %s: open failed, watermark "
787 			    "at block %llu for extent %u is not marked "
788 			    "as in-use, type = %u.", md_shortname(MD_SID(un)),
789 			    buf->b_lblkno, ext, wm->wm_type);
790 			kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t));
791 			freerbuf(buf);
792 			return (-1);
793 		}
794 		/* free up buffer */
795 		kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t));
796 		freerbuf(buf);
797 	}
798 
799 	if (len != un->un_length) {
800 		cmn_err(CE_WARN, "md: %s: open failed, computed length "
801 		    "%llu != expected length %llu.", md_shortname(MD_SID(un)),
802 		    len, un->un_length);
803 		return (-1);
804 	}
805 
806 	/*
807 	 * If we're a multi-owner set _and_ reset_error is set, we should clear
808 	 * the error condition on all nodes in the set. Use SP_SETSTAT2 with
809 	 * MD_SP_OK.
810 	 */
811 	if (MD_MNSET_SETNO(setno) && reset_error) {
812 		sp_send_stat_ok(un);
813 	}
814 	return (0);
815 }
816 
817 /*
818  * FUNCTION:	sp_done()
819  * INPUT:	child_buf	- buffer attached to child save structure.
820  *				  this is the buffer on which I/O has just
821  *				  completed.
822  * OUTPUT:	none.
823  * RETURNS:	0	- success.
824  *		1	- error.
825  * PURPOSE:	called on I/O completion.
826  */
827 static int
828 sp_done(struct buf *child_buf)
829 {
830 	struct buf	*parent_buf;
831 	mdi_unit_t	*ui;
832 	md_spps_t	*ps;
833 	md_spcs_t	*cs;
834 
835 	/* find the child save structure to which this buffer belongs */
836 	cs = (md_spcs_t *)((caddr_t)child_buf -
837 	    (sizeof (md_spcs_t) - sizeof (buf_t)));
838 	/* now get the parent save structure */
839 	ps = cs->cs_ps;
840 	parent_buf = ps->ps_bp;
841 
842 	mutex_enter(&ps->ps_mx);
843 	/* pass any errors back up to the parent */
844 	if (child_buf->b_flags & B_ERROR) {
845 		ps->ps_flags |= MD_SPPS_ERROR;
846 		parent_buf->b_error = child_buf->b_error;
847 	}
848 	/* mapout, if needed */
849 	if (child_buf->b_flags & B_REMAPPED)
850 		bp_mapout(child_buf);
851 
852 	ps->ps_frags--;
853 	if (ps->ps_frags != 0) {
854 		/*
855 		 * if this parent has more children, we just free the
856 		 * child and return.
857 		 */
858 		kmem_cache_free(sp_child_cache, cs);
859 		mutex_exit(&ps->ps_mx);
860 		return (1);
861 	}
862 	/* there are no more children */
863 	kmem_cache_free(sp_child_cache, cs);
864 	if (ps->ps_flags & MD_SPPS_ERROR) {
865 		sp_error(ps);
866 		return (1);
867 	}
868 	ui = ps->ps_ui;
869 	if (!(ps->ps_flags & MD_SPPS_DONTFREE)) {
870 		mutex_exit(&ps->ps_mx);
871 	} else {
872 		/*
873 		 * this should only ever happen if we are panicking,
874 		 * since DONTFREE is only set on the parent if panicstr
875 		 * is non-NULL.
876 		 */
877 		ASSERT(panicstr);
878 	}
879 	SPPS_FREE(sp_parent_cache, ps);
880 	md_kstat_done(ui, parent_buf, 0);
881 	md_unit_readerexit(ui);
882 	md_biodone(parent_buf);
883 	return (0);
884 }
885 
886 /*
887  * FUNCTION:	md_sp_strategy()
888  * INPUT:	parent_buf	- parent buffer
889  *		flag		- flags
890  *		private		- private data
891  * OUTPUT:	none.
892  * RETURNS:	void.
893  * PURPOSE:	Soft partitioning I/O strategy.  Performs the main work
894  *		needed to do I/O to a soft partition.  The basic
895  *		algorithm is as follows:
896  *			- Allocate a child save structure to keep track
897  *			  of the I/O we are going to pass down.
898  *			- Map the I/O to the correct extent in the soft
899  *			  partition (see sp_mapbuf()).
900  *			- bioclone() the buffer and pass it down the
901  *			  stack using md_call_strategy.
902  *			- If the I/O needs to split across extents,
903  *			  repeat the above steps until all fragments
904  *			  are finished.
905  */
906 static void
907 md_sp_strategy(buf_t *parent_buf, int flag, void *private)
908 {
909 	md_spps_t	*ps;
910 	md_spcs_t	*cs;
911 	int		more;
912 	mp_unit_t	*un;
913 	mdi_unit_t	*ui;
914 	size_t		current_count;
915 	off_t		current_offset;
916 	sp_ext_offset_t	current_blkno;
917 	buf_t		*child_buf;
918 	set_t		setno = MD_MIN2SET(getminor(parent_buf->b_edev));
919 	int		strat_flag = flag;
920 
921 	/*
922 	 * When doing IO to a multi owner meta device, check if set is halted.
923 	 * We do this check without the needed lock held, for performance
924 	 * reasons.
925 	 * If an IO just slips through while the set is locked via an
926 	 * MD_MN_SUSPEND_SET, we don't care about it.
927 	 * Only check for suspension if we are a top-level i/o request
928 	 * (MD_STR_NOTTOP is cleared in 'flag');
929 	 */
930 	if ((md_set[setno].s_status & (MD_SET_HALTED | MD_SET_MNSET)) ==
931 	    (MD_SET_HALTED | MD_SET_MNSET)) {
932 		if ((flag & MD_STR_NOTTOP) == 0) {
933 			mutex_enter(&md_mx);
934 			/* Here we loop until the set is no longer halted */
935 			while (md_set[setno].s_status & MD_SET_HALTED) {
936 				cv_wait(&md_cv, &md_mx);
937 			}
938 			mutex_exit(&md_mx);
939 		}
940 	}
941 
942 	ui = MDI_UNIT(getminor(parent_buf->b_edev));
943 
944 	md_kstat_waitq_enter(ui);
945 
946 	un = (mp_unit_t *)md_unit_readerlock(ui);
947 
948 	if ((flag & MD_NOBLOCK) == 0) {
949 		if (md_inc_iocount(setno) != 0) {
950 			parent_buf->b_flags |= B_ERROR;
951 			parent_buf->b_error = ENXIO;
952 			parent_buf->b_resid = parent_buf->b_bcount;
953 			md_unit_readerexit(ui);
954 			biodone(parent_buf);
955 			return;
956 		}
957 	} else {
958 		md_inc_iocount_noblock(setno);
959 	}
960 
961 	if (!(flag & MD_STR_NOTTOP)) {
962 		if (md_checkbuf(ui, (md_unit_t *)un, parent_buf) != 0) {
963 			md_kstat_waitq_exit(ui);
964 			return;
965 		}
966 	}
967 
968 	ps = kmem_cache_alloc(sp_parent_cache, MD_ALLOCFLAGS);
969 	sp_parent_init(ps);
970 
971 	/*
972 	 * Save essential information from the original buffhdr
973 	 * in the parent.
974 	 */
975 	ps->ps_un = un;
976 	ps->ps_ui = ui;
977 	ps->ps_bp = parent_buf;
978 	ps->ps_addr = parent_buf->b_un.b_addr;
979 
980 	current_count = parent_buf->b_bcount;
981 	current_blkno = (sp_ext_offset_t)parent_buf->b_blkno;
982 	current_offset  = 0;
983 
984 	/*
985 	 * if we are at the top and we are panicking,
986 	 * we don't free in order to save state.
987 	 */
988 	if (!(flag & MD_STR_NOTTOP) && (panicstr != NULL))
989 		ps->ps_flags |= MD_SPPS_DONTFREE;
990 
991 	md_kstat_waitq_to_runq(ui);
992 
993 	ps->ps_frags++;
994 
995 	/*
996 	 * Mark this i/o as MD_STR_ABR if we've had ABR enabled on this
997 	 * metadevice.
998 	 */
999 	if (ui->ui_tstate & MD_ABR_CAP)
1000 		strat_flag |= MD_STR_ABR;
1001 
1002 	/*
1003 	 * this loop does the main work of an I/O.  we allocate a
1004 	 * a child save for each buf, do the logical to physical
1005 	 * mapping, decide if we need to frag the I/O, clone the
1006 	 * new I/O to pass down the stack.  repeat until we've
1007 	 * taken care of the entire buf that was passed to us.
1008 	 */
1009 	do {
1010 		cs = kmem_cache_alloc(sp_child_cache, MD_ALLOCFLAGS);
1011 		sp_child_init(cs);
1012 		child_buf = &cs->cs_buf;
1013 		cs->cs_ps = ps;
1014 
1015 		more = sp_mapbuf(un, current_blkno, current_count, child_buf);
1016 		if (more == -1) {
1017 			parent_buf->b_flags |= B_ERROR;
1018 			parent_buf->b_error = EIO;
1019 			md_kstat_done(ui, parent_buf, 0);
1020 			md_unit_readerexit(ui);
1021 			md_biodone(parent_buf);
1022 			kmem_cache_free(sp_parent_cache, ps);
1023 			return;
1024 		}
1025 
1026 		child_buf = md_bioclone(parent_buf, current_offset,
1027 					child_buf->b_bcount, child_buf->b_edev,
1028 					child_buf->b_blkno, sp_done, child_buf,
1029 					KM_NOSLEEP);
1030 		/* calculate new offset, counts, etc... */
1031 		current_offset += child_buf->b_bcount;
1032 		current_count -=  child_buf->b_bcount;
1033 		current_blkno +=  (sp_ext_offset_t)(btodb(child_buf->b_bcount));
1034 
1035 		if (more) {
1036 			mutex_enter(&ps->ps_mx);
1037 			ps->ps_frags++;
1038 			mutex_exit(&ps->ps_mx);
1039 		}
1040 
1041 		md_call_strategy(child_buf, strat_flag, private);
1042 	} while (more);
1043 
1044 	if (!(flag & MD_STR_NOTTOP) && (panicstr != NULL)) {
1045 		while (!(ps->ps_flags & MD_SPPS_DONE)) {
1046 			md_daemon(1, &md_done_daemon);
1047 		}
1048 		kmem_cache_free(sp_parent_cache, ps);
1049 	}
1050 }
1051 
1052 /*
1053  * FUNCTION:	sp_directed_read()
1054  * INPUT:	mnum	- minor number
1055  *		vdr	- vol_directed_rd_t from user
1056  *		mode	- access mode for copying data out.
1057  * OUTPUT:	none.
1058  * RETURNS:	0	- success
1059  *		Exxxxx	- failure error-code
1060  * PURPOSE:	Construct the necessary sub-device i/o requests to perform the
1061  *		directed read as requested by the user. This is essentially the
1062  *		same as md_sp_strategy() with the exception being that the
1063  *		underlying 'md_call_strategy' is replaced with an ioctl call.
1064  */
1065 int
1066 sp_directed_read(minor_t mnum, vol_directed_rd_t *vdr, int mode)
1067 {
1068 	md_spps_t	*ps;
1069 	md_spcs_t	*cs;
1070 	int		more;
1071 	mp_unit_t	*un;
1072 	mdi_unit_t	*ui;
1073 	size_t		current_count;
1074 	off_t		current_offset;
1075 	sp_ext_offset_t	current_blkno;
1076 	buf_t		*child_buf, *parent_buf;
1077 	void		*kbuffer;
1078 	vol_directed_rd_t	cvdr;
1079 	caddr_t		userbuf;
1080 	offset_t	useroff;
1081 	int		ret = 0;
1082 
1083 	ui = MDI_UNIT(mnum);
1084 
1085 	md_kstat_waitq_enter(ui);
1086 
1087 	bzero(&cvdr, sizeof (cvdr));
1088 
1089 	un = (mp_unit_t *)md_unit_readerlock(ui);
1090 
1091 	/*
1092 	 * Construct a parent_buf header which reflects the user-supplied
1093 	 * request.
1094 	 */
1095 
1096 	kbuffer = kmem_alloc(vdr->vdr_nbytes, KM_NOSLEEP);
1097 	if (kbuffer == NULL) {
1098 		vdr->vdr_flags |= DKV_DMR_ERROR;
1099 		md_unit_readerexit(ui);
1100 		return (ENOMEM);
1101 	}
1102 
1103 	parent_buf = getrbuf(KM_NOSLEEP);
1104 	if (parent_buf == NULL) {
1105 		vdr->vdr_flags |= DKV_DMR_ERROR;
1106 		md_unit_readerexit(ui);
1107 		kmem_free(kbuffer, vdr->vdr_nbytes);
1108 		return (ENOMEM);
1109 	}
1110 	parent_buf->b_un.b_addr = kbuffer;
1111 	parent_buf->b_flags = B_READ;
1112 	parent_buf->b_bcount = vdr->vdr_nbytes;
1113 	parent_buf->b_lblkno = lbtodb(vdr->vdr_offset);
1114 	parent_buf->b_edev = un->un_dev;
1115 
1116 
1117 	ps = kmem_cache_alloc(sp_parent_cache, MD_ALLOCFLAGS);
1118 	sp_parent_init(ps);
1119 
1120 	/*
1121 	 * Save essential information from the original buffhdr
1122 	 * in the parent.
1123 	 */
1124 	ps->ps_un = un;
1125 	ps->ps_ui = ui;
1126 	ps->ps_bp = parent_buf;
1127 	ps->ps_addr = parent_buf->b_un.b_addr;
1128 
1129 	current_count = parent_buf->b_bcount;
1130 	current_blkno = (sp_ext_offset_t)parent_buf->b_lblkno;
1131 	current_offset  = 0;
1132 
1133 	ps->ps_frags++;
1134 	vdr->vdr_bytesread = 0;
1135 
1136 	/*
1137 	 * this loop does the main work of an I/O.  we allocate a
1138 	 * a child save for each buf, do the logical to physical
1139 	 * mapping, decide if we need to frag the I/O, clone the
1140 	 * new I/O to pass down the stack.  repeat until we've
1141 	 * taken care of the entire buf that was passed to us.
1142 	 */
1143 	do {
1144 		cs = kmem_cache_alloc(sp_child_cache, MD_ALLOCFLAGS);
1145 		sp_child_init(cs);
1146 		child_buf = &cs->cs_buf;
1147 		cs->cs_ps = ps;
1148 
1149 		more = sp_mapbuf(un, current_blkno, current_count, child_buf);
1150 		if (more == -1) {
1151 			ret = EIO;
1152 			vdr->vdr_flags |= DKV_DMR_SHORT;
1153 			kmem_cache_free(sp_child_cache, cs);
1154 			goto err_out;
1155 		}
1156 
1157 		cvdr.vdr_flags = vdr->vdr_flags;
1158 		cvdr.vdr_side = vdr->vdr_side;
1159 		cvdr.vdr_nbytes = child_buf->b_bcount;
1160 		cvdr.vdr_offset = ldbtob(child_buf->b_lblkno);
1161 		/* Work out where we are in the allocated buffer */
1162 		useroff = (offset_t)(uintptr_t)kbuffer;
1163 		useroff = useroff + (offset_t)current_offset;
1164 		cvdr.vdr_data = (void *)(uintptr_t)useroff;
1165 		child_buf = md_bioclone(parent_buf, current_offset,
1166 					child_buf->b_bcount, child_buf->b_edev,
1167 					child_buf->b_blkno, NULL,
1168 					child_buf, KM_NOSLEEP);
1169 		/* calculate new offset, counts, etc... */
1170 		current_offset += child_buf->b_bcount;
1171 		current_count -=  child_buf->b_bcount;
1172 		current_blkno +=  (sp_ext_offset_t)(btodb(child_buf->b_bcount));
1173 
1174 		if (more) {
1175 			mutex_enter(&ps->ps_mx);
1176 			ps->ps_frags++;
1177 			mutex_exit(&ps->ps_mx);
1178 		}
1179 
1180 		ret = md_call_ioctl(child_buf->b_edev, DKIOCDMR, &cvdr,
1181 		    (mode | FKIOCTL), NULL);
1182 
1183 		/*
1184 		 * Free the child structure as we've finished with it.
1185 		 * Normally this would be done by sp_done() but we're just
1186 		 * using md_bioclone() to segment the transfer and we never
1187 		 * issue a strategy request so the iodone will not be called.
1188 		 */
1189 		kmem_cache_free(sp_child_cache, cs);
1190 		if (ret == 0) {
1191 			/* copyout the returned data to vdr_data + offset */
1192 			userbuf = (caddr_t)kbuffer;
1193 			userbuf += (caddr_t)(cvdr.vdr_data) - (caddr_t)kbuffer;
1194 			if (ddi_copyout(userbuf, vdr->vdr_data,
1195 			    cvdr.vdr_bytesread, mode)) {
1196 				ret = EFAULT;
1197 				goto err_out;
1198 			}
1199 			vdr->vdr_bytesread += cvdr.vdr_bytesread;
1200 		} else {
1201 			goto err_out;
1202 		}
1203 	} while (more);
1204 
1205 	/*
1206 	 * Update the user-supplied vol_directed_rd_t structure with the
1207 	 * contents of the last issued child request.
1208 	 */
1209 	vdr->vdr_flags = cvdr.vdr_flags;
1210 	vdr->vdr_side = cvdr.vdr_side;
1211 	bcopy(cvdr.vdr_side_name, vdr->vdr_side_name, VOL_SIDENAME);
1212 
1213 err_out:
1214 	if (ret != 0) {
1215 		vdr->vdr_flags |= DKV_DMR_ERROR;
1216 	}
1217 	if (vdr->vdr_bytesread != vdr->vdr_nbytes) {
1218 		vdr->vdr_flags |= DKV_DMR_SHORT;
1219 	}
1220 	kmem_cache_free(sp_parent_cache, ps);
1221 	kmem_free(kbuffer, vdr->vdr_nbytes);
1222 	freerbuf(parent_buf);
1223 	md_unit_readerexit(ui);
1224 	return (ret);
1225 }
1226 
1227 /*
1228  * FUNCTION:	sp_snarf()
1229  * INPUT:	cmd	- snarf cmd.
1230  *		setno	- set number.
1231  * OUTPUT:	none.
1232  * RETURNS:	1	- soft partitions were snarfed.
1233  *		0	- no soft partitions were snarfed.
1234  * PURPOSE:	Snarf soft partition metadb records into their in-core
1235  *		structures.  This routine is called at "snarf time" when
1236  *		md loads and gets all metadevices records into memory.
1237  *		The basic algorithm is simply to walk the soft partition
1238  *		records in the metadb and call the soft partitioning
1239  *		build_incore routine to set up the in-core structures.
1240  */
1241 static int
1242 sp_snarf(md_snarfcmd_t cmd, set_t setno)
1243 {
1244 	mp_unit_t	*un;
1245 	mddb_recid_t	recid;
1246 	int		gotsomething;
1247 	int		all_sp_gotten;
1248 	mddb_type_t	rec_type;
1249 	mddb_de_ic_t	*dep;
1250 	mddb_rb32_t	*rbp;
1251 	mp_unit_t	*big_un;
1252 	mp_unit32_od_t	*small_un;
1253 	size_t		newreqsize;
1254 
1255 
1256 	if (cmd == MD_SNARF_CLEANUP)
1257 		return (0);
1258 
1259 	all_sp_gotten = 1;
1260 	gotsomething = 0;
1261 
1262 	/* get the record type */
1263 	rec_type = (mddb_type_t)md_getshared_key(setno,
1264 	    sp_md_ops.md_driver.md_drivername);
1265 	recid = mddb_makerecid(setno, 0);
1266 
1267 	/*
1268 	 * walk soft partition records in the metadb and call
1269 	 * sp_build_incore to build in-core structures.
1270 	 */
1271 	while ((recid = mddb_getnextrec(recid, rec_type, 0)) > 0) {
1272 		/* if we've already gotten this record, go to the next one */
1273 		if (mddb_getrecprivate(recid) & MD_PRV_GOTIT)
1274 			continue;
1275 
1276 
1277 		dep = mddb_getrecdep(recid);
1278 		dep->de_flags = MDDB_F_SOFTPART;
1279 		rbp = dep->de_rb;
1280 
1281 		if ((rbp->rb_revision == MDDB_REV_RB) &&
1282 		    ((rbp->rb_private & MD_PRV_CONVD) == 0)) {
1283 			/*
1284 			 * This means, we have an old and small record.
1285 			 * And this record hasn't already been converted :-o
1286 			 * before we create an incore metadevice from this
1287 			 * we have to convert it to a big record.
1288 			 */
1289 			small_un = (mp_unit32_od_t *)mddb_getrecaddr(recid);
1290 			newreqsize = sizeof (mp_unit_t) +
1291 					((small_un->un_numexts - 1) *
1292 					sizeof (struct mp_ext));
1293 			big_un = (mp_unit_t *)kmem_zalloc(newreqsize, KM_SLEEP);
1294 			softpart_convert((caddr_t)small_un, (caddr_t)big_un,
1295 			    SMALL_2_BIG);
1296 			kmem_free(small_un, dep->de_reqsize);
1297 			dep->de_rb_userdata = big_un;
1298 			dep->de_reqsize = newreqsize;
1299 			rbp->rb_private |= MD_PRV_CONVD;
1300 			un = big_un;
1301 		} else {
1302 			/* Large device */
1303 			un = (mp_unit_t *)mddb_getrecaddr(recid);
1304 		}
1305 
1306 		/* Set revision and flag accordingly */
1307 		if (rbp->rb_revision == MDDB_REV_RB) {
1308 			un->c.un_revision = MD_32BIT_META_DEV;
1309 		} else {
1310 			un->c.un_revision = MD_64BIT_META_DEV;
1311 			un->c.un_flag |= MD_EFILABEL;
1312 		}
1313 
1314 		/*
1315 		 * Create minor node for snarfed entry.
1316 		 */
1317 		(void) md_create_minor_node(MD_MIN2SET(MD_SID(un)), MD_SID(un));
1318 
1319 		if (MD_UNIT(MD_SID(un)) != NULL) {
1320 			/* unit is already in-core */
1321 			mddb_setrecprivate(recid, MD_PRV_PENDDEL);
1322 			continue;
1323 		}
1324 		all_sp_gotten = 0;
1325 		if (sp_build_incore((void *)un, 1) == 0) {
1326 			mddb_setrecprivate(recid, MD_PRV_GOTIT);
1327 			md_create_unit_incore(MD_SID(un), &sp_md_ops, 0);
1328 			gotsomething = 1;
1329 		}
1330 	}
1331 
1332 	if (!all_sp_gotten)
1333 		return (gotsomething);
1334 	/* double-check records */
1335 	recid = mddb_makerecid(setno, 0);
1336 	while ((recid = mddb_getnextrec(recid, rec_type, 0)) > 0)
1337 		if (!(mddb_getrecprivate(recid) & MD_PRV_GOTIT))
1338 			mddb_setrecprivate(recid, MD_PRV_PENDDEL);
1339 
1340 	return (0);
1341 }
1342 
1343 /*
1344  * FUNCTION:	sp_halt()
1345  * INPUT:	cmd	- halt cmd.
1346  *		setno	- set number.
1347  * RETURNS:	0	- success.
1348  *		1	- err.
1349  * PURPOSE:	Perform driver halt operations.  As with stripe, we
1350  *		support MD_HALT_CHECK and MD_HALT_DOIT.  The first
1351  *		does a check to see if halting can be done safely
1352  *		(no open soft partitions), the second cleans up and
1353  *		shuts down the driver.
1354  */
1355 static int
1356 sp_halt(md_haltcmd_t cmd, set_t setno)
1357 {
1358 	int		i;
1359 	mdi_unit_t	*ui;
1360 	minor_t		mnum;
1361 
1362 	if (cmd == MD_HALT_CLOSE)
1363 		return (0);
1364 
1365 	if (cmd == MD_HALT_OPEN)
1366 		return (0);
1367 
1368 	if (cmd == MD_HALT_UNLOAD)
1369 		return (0);
1370 
1371 	if (cmd == MD_HALT_CHECK) {
1372 		for (i = 0; i < md_nunits; i++) {
1373 			mnum = MD_MKMIN(setno, i);
1374 			if ((ui = MDI_UNIT(mnum)) == NULL)
1375 				continue;
1376 			if (ui->ui_opsindex != sp_md_ops.md_selfindex)
1377 				continue;
1378 			if (md_unit_isopen(ui))
1379 				return (1);
1380 		}
1381 		return (0);
1382 	}
1383 
1384 	if (cmd != MD_HALT_DOIT)
1385 		return (1);
1386 
1387 	for (i = 0; i < md_nunits; i++) {
1388 		mnum = MD_MKMIN(setno, i);
1389 		if ((ui = MDI_UNIT(mnum)) == NULL)
1390 			continue;
1391 		if (ui->ui_opsindex != sp_md_ops.md_selfindex)
1392 			continue;
1393 		reset_sp((mp_unit_t *)MD_UNIT(mnum), mnum, 0);
1394 	}
1395 
1396 	return (0);
1397 }
1398 
1399 /*
1400  * FUNCTION:	sp_open_dev()
1401  * INPUT:	un	- unit structure.
1402  *		oflags	- open flags.
1403  * OUTPUT:	none.
1404  * RETURNS:	0		- success.
1405  *		non-zero	- err.
1406  * PURPOSE:	open underlying device via md_layered_open.
1407  */
1408 static int
1409 sp_open_dev(mp_unit_t *un, int oflags)
1410 {
1411 	minor_t		mnum = MD_SID(un);
1412 	int		err;
1413 	md_dev64_t	tmpdev;
1414 	set_t		setno = MD_MIN2SET(MD_SID(un));
1415 	side_t		side = mddb_getsidenum(setno);
1416 
1417 	tmpdev = un->un_dev;
1418 	/*
1419 	 * Do the open by device id if underlying is regular
1420 	 */
1421 	if ((md_getmajor(tmpdev) != md_major) &&
1422 		md_devid_found(setno, side, un->un_key) == 1) {
1423 		tmpdev = md_resolve_bydevid(mnum, tmpdev, un->un_key);
1424 	}
1425 	err = md_layered_open(mnum, &tmpdev, oflags);
1426 	un->un_dev = tmpdev;
1427 
1428 	if (err)
1429 		return (ENXIO);
1430 
1431 	return (0);
1432 }
1433 
1434 /*
1435  * FUNCTION:	sp_open()
1436  * INPUT:	dev		- device to open.
1437  *		flag		- pass-through flag.
1438  *		otyp		- pass-through open type.
1439  *		cred_p		- credentials.
1440  *		md_oflags	- open flags.
1441  * OUTPUT:	none.
1442  * RETURNS:	0		- success.
1443  *		non-zero	- err.
1444  * PURPOSE:	open a soft partition.
1445  */
1446 /* ARGSUSED */
1447 static int
1448 sp_open(
1449 	dev_t		*dev,
1450 	int		flag,
1451 	int		otyp,
1452 	cred_t		*cred_p,
1453 	int		md_oflags
1454 )
1455 {
1456 	minor_t		mnum = getminor(*dev);
1457 	mdi_unit_t	*ui = MDI_UNIT(mnum);
1458 	mp_unit_t	*un;
1459 	int		err = 0;
1460 	set_t		setno;
1461 
1462 	/*
1463 	 * When doing an open of a multi owner metadevice, check to see if this
1464 	 * node is a starting node and if a reconfig cycle is underway.
1465 	 * If so, the system isn't sufficiently set up enough to handle the
1466 	 * open (which involves I/O during sp_validate), so fail with ENXIO.
1467 	 */
1468 	setno = MD_MIN2SET(mnum);
1469 	if ((md_set[setno].s_status & (MD_SET_MNSET | MD_SET_MN_START_RC)) ==
1470 	    (MD_SET_MNSET | MD_SET_MN_START_RC)) {
1471 			return (ENXIO);
1472 	}
1473 
1474 	/* grab necessary locks */
1475 	un = (mp_unit_t *)md_unit_openclose_enter(ui);
1476 	setno = MD_UN2SET(un);
1477 
1478 	/* open underlying device, if necessary */
1479 	if (! md_unit_isopen(ui) || (md_oflags & MD_OFLG_PROBEDEV)) {
1480 		if ((err = sp_open_dev(un, md_oflags)) != 0)
1481 			goto out;
1482 
1483 		if (MD_MNSET_SETNO(setno)) {
1484 			/* For probe, don't incur the overhead of validate */
1485 			if (!(md_oflags & MD_OFLG_PROBEDEV)) {
1486 				/*
1487 				 * Don't call sp_validate while
1488 				 * unit_openclose lock is held.  So, actually
1489 				 * open the device, drop openclose lock,
1490 				 * call sp_validate, reacquire openclose lock,
1491 				 * and close the device.  If sp_validate
1492 				 * succeeds, then device will be re-opened.
1493 				 */
1494 				if ((err = md_unit_incopen(mnum, flag,
1495 				    otyp)) != 0)
1496 					goto out;
1497 
1498 				mutex_enter(&ui->ui_mx);
1499 				ui->ui_lock |= MD_UL_OPENINPROGRESS;
1500 				mutex_exit(&ui->ui_mx);
1501 				md_unit_openclose_exit(ui);
1502 				if (otyp != OTYP_LYR)
1503 					rw_exit(&md_unit_array_rw.lock);
1504 
1505 				err = sp_validate(un);
1506 
1507 				if (otyp != OTYP_LYR)
1508 					rw_enter(&md_unit_array_rw.lock,
1509 					    RW_READER);
1510 				(void) md_unit_openclose_enter(ui);
1511 				(void) md_unit_decopen(mnum, otyp);
1512 				mutex_enter(&ui->ui_mx);
1513 				ui->ui_lock &= ~MD_UL_OPENINPROGRESS;
1514 				cv_broadcast(&ui->ui_cv);
1515 				mutex_exit(&ui->ui_mx);
1516 				/*
1517 				 * Should be in the same state as before
1518 				 * the sp_validate.
1519 				 */
1520 				if (err != 0) {
1521 					/* close the device opened above */
1522 					md_layered_close(un->un_dev, md_oflags);
1523 					err = EIO;
1524 					goto out;
1525 				}
1526 			}
1527 			/*
1528 			 * As we're a multi-owner metadevice we need to ensure
1529 			 * that all nodes have the same idea of the status.
1530 			 * sp_validate() will mark the device as errored (if
1531 			 * it cannot read the watermark) or ok (if it was
1532 			 * previously errored but the watermark is now valid).
1533 			 * This code-path is only entered on the non-probe open
1534 			 * so we will maintain the errored state during a probe
1535 			 * call. This means the sys-admin must metarecover -m
1536 			 * to reset the soft-partition error.
1537 			 */
1538 		} else {
1539 			/* For probe, don't incur the overhead of validate */
1540 			if (!(md_oflags & MD_OFLG_PROBEDEV) &&
1541 			    (err = sp_validate(un)) != 0) {
1542 				/* close the device opened above */
1543 				md_layered_close(un->un_dev, md_oflags);
1544 				err = EIO;
1545 				goto out;
1546 			} else {
1547 				/*
1548 				 * we succeeded in validating the on disk
1549 				 * format versus the in core, so reset the
1550 				 * status if it's in error
1551 				 */
1552 				if (un->un_status == MD_SP_ERR) {
1553 					un->un_status = MD_SP_OK;
1554 				}
1555 			}
1556 		}
1557 	}
1558 
1559 	/* count open */
1560 	if ((err = md_unit_incopen(mnum, flag, otyp)) != 0)
1561 		goto out;
1562 
1563 out:
1564 	md_unit_openclose_exit(ui);
1565 	return (err);
1566 }
1567 
1568 /*
1569  * FUNCTION:	sp_close()
1570  * INPUT:	dev		- device to close.
1571  *		flag		- pass-through flag.
1572  *		otyp		- pass-through type.
1573  *		cred_p		- credentials.
1574  *		md_cflags	- close flags.
1575  * OUTPUT:	none.
1576  * RETURNS:	0		- success.
1577  *		non-zero	- err.
1578  * PURPOSE:	close a soft paritition.
1579  */
1580 /* ARGSUSED */
1581 static int
1582 sp_close(
1583 	dev_t		dev,
1584 	int		flag,
1585 	int		otyp,
1586 	cred_t		*cred_p,
1587 	int		md_cflags
1588 )
1589 {
1590 	minor_t		mnum = getminor(dev);
1591 	mdi_unit_t	*ui = MDI_UNIT(mnum);
1592 	mp_unit_t	*un;
1593 	int		err = 0;
1594 
1595 	/* grab necessary locks */
1596 	un = (mp_unit_t *)md_unit_openclose_enter(ui);
1597 
1598 	/* count closed */
1599 	if ((err = md_unit_decopen(mnum, otyp)) != 0)
1600 		goto out;
1601 
1602 	/* close devices, if necessary */
1603 	if (! md_unit_isopen(ui) || (md_cflags & MD_OFLG_PROBEDEV)) {
1604 		md_layered_close(un->un_dev, md_cflags);
1605 	}
1606 
1607 	/*
1608 	 * If a MN set and transient capabilities (eg ABR/DMR) are set,
1609 	 * clear these capabilities if this is the last close in
1610 	 * the cluster
1611 	 */
1612 	if (MD_MNSET_SETNO(MD_UN2SET(un)) &&
1613 	    (ui->ui_tstate & MD_ABR_CAP)) {
1614 		md_unit_openclose_exit(ui);
1615 		mdmn_clear_all_capabilities(mnum);
1616 		return (0);
1617 	}
1618 	/* unlock, return success */
1619 out:
1620 	md_unit_openclose_exit(ui);
1621 	return (err);
1622 }
1623 
1624 
1625 /* used in sp_dump routine */
1626 static struct buf dumpbuf;
1627 
1628 /*
1629  * FUNCTION:	sp_dump()
1630  * INPUT:	dev	- device to dump to.
1631  *		addr	- address to dump.
1632  *		blkno	- blkno on device.
1633  *		nblk	- number of blocks to dump.
1634  * OUTPUT:	none.
1635  * RETURNS:	result from bdev_dump.
1636  * PURPOSE:  This routine dumps memory to the disk.  It assumes that
1637  *           the memory has already been mapped into mainbus space.
1638  *           It is called at disk interrupt priority when the system
1639  *           is in trouble.
1640  *           NOTE: this function is defined using 32-bit arguments,
1641  *           but soft partitioning is internally 64-bit.  Arguments
1642  *           are casted where appropriate.
1643  */
1644 static int
1645 sp_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk)
1646 {
1647 	mp_unit_t	*un;
1648 	buf_t		*bp;
1649 	sp_ext_length_t	nb;
1650 	daddr_t		mapblk;
1651 	int		result;
1652 	int		more;
1653 	int		saveresult = 0;
1654 
1655 	/*
1656 	 * Don't need to grab the unit lock.
1657 	 * Cause nothing else is supposed to be happenning.
1658 	 * Also dump is not supposed to sleep.
1659 	 */
1660 	un = (mp_unit_t *)MD_UNIT(getminor(dev));
1661 
1662 	if ((diskaddr_t)blkno >= un->c.un_total_blocks)
1663 		return (EINVAL);
1664 
1665 	if (((diskaddr_t)blkno + nblk) > un->c.un_total_blocks)
1666 		return (EINVAL);
1667 
1668 	bp = &dumpbuf;
1669 	nb = (sp_ext_length_t)dbtob(nblk);
1670 	do {
1671 		bzero((caddr_t)bp, sizeof (*bp));
1672 		more = sp_mapbuf(un, (sp_ext_offset_t)blkno, nb, bp);
1673 		nblk = (int)(btodb(bp->b_bcount));
1674 		mapblk = bp->b_blkno;
1675 		result = bdev_dump(bp->b_edev, addr, mapblk, nblk);
1676 		if (result)
1677 			saveresult = result;
1678 
1679 		nb -= bp->b_bcount;
1680 		addr += bp->b_bcount;
1681 		blkno += nblk;
1682 	} while (more);
1683 
1684 	return (saveresult);
1685 }
1686 
1687 static int
1688 sp_imp_set(
1689 	set_t	setno
1690 )
1691 {
1692 	mddb_recid_t	recid;
1693 	int		gotsomething;
1694 	mddb_type_t	rec_type;
1695 	mddb_de_ic_t	*dep;
1696 	mddb_rb32_t	*rbp;
1697 	mp_unit_t	*un64;
1698 	mp_unit32_od_t	*un32;
1699 	minor_t		*self_id;	/* minor needs to be updated */
1700 	md_parent_t	*parent_id;	/* parent needs to be updated */
1701 	mddb_recid_t	*record_id;	/* record id needs to be updated */
1702 
1703 	gotsomething = 0;
1704 
1705 	rec_type = (mddb_type_t)md_getshared_key(setno,
1706 		sp_md_ops.md_driver.md_drivername);
1707 	recid = mddb_makerecid(setno, 0);
1708 
1709 	while ((recid = mddb_getnextrec(recid, rec_type, 0)) > 0) {
1710 		if (mddb_getrecprivate(recid) & MD_PRV_GOTIT)
1711 			continue;
1712 
1713 		dep = mddb_getrecdep(recid);
1714 		rbp = dep->de_rb;
1715 
1716 		if (rbp->rb_revision == MDDB_REV_RB) {
1717 			/*
1718 			 * Small device
1719 			 */
1720 			un32 = (mp_unit32_od_t *)mddb_getrecaddr(recid);
1721 			self_id = &(un32->c.un_self_id);
1722 			parent_id = &(un32->c.un_parent);
1723 			record_id = &(un32->c.un_record_id);
1724 
1725 			if (!md_update_minor(setno, mddb_getsidenum
1726 				(setno), un32->un_key))
1727 				goto out;
1728 		} else {
1729 			un64 = (mp_unit_t *)mddb_getrecaddr(recid);
1730 			self_id = &(un64->c.un_self_id);
1731 			parent_id = &(un64->c.un_parent);
1732 			record_id = &(un64->c.un_record_id);
1733 
1734 			if (!md_update_minor(setno, mddb_getsidenum
1735 				(setno), un64->un_key))
1736 				goto out;
1737 		}
1738 
1739 		/*
1740 		 * Update unit with the imported setno
1741 		 *
1742 		 */
1743 		mddb_setrecprivate(recid, MD_PRV_GOTIT);
1744 
1745 		*self_id = MD_MKMIN(setno, MD_MIN2UNIT(*self_id));
1746 		if (*parent_id != MD_NO_PARENT)
1747 			*parent_id = MD_MKMIN(setno, MD_MIN2UNIT(*parent_id));
1748 		*record_id = MAKERECID(setno, DBID(*record_id));
1749 
1750 		gotsomething = 1;
1751 	}
1752 
1753 out:
1754 	return (gotsomething);
1755 }
1756 
/*
 * Named services table referenced by sp_md_ops below.  It holds only a
 * {NULL, 0} entry, so no services are listed here.
 */
static md_named_services_t sp_named_services[] = {
	{NULL,					0}
};

/*
 * Operations vector for the soft partitioning driver (see mdvar.h for the
 * definition of md_ops_t).  Slots set to NULL have no handler supplied by
 * this file.
 */
md_ops_t sp_md_ops = {
	sp_open,		/* open */
	sp_close,		/* close */
	md_sp_strategy,		/* strategy */
	NULL,			/* print */
	sp_dump,		/* dump */
	NULL,			/* read */
	NULL,			/* write */
	md_sp_ioctl,		/* ioctl, */
	sp_snarf,		/* snarf */
	sp_halt,		/* halt */
	NULL,			/* aread */
	NULL,			/* awrite */
	sp_imp_set,		/* import set */
	sp_named_services	/* named services */
};
1777 
1778 static void
1779 init_init()
1780 {
1781 	sp_parent_cache = kmem_cache_create("md_softpart_parent",
1782 	    sizeof (md_spps_t), 0, sp_parent_constructor,
1783 	    sp_parent_destructor, sp_run_queue, NULL, NULL, 0);
1784 	sp_child_cache = kmem_cache_create("md_softpart_child",
1785 	    sizeof (md_spcs_t) - sizeof (buf_t) + biosize(), 0,
1786 	    sp_child_constructor, sp_child_destructor, sp_run_queue,
1787 	    NULL, NULL, 0);
1788 }
1789 
1790 static void
1791 fini_uninit()
1792 {
1793 	kmem_cache_destroy(sp_parent_cache);
1794 	kmem_cache_destroy(sp_child_cache);
1795 	sp_parent_cache = sp_child_cache = NULL;
1796 }
1797 
/*
 * Define the module linkage.  init_init() and fini_uninit() are handed to
 * MD_PLUGIN_MISC_MODULE -- presumably as the load-time and unload-time
 * hooks; confirm against the macro definition.
 */
MD_PLUGIN_MISC_MODULE("soft partition module %I%", init_init(), fini_uninit())
1800