/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Soft partitioning metadevice driver (md_sp).
 *
 * This file contains the primary operations of the soft partitioning
 * metadevice driver.  This includes all routines for normal operation
 * (open/close/read/write).  Please see mdvar.h for a definition of the
 * metadevice operations vector (md_ops_t).  This driver is loosely
 * based on the stripe driver (md_stripe).
 *
 * All metadevice administration is done through the use of ioctl's.
 * As such, all administrative routines appear in sp_ioctl.c.
 *
 * Soft partitions are represented both in-core and in the metadb with a
 * unit structure.  The soft partition-specific information in the unit
 * structure includes the following information:
 *	- Device information (md_dev64_t & md key) about the device on which
 *	  the soft partition is built.
 *	- Soft partition status information.
 *	- The size of the soft partition and number of extents used to
 *	  make up that size.
 *	- An array of extents which define virtual/physical offset
 *	  mappings and lengths for each extent.
 *
 * Typical soft partition operation proceeds as follows:
 *	- The unit structure is fetched from the metadb and placed into
 *	  an in-core array (as with other metadevices).  This operation
 *	  is performed via sp_build_incore( ) and takes place during
 *	  "snarfing" (when all metadevices are brought in-core at
 *	  once) and when a new soft partition is created.
 *	- A soft partition is opened via sp_open( ).  At open time the
 *	  soft partition unit structure is verified against the soft
 *	  partition on-disk structures.  Additionally, the soft partition
 *	  status is checked (only soft partitions in the OK state may be
 *	  opened).
 *	- Soft partition I/O is performed via sp_strategy( ) which relies on
 *	  a support routine, sp_mapbuf( ), to do most of the work.
 *	  sp_mapbuf( ) maps a buffer to a particular extent via a binary
 *	  search of the extent array in the soft partition unit structure.
 *	  Once a translation has been performed, the I/O is passed down
 *	  to the next layer, which may be another metadevice or a physical
 *	  disk.  Since a soft partition may contain multiple, non-contiguous
 *	  extents, a single I/O may have to be fragmented.
 *	- Soft partitions are closed using sp_close( ).
 *
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/conf.h>
#include <sys/file.h>
#include <sys/user.h>
#include <sys/uio.h>
#include <sys/t_lock.h>
#include <sys/buf.h>
#include <sys/dkio.h>
#include <sys/vtoc.h>
#include <sys/kmem.h>
#include <vm/page.h>
#include <sys/cmn_err.h>
#include <sys/sysmacros.h>
#include <sys/types.h>
#include <sys/mkdev.h>
#include <sys/stat.h>
#include <sys/open.h>
#include <sys/lvm/mdvar.h>
#include <sys/lvm/md_sp.h>
#include <sys/lvm/md_convert.h>
#include <sys/lvm/md_notify.h>
#include <sys/lvm/md_crc.h>
#include <sys/modctl.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/debug.h>

#include <sys/sysevent/eventdefs.h>
#include <sys/sysevent/svm.h>

md_ops_t		sp_md_ops;
#ifndef	lint
char			_depends_on[] = "drv/md";
md_ops_t		*md_interface_ops = &sp_md_ops;
#endif

extern unit_t		md_nunits;
extern set_t		md_nsets;
extern md_set_t		md_set[];

extern int		md_status;
extern major_t		md_major;
extern mdq_anchor_t	md_done_daemon;
extern mdq_anchor_t	md_sp_daemon;
extern kmutex_t		md_mx;
extern kcondvar_t	md_cv;
extern md_krwlock_t	md_unit_array_rw;

static kmem_cache_t	*sp_parent_cache = NULL;
static kmem_cache_t	*sp_child_cache = NULL;
static void		sp_send_stat_ok(mp_unit_t *);
static void		sp_send_stat_err(mp_unit_t *);

/*
 * FUNCTION:	sp_parent_constructor()
 * INPUT:	p	- memory for a parent save structure.
 *		d1, d2	- unused constructor arguments.
 * OUTPUT:	p	- parent save structure with its mutex initialized.
 * RETURNS:	int	- 0 (always).
 * PURPOSE:	one-time initialization of a parent save structure.
 */
/*ARGSUSED1*/
static int
sp_parent_constructor(void *p, void *d1, int d2)
{
	mutex_init(&((md_spps_t *)p)->ps_mx,
	    NULL, MUTEX_DEFAULT, NULL);
	return (0);
}

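/*
 * The constructor/init split follows the usual kmem cache pattern: the
 * constructor runs once when a cached object is first created, while the
 * init routine below runs on every allocation and zeroes only the
 * per-request fields (everything up to, but not including, ps_mx), so the
 * mutex initialized by the constructor survives reuse of the object.
 */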
static void
sp_parent_init(md_spps_t *ps)
{
	bzero(ps, offsetof(md_spps_t, ps_mx));
}

/*ARGSUSED1*/
static void
sp_parent_destructor(void *p, void *d)
{
	mutex_destroy(&((md_spps_t *)p)->ps_mx);
}

/*
 * FUNCTION:	sp_child_constructor()
 * INPUT:	p	- memory for a child save structure.
 *		d1, d2	- unused constructor arguments.
 * OUTPUT:	p	- child save structure with its buf initialized.
 * RETURNS:	int	- 0 (always).
 * PURPOSE:	one-time initialization of a child save structure.
 */
/*ARGSUSED1*/
static int
sp_child_constructor(void *p, void *d1, int d2)
{
	bioinit(&((md_spcs_t *)p)->cs_buf);
	return (0);
}

static void
sp_child_init(md_spcs_t *cs)
{
	cs->cs_mdunit = 0;
	cs->cs_ps = NULL;
	md_bioreset(&cs->cs_buf);
}

/*ARGSUSED1*/
static void
sp_child_destructor(void *p, void *d)
{
	biofini(&((md_spcs_t *)p)->cs_buf);
}

/*
 * FUNCTION:	sp_run_queue()
 * INPUT:	none.
 * OUTPUT:	none.
 * RETURNS:	void.
 * PURPOSE:	run the md_daemon to clean up memory pool.
 */
/*ARGSUSED*/
static void
sp_run_queue(void *d)
{
	if (!(md_status & MD_GBL_DAEMONS_LIVE))
		md_daemon(1, &md_done_daemon);
}


/*
 * FUNCTION:	sp_build_incore()
 * INPUT:	p		- ptr to unit structure.
 *		snarfing	- flag to tell us we are snarfing.
 * OUTPUT:	none.
 * RETURNS:	int	- 0 (always).
 * PURPOSE:	place unit structure into in-core unit array (keyed from
 *		minor number).
 */
int
sp_build_incore(void *p, int snarfing)
{
	mp_unit_t	*un = (mp_unit_t *)p;
	minor_t		mnum;
	set_t		setno;
	md_dev64_t	tmpdev;

	mnum = MD_SID(un);

	if (MD_UNIT(mnum) != NULL)
		return (0);

	MD_STATUS(un) = 0;

	if (snarfing) {
		/*
		 * if we are snarfing, we get the device information
		 * from the metadb record (using the metadb key for
		 * that device).
		 */
		setno = MD_MIN2SET(mnum);

		tmpdev = md_getdevnum(setno, mddb_getsidenum(setno),
		    un->un_key, MD_NOTRUST_DEVT);
		un->un_dev = tmpdev;
	}

	/* place various information in the in-core data structures */
	md_nblocks_set(mnum, un->c.un_total_blocks);
	MD_UNIT(mnum) = un;

	return (0);
}

/*
 * FUNCTION:	reset_sp()
 * INPUT:	un		- unit structure to be reset/removed.
 *		mnum		- minor number to be reset/removed.
 *		removing	- flag to tell us if we are removing
 *				  permanently or just resetting in-core
 *				  structures.
 * OUTPUT:	none.
 * RETURNS:	void.
 * PURPOSE:	used to either simply reset in-core structures or to
 *		permanently remove metadevices from the metadb.
 */
void
reset_sp(mp_unit_t *un, minor_t mnum, int removing)
{
	sv_dev_t	*sv;
	mddb_recid_t	vtoc_id;

	/* clean up in-core structures */
	md_destroy_unit_incore(mnum, &sp_md_ops);

	md_nblocks_set(mnum, -1ULL);
	MD_UNIT(mnum) = NULL;

	/*
	 * Attempt release of minor node
	 */
	md_remove_minor_node(mnum);

	if (!removing)
		return;

	/* we are removing the soft partition from the metadb */

	/*
	 * Save off device information so we can get to
	 * it after we do the mddb_deleterec().
	 */
	sv = (sv_dev_t *)kmem_alloc(sizeof (sv_dev_t), KM_SLEEP);
	sv->setno = MD_MIN2SET(mnum);
	sv->key = un->un_key;
	vtoc_id = un->c.un_vtoc_id;

	/*
	 * Remove self from the namespace
	 */
	if (un->c.un_revision & MD_FN_META_DEV) {
		(void) md_rem_selfname(un->c.un_self_id);
	}

	/* Remove the unit structure */
	mddb_deleterec_wrapper(un->c.un_record_id);

	if (vtoc_id)
		mddb_deleterec_wrapper(vtoc_id);

	SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_DELETE, TAG_METADEVICE,
	    MD_MIN2SET(mnum), MD_MIN2UNIT(mnum));

	/*
	 * remove the underlying device name from the metadb.  if other
	 * soft partitions are built on this device, this will simply
	 * decrease the reference count for this device.  otherwise the
	 * name record for this device will be removed from the metadb.
	 */
	md_rem_names(sv, 1);
	kmem_free(sv, sizeof (sv_dev_t));
}

/*
 * FUNCTION:	sp_send_stat_msg
 * INPUT:	un	- unit reference
 *		status	- status to be sent to master node
 *			  MD_SP_OK  - soft-partition is now OK
 *			  MD_SP_ERR - soft-partition is now errored
 * OUTPUT:	none.
 * RETURNS:	void.
 * PURPOSE:	send a soft-partition status change to the master node. If the
 *		message succeeds we simply return. If it fails we panic as the
 *		cluster-wide view of the metadevices is now inconsistent.
 * CALLING CONTEXT:
 *	Blockable. No locks can be held.
 */
static void
sp_send_stat_msg(mp_unit_t *un, sp_status_t status)
{
	md_mn_msg_sp_setstat_t	sp_msg;
	md_mn_kresult_t	*kres;
	set_t		setno = MD_UN2SET(un);
	int		rval;
	const char	*str = (status == MD_SP_ERR) ? "MD_SP_ERR" : "MD_SP_OK";

	sp_msg.sp_setstat_mnum = MD_SID(un);
	sp_msg.sp_setstat_status = status;

	kres = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);

	rval = mdmn_ksend_message(setno, MD_MN_MSG_SP_SETSTAT2, MD_MSGF_NO_LOG,
	    (char *)&sp_msg, sizeof (sp_msg), kres);

	if (!MDMN_KSEND_MSG_OK(rval, kres)) {
		mdmn_ksend_show_error(rval, kres, "MD_MN_MSG_SP_SETSTAT2");

		/*
		 * Panic as we are now in an inconsistent state.
		 */

		cmn_err(CE_PANIC, "md: %s: %s could not be set on all nodes\n",
		    md_shortname(MD_SID(un)), str);
	}

	kmem_free(kres, sizeof (md_mn_kresult_t));
}

/*
 * FUNCTION:	sp_finish_error
 * INPUT:	ps		- parent save structure for errored I/O.
 *		lock_held	- set if the unit readerlock is held
 * OUTPUT:	none.
 * RETURNS:	void.
 * PURPOSE:	report a driver error
 */
static void
sp_finish_error(md_spps_t *ps, int lock_held)
{
	struct buf	*pb = ps->ps_bp;
	mdi_unit_t	*ui = ps->ps_ui;
	md_dev64_t	un_dev;			/* underlying device */
	md_dev64_t	md_dev = md_expldev(pb->b_edev); /* metadev in error */
	char		*str;

	un_dev = md_expldev(ps->ps_un->un_dev);
	/* set error type */
	if (pb->b_flags & B_READ) {
		str = "read";
	} else {
		str = "write";
	}

	SPPS_FREE(sp_parent_cache, ps);
	pb->b_flags |= B_ERROR;

	md_kstat_done(ui, pb, 0);

	if (lock_held) {
		md_unit_readerexit(ui);
	}
	md_biodone(pb);

	cmn_err(CE_WARN, "md: %s: %s error on %s",
	    md_shortname(md_getminor(md_dev)), str,
	    md_devname(MD_DEV2SET(md_dev), un_dev, NULL, 0));
}


/*
 * FUNCTION:	sp_xmit_ok
 * INPUT:	dq	- daemon queue referencing failing ps structure
 * OUTPUT:	none.
 * RETURNS:	void.
 * PURPOSE:	send a message to the master node in a multi-owner diskset to
 *		update all attached nodes view of the soft-part to be MD_SP_OK.
 * CALLING CONTEXT:
 *	Blockable. No unit lock held.
 */
static void
sp_xmit_ok(daemon_queue_t *dq)
{
	md_spps_t	*ps = (md_spps_t *)dq;

	/* Send a MD_MN_MSG_SP_SETSTAT2 to the master */
	sp_send_stat_msg(ps->ps_un, MD_SP_OK);

	/*
	 * Successfully transmitted the OK state to all nodes, now release
	 * this parent structure.
	 */
	SPPS_FREE(sp_parent_cache, ps);
}

/*
 * FUNCTION:	sp_xmit_error
 * INPUT:	dq	- daemon queue referencing failing ps structure
 * OUTPUT:	none.
 * RETURNS:	void.
 * PURPOSE:	send a message to the master node in a multi-owner diskset to
 *		update all attached nodes view of the soft-part to be MD_SP_ERR.
 * CALLING CONTEXT:
 *	Blockable. No unit lock held.
 */
static void
sp_xmit_error(daemon_queue_t *dq)
{
	md_spps_t	*ps = (md_spps_t *)dq;

	/* Send a MD_MN_MSG_SP_SETSTAT2 to the master */
	sp_send_stat_msg(ps->ps_un, MD_SP_ERR);

	/*
	 * Successfully transmitted the error state to all nodes, now release
	 * this parent structure.
	 */
	SPPS_FREE(sp_parent_cache, ps);
}
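
/*
 * The two helpers below hand the actual status transmission off to
 * md_sp_daemon: the message send must run in a blockable context with no
 * unit locks held (see the CALLING CONTEXT notes above), so it is queued
 * to the daemon rather than issued in-line.
 */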
static void
sp_send_stat_ok(mp_unit_t *un)
{
	minor_t		mnum = MD_SID(un);
	md_spps_t	*ps;

	ps = kmem_cache_alloc(sp_parent_cache, MD_ALLOCFLAGS);
	sp_parent_init(ps);
	ps->ps_un = un;
	ps->ps_ui = MDI_UNIT(mnum);

	daemon_request(&md_sp_daemon, sp_xmit_ok, (daemon_queue_t *)ps,
	    REQ_OLD);
}

static void
sp_send_stat_err(mp_unit_t *un)
{
	minor_t		mnum = MD_SID(un);
	md_spps_t	*ps;

	ps = kmem_cache_alloc(sp_parent_cache, MD_ALLOCFLAGS);
	sp_parent_init(ps);
	ps->ps_un = un;
	ps->ps_ui = MDI_UNIT(mnum);

	daemon_request(&md_sp_daemon, sp_xmit_error, (daemon_queue_t *)ps,
	    REQ_OLD);
}


/*
 * FUNCTION:	sp_error()
 * INPUT:	ps	- parent save structure for errored I/O.
 * OUTPUT:	none.
 * RETURNS:	void.
 * PURPOSE:	report a driver error.
 * CALLING CONTEXT:
 *	Interrupt - non-blockable
 */
static void
sp_error(md_spps_t *ps)
{
	set_t		setno = MD_UN2SET(ps->ps_un);

	/*
	 * Drop the mutex associated with this request before (potentially)
	 * enqueuing the free onto a separate thread. We have to release the
	 * mutex before destroying the parent structure.
	 */
	if (!(ps->ps_flags & MD_SPPS_DONTFREE)) {
		if (MUTEX_HELD(&ps->ps_mx)) {
			mutex_exit(&ps->ps_mx);
		}
	} else {
		/*
		 * this should only ever happen if we are panicking,
		 * since DONTFREE is only set on the parent if panicstr
		 * is non-NULL.
		 */
		ASSERT(panicstr);
	}

	/*
	 * For a multi-owner set we need to send a message to the master so
	 * that all nodes get the errored status when we first encounter it.
	 * To avoid deadlocking when multiple soft-partitions encounter an
	 * error on one physical unit we drop the unit readerlock before
	 * enqueueing the request. That way we can service any messages that
	 * require a writerlock to be held. Additionally, to avoid deadlocking
	 * when at the bottom of a metadevice stack and a higher level mirror
	 * has multiple requests outstanding on this soft-part, we clone the
	 * ps that failed and pass the error back up the stack to release the
	 * reference that this i/o may have in the higher-level metadevice.
	 * The other nodes in the cluster just have to modify the soft-part
	 * status and we do not need to block the i/o completion for this.
	 */
	if (MD_MNSET_SETNO(setno)) {
		md_spps_t	*err_ps;
		err_ps = kmem_cache_alloc(sp_parent_cache, MD_ALLOCFLAGS);
		sp_parent_init(err_ps);

		err_ps->ps_un = ps->ps_un;
		err_ps->ps_ui = ps->ps_ui;

		md_unit_readerexit(ps->ps_ui);

		daemon_request(&md_sp_daemon, sp_xmit_error,
		    (daemon_queue_t *)err_ps, REQ_OLD);

		sp_finish_error(ps, 0);

		return;
	} else {
		ps->ps_un->un_status = MD_SP_ERR;
	}

	/* Flag the error */
	sp_finish_error(ps, 1);
}

/*
 * FUNCTION:	sp_mapbuf()
 * INPUT:	un	- unit structure for soft partition we are doing
 *			  I/O on.
 *		voff	- virtual offset in soft partition to map.
 *		bcount	- # of bytes in the I/O.
 * OUTPUT:	bp	- translated buffer to be passed down to next layer.
 * RETURNS:	1	- request must be fragmented, more work to do,
 *		0	- request satisfied, no more work to do
 *		-1	- error
 * PURPOSE:	Map the virtual offset in the soft partition (passed
 *		in via voff) to the "physical" offset on whatever the soft
 *		partition is built on top of.  We do this by doing a binary
 *		search of the extent array in the soft partition unit
 *		structure.  Once the current extent is found, we do the
 *		translation, determine if the I/O will cross extent
 *		boundaries (if so, we have to fragment the I/O), then
 *		fill in the buf structure to be passed down to the next layer.
 */
static int
sp_mapbuf(
	mp_unit_t	*un,
	sp_ext_offset_t	voff,
	sp_ext_length_t	bcount,
	buf_t		*bp
)
{
	int		lo, mid, hi, found, more;
	size_t		new_bcount;
	sp_ext_offset_t new_blkno;
	sp_ext_offset_t	new_offset;
	sp_ext_offset_t	ext_endblk;
	md_dev64_t	new_edev;
	extern unsigned	md_maxphys;

	found = 0;
	lo = 0;
	hi = un->un_numexts - 1;

	/*
	 * do a binary search to find the extent that contains the
	 * starting offset.  after this loop, mid contains the index
	 * of the correct extent.
	 */
	while (lo <= hi && !found) {
		mid = (lo + hi) / 2;
		/* is the starting offset contained within the mid-ext? */
		if (voff >= un->un_ext[mid].un_voff &&
		    voff < un->un_ext[mid].un_voff + un->un_ext[mid].un_len)
			found = 1;
		else if (voff < un->un_ext[mid].un_voff)
			hi = mid - 1;
		else /* voff >= un_ext[mid].un_voff + un_ext[mid].un_len */
			lo = mid + 1;
	}

	if (!found) {
		cmn_err(CE_WARN, "sp_mapbuf: invalid offset %llu.\n", voff);
		return (-1);
	}

	/* translate to underlying physical offset/device */
	new_offset = voff - un->un_ext[mid].un_voff;
	new_blkno = un->un_ext[mid].un_poff + new_offset;
	new_edev = un->un_dev;
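
	/*
	 * Illustration with hypothetical numbers: an extent with
	 * un_voff = 100, un_len = 50 and un_poff = 2000 maps virtual
	 * blocks [100, 150).  An I/O starting at voff 120 yields
	 * new_offset = 120 - 100 = 20 and new_blkno = 2000 + 20 = 2020;
	 * if it extends past virtual block 150 it must be fragmented.
	 */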

	/* determine if we need to break the I/O into fragments */
	ext_endblk = un->un_ext[mid].un_voff + un->un_ext[mid].un_len;
	if (voff + btodb(bcount) > ext_endblk) {
		new_bcount = dbtob(ext_endblk - voff);
		more = 1;
	} else {
		new_bcount = bcount;
		more = 0;
	}

	/* only break up the I/O if we're not built on another metadevice */
	if ((md_getmajor(new_edev) != md_major) && (new_bcount > md_maxphys)) {
		new_bcount = md_maxphys;
		more = 1;
	}
	if (bp != (buf_t *)NULL) {
		/* do bp updates */
		bp->b_bcount = new_bcount;
		bp->b_lblkno = new_blkno;
		bp->b_edev = md_dev64_to_dev(new_edev);
	}
	return (more);
}

/*
 * FUNCTION:	sp_validate()
 * INPUT:	un	- unit structure to be validated.
 * OUTPUT:	none.
 * RETURNS:	0	- soft partition ok.
 *		-1	- error.
 * PURPOSE:	called on open to sanity check the soft partition.  In
 *		order to open a soft partition:
 *		- it must have at least one extent
 *		- the extent info in core and on disk must match
 *		- it may not be in an intermediate state (which would
 *		  imply that a two-phase commit was interrupted)
 *
 *		If the extent checking fails (B_ERROR returned from the read
 *		strategy call) _and_ we're a multi-owner diskset, we send a
 *		message to the master so that all nodes inherit the same view
 *		of the soft partition.
 *		If we are checking a soft-part that is marked as in error, and
 *		we can actually read and validate the watermarks, we send a
 *		message to the master node to clear the error.
 */
static int
sp_validate(mp_unit_t *un)
{
	uint_t		ext;
	struct buf	*buf;
	sp_ext_length_t	len;
	mp_watermark_t	*wm;
	set_t		setno;
	int		reset_error = 0;

	setno = MD_UN2SET(un);

	/* sanity check unit structure components ?? */
	if (un->un_status != MD_SP_OK) {
		if (un->un_status != MD_SP_ERR) {
			cmn_err(CE_WARN, "md: %s: open failed, soft partition "
			    "status is %u.",
			    md_shortname(MD_SID(un)),
			    un->un_status);
			return (-1);
		} else {
			cmn_err(CE_WARN, "md: %s: open of soft partition "
			    "in Errored state.",
			    md_shortname(MD_SID(un)));
			reset_error = 1;
		}
	}

	if (un->un_numexts == 0) {
		cmn_err(CE_WARN, "md: %s: open failed, soft partition does "
		    "not have any extents.", md_shortname(MD_SID(un)));
		return (-1);
	}

	len = 0LL;
	for (ext = 0; ext < un->un_numexts; ext++) {

		/* tally extent lengths to check total size */
		len += un->un_ext[ext].un_len;

		/* allocate buffer for watermark */
		buf = getrbuf(KM_SLEEP);

		/* read watermark */
		buf->b_flags = B_READ;
		buf->b_edev = md_dev64_to_dev(un->un_dev);
		buf->b_iodone = NULL;
		buf->b_proc = NULL;
		buf->b_bcount = sizeof (mp_watermark_t);
		buf->b_lblkno = un->un_ext[ext].un_poff - 1;
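		/*
		 * Each extent's watermark lives in the block immediately
		 * preceding the extent's data area, hence un_poff - 1.
		 */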
		buf->b_bufsize = sizeof (mp_watermark_t);
		buf->b_un.b_addr = kmem_alloc(sizeof (mp_watermark_t),
		    KM_SLEEP);

		/*
		 * make the call non-blocking so that it is not affected
		 * by a set take.
		 */
		md_call_strategy(buf, MD_STR_MAPPED|MD_NOBLOCK, NULL);
		(void) biowait(buf);

		if (buf->b_flags & B_ERROR) {
			cmn_err(CE_WARN, "md: %s: open failed, could not "
			    "read watermark at block %llu for extent %u, "
			    "error %d.", md_shortname(MD_SID(un)),
			    buf->b_lblkno, ext, buf->b_error);
			kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t));
			freerbuf(buf);

			/*
			 * If we're a multi-owner diskset we send a message
			 * indicating that this soft-part has an invalid
			 * extent to the master node. This ensures a consistent
			 * view of the soft-part across the cluster.
			 */
			if (MD_MNSET_SETNO(setno)) {
				sp_send_stat_err(un);
			}
			return (-1);
		}

		wm = (mp_watermark_t *)buf->b_un.b_addr;

		/* make sure the checksum is correct first */
		if (crcchk((uchar_t *)wm, (uint_t *)&wm->wm_checksum,
		    (uint_t)sizeof (mp_watermark_t), (uchar_t *)NULL)) {
			cmn_err(CE_WARN, "md: %s: open failed, watermark "
			    "at block %llu for extent %u does not have a "
			    "valid checksum 0x%08x.", md_shortname(MD_SID(un)),
			    buf->b_lblkno, ext, wm->wm_checksum);
			kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t));
			freerbuf(buf);
			return (-1);
		}

		if (wm->wm_magic != MD_SP_MAGIC) {
			cmn_err(CE_WARN, "md: %s: open failed, watermark "
			    "at block %llu for extent %u does not have a "
			    "valid watermark magic number, expected 0x%x, "
			    "found 0x%x.", md_shortname(MD_SID(un)),
			    buf->b_lblkno, ext, MD_SP_MAGIC, wm->wm_magic);
			kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t));
			freerbuf(buf);
			return (-1);
		}

		/* make sure sequence number matches the current extent */
		if (wm->wm_seq != ext) {
			cmn_err(CE_WARN, "md: %s: open failed, watermark "
			    "at block %llu for extent %u has invalid "
			    "sequence number %u.", md_shortname(MD_SID(un)),
			    buf->b_lblkno, ext, wm->wm_seq);
			kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t));
			freerbuf(buf);
			return (-1);
		}

		/* make sure watermark length matches unit structure */
		if (wm->wm_length != un->un_ext[ext].un_len) {
			cmn_err(CE_WARN, "md: %s: open failed, watermark "
			    "at block %llu for extent %u has inconsistent "
			    "length, expected %llu, found %llu.",
			    md_shortname(MD_SID(un)), buf->b_lblkno,
			    ext, un->un_ext[ext].un_len,
			    (u_longlong_t)wm->wm_length);
			kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t));
			freerbuf(buf);
			return (-1);
		}

		/*
		 * make sure the type is a valid soft partition and not
		 * a free extent or the end.
		 */
		if (wm->wm_type != EXTTYP_ALLOC) {
			cmn_err(CE_WARN, "md: %s: open failed, watermark "
			    "at block %llu for extent %u is not marked "
			    "as in-use, type = %u.", md_shortname(MD_SID(un)),
			    buf->b_lblkno, ext, wm->wm_type);
			kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t));
			freerbuf(buf);
			return (-1);
		}
		/* free up buffer */
		kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t));
		freerbuf(buf);
	}

	if (len != un->un_length) {
		cmn_err(CE_WARN, "md: %s: open failed, computed length "
		    "%llu != expected length %llu.", md_shortname(MD_SID(un)),
		    len, un->un_length);
		return (-1);
	}

	/*
	 * If we're a multi-owner set _and_ reset_error is set, we should clear
	 * the error condition on all nodes in the set. Use SP_SETSTAT2 with
	 * MD_SP_OK.
	 */
	if (MD_MNSET_SETNO(setno) && reset_error) {
		sp_send_stat_ok(un);
	}
	return (0);
}

/*
 * FUNCTION:	sp_done()
 * INPUT:	child_buf	- buffer attached to child save structure.
 *				  this is the buffer on which I/O has just
 *				  completed.
 * OUTPUT:	none.
 * RETURNS:	0	- parent I/O has completed.
 *		1	- parent I/O is not yet complete (more child
 *			  fragments outstanding, or error handling has
 *			  been handed off).
 * PURPOSE:	called on I/O completion.
 */
static int
sp_done(struct buf *child_buf)
{
	struct buf	*parent_buf;
	mdi_unit_t	*ui;
	md_spps_t	*ps;
	md_spcs_t	*cs;

	/* find the child save structure to which this buffer belongs */
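	/*
	 * cs_buf is the last member of md_spcs_t; subtracting that
	 * member's offset (sizeof (md_spcs_t) - sizeof (buf_t)) from the
	 * buf address recovers the enclosing child save structure.
	 */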
	cs = (md_spcs_t *)((caddr_t)child_buf -
	    (sizeof (md_spcs_t) - sizeof (buf_t)));
	/* now get the parent save structure */
	ps = cs->cs_ps;
	parent_buf = ps->ps_bp;

	mutex_enter(&ps->ps_mx);
	/* pass any errors back up to the parent */
	if (child_buf->b_flags & B_ERROR) {
		ps->ps_flags |= MD_SPPS_ERROR;
		parent_buf->b_error = child_buf->b_error;
	}
	/* mapout, if needed */
	if (child_buf->b_flags & B_REMAPPED)
		bp_mapout(child_buf);

	ps->ps_frags--;
	if (ps->ps_frags != 0) {
		/*
		 * if this parent has more children, we just free the
		 * child and return.
		 */
		kmem_cache_free(sp_child_cache, cs);
		mutex_exit(&ps->ps_mx);
		return (1);
	}
	/* there are no more children */
	kmem_cache_free(sp_child_cache, cs);
	if (ps->ps_flags & MD_SPPS_ERROR) {
		sp_error(ps);
		return (1);
	}
	ui = ps->ps_ui;
	if (!(ps->ps_flags & MD_SPPS_DONTFREE)) {
		mutex_exit(&ps->ps_mx);
	} else {
		/*
		 * this should only ever happen if we are panicking,
		 * since DONTFREE is only set on the parent if panicstr
		 * is non-NULL.
		 */
		ASSERT(panicstr);
	}
	SPPS_FREE(sp_parent_cache, ps);
	md_kstat_done(ui, parent_buf, 0);
	md_unit_readerexit(ui);
	md_biodone(parent_buf);
	return (0);
}

/*
 * FUNCTION:	md_sp_strategy()
 * INPUT:	parent_buf	- parent buffer
 *		flag		- flags
 *		private		- private data
 * OUTPUT:	none.
 * RETURNS:	void.
 * PURPOSE:	Soft partitioning I/O strategy.  Performs the main work
 *		needed to do I/O to a soft partition.  The basic
 *		algorithm is as follows:
 *			- Allocate a child save structure to keep track
 *			  of the I/O we are going to pass down.
 *			- Map the I/O to the correct extent in the soft
 *			  partition (see sp_mapbuf()).
 *			- bioclone() the buffer and pass it down the
 *			  stack using md_call_strategy.
 *			- If the I/O needs to split across extents,
 *			  repeat the above steps until all fragments
 *			  are finished.
 */
static void
md_sp_strategy(buf_t *parent_buf, int flag, void *private)
{
	md_spps_t	*ps;
	md_spcs_t	*cs;
	int		more;
	mp_unit_t	*un;
	mdi_unit_t	*ui;
	size_t		current_count;
	off_t		current_offset;
	sp_ext_offset_t	current_blkno;
	buf_t		*child_buf;
	set_t		setno = MD_MIN2SET(getminor(parent_buf->b_edev));
	int		strat_flag = flag;

	/*
	 * When doing IO to a multi owner meta device, check if set is halted.
	 * We do this check without the needed lock held, for performance
	 * reasons.
	 * If an IO just slips through while the set is locked via an
	 * MD_MN_SUSPEND_SET, we don't care about it.
	 * Only check for suspension if we are a top-level i/o request
	 * (MD_STR_NOTTOP is cleared in 'flag').
	 */
	if ((md_set[setno].s_status & (MD_SET_HALTED | MD_SET_MNSET)) ==
	    (MD_SET_HALTED | MD_SET_MNSET)) {
		if ((flag & MD_STR_NOTTOP) == 0) {
			mutex_enter(&md_mx);
			/* Here we loop until the set is no longer halted */
			while (md_set[setno].s_status & MD_SET_HALTED) {
				cv_wait(&md_cv, &md_mx);
			}
			mutex_exit(&md_mx);
		}
	}

	ui = MDI_UNIT(getminor(parent_buf->b_edev));

	md_kstat_waitq_enter(ui);

	un = (mp_unit_t *)md_unit_readerlock(ui);

	if ((flag & MD_NOBLOCK) == 0) {
		if (md_inc_iocount(setno) != 0) {
			parent_buf->b_flags |= B_ERROR;
			parent_buf->b_error = ENXIO;
			parent_buf->b_resid = parent_buf->b_bcount;
			md_kstat_waitq_exit(ui);
			md_unit_readerexit(ui);
			biodone(parent_buf);
			return;
		}
	} else {
		md_inc_iocount_noblock(setno);
	}

	if (!(flag & MD_STR_NOTTOP)) {
		if (md_checkbuf(ui, (md_unit_t *)un, parent_buf) != 0) {
			md_kstat_waitq_exit(ui);
			return;
		}
	}

	ps = kmem_cache_alloc(sp_parent_cache, MD_ALLOCFLAGS);
	sp_parent_init(ps);

	/*
	 * Save essential information from the original buffhdr
	 * in the parent.
	 */
	ps->ps_un = un;
	ps->ps_ui = ui;
	ps->ps_bp = parent_buf;
	ps->ps_addr = parent_buf->b_un.b_addr;

	current_count = parent_buf->b_bcount;
	current_blkno = (sp_ext_offset_t)parent_buf->b_blkno;
	current_offset  = 0;

	/*
	 * if we are at the top and we are panicking,
	 * we don't free in order to save state.
	 */
	if (!(flag & MD_STR_NOTTOP) && (panicstr != NULL))
		ps->ps_flags |= MD_SPPS_DONTFREE;

	md_kstat_waitq_to_runq(ui);

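	/*
	 * ps_frags counts outstanding child I/Os: it starts at one for the
	 * first fragment and is incremented (under ps_mx) for each
	 * additional fragment mapped in the loop below; sp_done()
	 * decrements it as the children complete.
	 */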
	ps->ps_frags++;

	/*
	 * Mark this i/o as MD_STR_ABR if we've had ABR enabled on this
	 * metadevice.
	 */
	if (ui->ui_tstate & MD_ABR_CAP)
		strat_flag |= MD_STR_ABR;

	/*
	 * this loop does the main work of an I/O.  we allocate a
	 * child save for each buf, do the logical to physical
	 * mapping, decide if we need to frag the I/O, clone the
	 * new I/O to pass down the stack.  repeat until we've
	 * taken care of the entire buf that was passed to us.
	 */
	do {
		cs = kmem_cache_alloc(sp_child_cache, MD_ALLOCFLAGS);
		sp_child_init(cs);
		child_buf = &cs->cs_buf;
		cs->cs_ps = ps;

		more = sp_mapbuf(un, current_blkno, current_count, child_buf);
		if (more == -1) {
			parent_buf->b_flags |= B_ERROR;
			parent_buf->b_error = EIO;
			md_kstat_done(ui, parent_buf, 0);
			md_unit_readerexit(ui);
			md_biodone(parent_buf);
			kmem_cache_free(sp_parent_cache, ps);
			return;
		}

		child_buf = md_bioclone(parent_buf, current_offset,
		    child_buf->b_bcount, child_buf->b_edev,
		    child_buf->b_blkno, sp_done, child_buf,
		    KM_NOSLEEP);
		/* calculate new offset, counts, etc... */
		current_offset += child_buf->b_bcount;
		current_count -=  child_buf->b_bcount;
		current_blkno +=  (sp_ext_offset_t)(btodb(child_buf->b_bcount));

		if (more) {
			mutex_enter(&ps->ps_mx);
			ps->ps_frags++;
			mutex_exit(&ps->ps_mx);
		}

		md_call_strategy(child_buf, strat_flag, private);
	} while (more);

	if (!(flag & MD_STR_NOTTOP) && (panicstr != NULL)) {
		while (!(ps->ps_flags & MD_SPPS_DONE)) {
			md_daemon(1, &md_done_daemon);
		}
		kmem_cache_free(sp_parent_cache, ps);
	}
}

/*
 * FUNCTION:	sp_directed_read()
 * INPUT:	mnum	- minor number
 *		vdr	- vol_directed_rd_t from user
 *		mode	- access mode for copying data out.
 * OUTPUT:	none.
 * RETURNS:	0	- success
 *		Exxxxx	- failure error-code
 * PURPOSE:	Construct the necessary sub-device i/o requests to perform the
 *		directed read as requested by the user. This is essentially the
 *		same as md_sp_strategy() with the exception being that the
 *		underlying 'md_call_strategy' is replaced with an ioctl call.
 */
int
sp_directed_read(minor_t mnum, vol_directed_rd_t *vdr, int mode)
{
	md_spps_t	*ps;
	md_spcs_t	*cs;
	int		more;
	mp_unit_t	*un;
	mdi_unit_t	*ui;
	size_t		current_count;
	off_t		current_offset;
	sp_ext_offset_t	current_blkno;
	buf_t		*child_buf, *parent_buf;
	void		*kbuffer;
	vol_directed_rd_t	cvdr;
	caddr_t		userbuf;
	offset_t	useroff;
	int		ret = 0;

	ui = MDI_UNIT(mnum);

	md_kstat_waitq_enter(ui);

	bzero(&cvdr, sizeof (cvdr));

	un = (mp_unit_t *)md_unit_readerlock(ui);

	/*
	 * Construct a parent_buf header which reflects the user-supplied
	 * request.
	 */

	kbuffer = kmem_alloc(vdr->vdr_nbytes, KM_NOSLEEP);
	if (kbuffer == NULL) {
		vdr->vdr_flags |= DKV_DMR_ERROR;
		md_kstat_waitq_exit(ui);
		md_unit_readerexit(ui);
		return (ENOMEM);
	}

	parent_buf = getrbuf(KM_NOSLEEP);
	if (parent_buf == NULL) {
		vdr->vdr_flags |= DKV_DMR_ERROR;
		md_kstat_waitq_exit(ui);
		md_unit_readerexit(ui);
		kmem_free(kbuffer, vdr->vdr_nbytes);
		return (ENOMEM);
	}
	parent_buf->b_un.b_addr = kbuffer;
	parent_buf->b_flags = B_READ;
	parent_buf->b_bcount = vdr->vdr_nbytes;
	parent_buf->b_lblkno = lbtodb(vdr->vdr_offset);
	parent_buf->b_edev = un->un_dev;


	ps = kmem_cache_alloc(sp_parent_cache, MD_ALLOCFLAGS);
	sp_parent_init(ps);

	/*
	 * Save essential information from the original buffhdr
	 * in the parent.
	 */
	ps->ps_un = un;
	ps->ps_ui = ui;
	ps->ps_bp = parent_buf;
	ps->ps_addr = parent_buf->b_un.b_addr;

	current_count = parent_buf->b_bcount;
	current_blkno = (sp_ext_offset_t)parent_buf->b_lblkno;
	current_offset  = 0;

	md_kstat_waitq_to_runq(ui);

	ps->ps_frags++;
	vdr->vdr_bytesread = 0;

	/*
	 * this loop does the main work of an I/O.  we allocate a
	 * child save for each buf, do the logical to physical
	 * mapping, decide if we need to frag the I/O, clone the
	 * new I/O to pass down the stack.  repeat until we've
	 * taken care of the entire buf that was passed to us.
	 */
	do {
		cs = kmem_cache_alloc(sp_child_cache, MD_ALLOCFLAGS);
		sp_child_init(cs);
		child_buf = &cs->cs_buf;
		cs->cs_ps = ps;

		more = sp_mapbuf(un, current_blkno, current_count, child_buf);
		if (more == -1) {
			ret = EIO;
			vdr->vdr_flags |= DKV_DMR_SHORT;
			kmem_cache_free(sp_child_cache, cs);
			goto err_out;
		}

		cvdr.vdr_flags = vdr->vdr_flags;
		cvdr.vdr_side = vdr->vdr_side;
		cvdr.vdr_nbytes = child_buf->b_bcount;
		cvdr.vdr_offset = ldbtob(child_buf->b_lblkno);
		/* Work out where we are in the allocated buffer */
		useroff = (offset_t)(uintptr_t)kbuffer;
		useroff = useroff + (offset_t)current_offset;
		cvdr.vdr_data = (void *)(uintptr_t)useroff;
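		/*
		 * vdr_data now points into the kernel buffer (kbuffer);
		 * the child ioctl below is issued with FKIOCTL, so the
		 * target driver treats it as a kernel address.
		 */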
		child_buf = md_bioclone(parent_buf, current_offset,
		    child_buf->b_bcount, child_buf->b_edev,
		    child_buf->b_blkno, NULL,
		    child_buf, KM_NOSLEEP);
		/* calculate new offset, counts, etc... */
		current_offset += child_buf->b_bcount;
		current_count -=  child_buf->b_bcount;
		current_blkno +=  (sp_ext_offset_t)(btodb(child_buf->b_bcount));

		if (more) {
			mutex_enter(&ps->ps_mx);
			ps->ps_frags++;
			mutex_exit(&ps->ps_mx);
		}

		ret = md_call_ioctl(child_buf->b_edev, DKIOCDMR, &cvdr,
		    (mode | FKIOCTL), NULL);

		/*
		 * Free the child structure as we've finished with it.
		 * Normally this would be done by sp_done() but we're just
		 * using md_bioclone() to segment the transfer and we never
		 * issue a strategy request so the iodone will not be called.
		 */
		kmem_cache_free(sp_child_cache, cs);
		if (ret == 0) {
			/* copyout the returned data to vdr_data + offset */
			userbuf = (caddr_t)kbuffer;
			userbuf += (caddr_t)(cvdr.vdr_data) - (caddr_t)kbuffer;
			if (ddi_copyout(userbuf, vdr->vdr_data,
			    cvdr.vdr_bytesread, mode)) {
				ret = EFAULT;
				goto err_out;
			}
			vdr->vdr_bytesread += cvdr.vdr_bytesread;
		} else {
			goto err_out;
		}
	} while (more);

	/*
	 * Update the user-supplied vol_directed_rd_t structure with the
	 * contents of the last issued child request.
	 */
	vdr->vdr_flags = cvdr.vdr_flags;
	vdr->vdr_side = cvdr.vdr_side;
	bcopy(cvdr.vdr_side_name, vdr->vdr_side_name, VOL_SIDENAME);

err_out:
	if (ret != 0) {
		vdr->vdr_flags |= DKV_DMR_ERROR;
	}
	if (vdr->vdr_bytesread != vdr->vdr_nbytes) {
		vdr->vdr_flags |= DKV_DMR_SHORT;
	}
	kmem_cache_free(sp_parent_cache, ps);
	kmem_free(kbuffer, vdr->vdr_nbytes);
	freerbuf(parent_buf);
	md_unit_readerexit(ui);
	return (ret);
}

/*
 * FUNCTION:	sp_snarf()
 * INPUT:	cmd	- snarf cmd.
 *		setno	- set number.
 * OUTPUT:	none.
 * RETURNS:	1	- soft partitions were snarfed.
 *		0	- no soft partitions were snarfed.
 * PURPOSE:	Snarf soft partition metadb records into their in-core
 *		structures.  This routine is called at "snarf time" when
 *		md loads and gets all metadevices records into memory.
 *		The basic algorithm is simply to walk the soft partition
 *		records in the metadb and call the soft partitioning
 *		build_incore routine to set up the in-core structures.
 */
static int
sp_snarf(md_snarfcmd_t cmd, set_t setno)
{
	mp_unit_t	*un;
	mddb_recid_t	recid;
	int		gotsomething;
	int		all_sp_gotten;
	mddb_type_t	rec_type;
	mddb_de_ic_t	*dep;
	mddb_rb32_t	*rbp;
	mp_unit_t	*big_un;
	mp_unit32_od_t	*small_un;
	size_t		newreqsize;


	if (cmd == MD_SNARF_CLEANUP)
		return (0);

	all_sp_gotten = 1;
	gotsomething = 0;

	/* get the record type */
	rec_type = (mddb_type_t)md_getshared_key(setno,
	    sp_md_ops.md_driver.md_drivername);
	recid = mddb_makerecid(setno, 0);

	/*
	 * walk soft partition records in the metadb and call
	 * sp_build_incore to build in-core structures.
	 */
	while ((recid = mddb_getnextrec(recid, rec_type, 0)) > 0) {
		/* if we've already gotten this record, go to the next one */
		if (mddb_getrecprivate(recid) & MD_PRV_GOTIT)
			continue;


		dep = mddb_getrecdep(recid);
		dep->de_flags = MDDB_F_SOFTPART;
		rbp = dep->de_rb;

		switch (rbp->rb_revision) {
		case MDDB_REV_RB:
		case MDDB_REV_RBFN:
			if ((rbp->rb_private & MD_PRV_CONVD) == 0) {
				/*
				 * This is an old, small record that has not
				 * yet been converted.  Before we create an
				 * in-core metadevice from it we have to
				 * convert it to a big record.
				 */
				small_un =
				    (mp_unit32_od_t *)mddb_getrecaddr(recid);
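				/*
				 * mp_unit_t declares one mp_ext in-line, so
				 * the full allocation is the base structure
				 * plus (numexts - 1) additional extents.
				 */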
				newreqsize = sizeof (mp_unit_t) +
				    ((small_un->un_numexts - 1) *
				    sizeof (struct mp_ext));
				big_un = (mp_unit_t *)kmem_zalloc(newreqsize,
				    KM_SLEEP);
				softpart_convert((caddr_t)small_un,
				    (caddr_t)big_un, SMALL_2_BIG);
				kmem_free(small_un, dep->de_reqsize);
				dep->de_rb_userdata = big_un;
				dep->de_reqsize = newreqsize;
				rbp->rb_private |= MD_PRV_CONVD;
				un = big_un;
			} else {
				/* Record has already been converted */
				un = (mp_unit_t *)mddb_getrecaddr(recid);
			}
			un->c.un_revision &= ~MD_64BIT_META_DEV;
			break;
		case MDDB_REV_RB64:
		case MDDB_REV_RB64FN:
			/* Large device */
			un = (mp_unit_t *)mddb_getrecaddr(recid);
			un->c.un_revision |= MD_64BIT_META_DEV;
			un->c.un_flag |= MD_EFILABEL;
			break;
		}
		MDDB_NOTE_FN(rbp->rb_revision, un->c.un_revision);

		/*
		 * Create minor node for snarfed entry.
		 */
		(void) md_create_minor_node(MD_MIN2SET(MD_SID(un)), MD_SID(un));

		if (MD_UNIT(MD_SID(un)) != NULL) {
			/* unit is already in-core */
			mddb_setrecprivate(recid, MD_PRV_PENDDEL);
			continue;
		}
		all_sp_gotten = 0;
		if (sp_build_incore((void *)un, 1) == 0) {
			mddb_setrecprivate(recid, MD_PRV_GOTIT);
			md_create_unit_incore(MD_SID(un), &sp_md_ops, 0);
			gotsomething = 1;
		}
	}

	if (!all_sp_gotten)
		return (gotsomething);
	/* double-check records */
	recid = mddb_makerecid(setno, 0);
	while ((recid = mddb_getnextrec(recid, rec_type, 0)) > 0)
		if (!(mddb_getrecprivate(recid) & MD_PRV_GOTIT))
			mddb_setrecprivate(recid, MD_PRV_PENDDEL);

	return (0);
}

/*
 * FUNCTION:	sp_halt()
 * INPUT:	cmd	- halt cmd.
 *		setno	- set number.
 * RETURNS:	0	- success.
 *		1	- err.
 * PURPOSE:	Perform driver halt operations.  As with stripe, we
 *		support MD_HALT_CHECK and MD_HALT_DOIT.  The first
 *		does a check to see if halting can be done safely
 *		(no open soft partitions), the second cleans up and
 *		shuts down the driver.
 */
static int
sp_halt(md_haltcmd_t cmd, set_t setno)
{
	int		i;
	mdi_unit_t	*ui;
	minor_t		mnum;

	if (cmd == MD_HALT_CLOSE)
		return (0);

	if (cmd == MD_HALT_OPEN)
		return (0);

	if (cmd == MD_HALT_UNLOAD)
		return (0);

	if (cmd == MD_HALT_CHECK) {
		for (i = 0; i < md_nunits; i++) {
			mnum = MD_MKMIN(setno, i);
			if ((ui = MDI_UNIT(mnum)) == NULL)
				continue;
			if (ui->ui_opsindex != sp_md_ops.md_selfindex)
				continue;
			if (md_unit_isopen(ui))
				return (1);
		}
		return (0);
	}

	if (cmd != MD_HALT_DOIT)
		return (1);

	for (i = 0; i < md_nunits; i++) {
		mnum = MD_MKMIN(setno, i);
		if ((ui = MDI_UNIT(mnum)) == NULL)
			continue;
		if (ui->ui_opsindex != sp_md_ops.md_selfindex)
			continue;
		reset_sp((mp_unit_t *)MD_UNIT(mnum), mnum, 0);
	}

	return (0);
}

/*
 * FUNCTION:	sp_open_dev()
 * INPUT:	un	- unit structure.
 *		oflags	- open flags.
 * OUTPUT:	none.
 * RETURNS:	0		- success.
 *		non-zero	- err.
 * PURPOSE:	open underlying device via md_layered_open.
 */
static int
sp_open_dev(mp_unit_t *un, int oflags)
{
	minor_t		mnum = MD_SID(un);
	int		err;
	md_dev64_t	tmpdev;
	set_t		setno = MD_MIN2SET(MD_SID(un));
	side_t		side = mddb_getsidenum(setno);

	tmpdev = un->un_dev;
	/*
	 * Do the open by device id if the underlying device is regular
	 */
	if ((md_getmajor(tmpdev) != md_major) &&
	    md_devid_found(setno, side, un->un_key) == 1) {
		tmpdev = md_resolve_bydevid(mnum, tmpdev, un->un_key);
	}
	err = md_layered_open(mnum, &tmpdev, oflags);
	un->un_dev = tmpdev;

	if (err)
		return (ENXIO);

	return (0);
}

/*
 * FUNCTION:	sp_open()
 * INPUT:	dev		- device to open.
 *		flag		- pass-through flag.
 *		otyp		- pass-through open type.
 *		cred_p		- credentials.
 *		md_oflags	- open flags.
 * OUTPUT:	none.
 * RETURNS:	0		- success.
 *		non-zero	- err.
 * PURPOSE:	open a soft partition.
 */
/* ARGSUSED */
static int
sp_open(
	dev_t		*dev,
	int		flag,
	int		otyp,
	cred_t		*cred_p,
	int		md_oflags
)
{
	minor_t		mnum = getminor(*dev);
	mdi_unit_t	*ui = MDI_UNIT(mnum);
	mp_unit_t	*un;
	int		err = 0;
	set_t		setno;

	/*
	 * When doing an open of a multi owner metadevice, check to see if this
	 * node is a starting node and if a reconfig cycle is underway.
	 * If so, the system isn't sufficiently set up to handle the
	 * open (which involves I/O during sp_validate), so fail with ENXIO.
	 */
	setno = MD_MIN2SET(mnum);
	if ((md_set[setno].s_status & (MD_SET_MNSET | MD_SET_MN_START_RC)) ==
	    (MD_SET_MNSET | MD_SET_MN_START_RC)) {
		return (ENXIO);
	}

	/* grab necessary locks */
	un = (mp_unit_t *)md_unit_openclose_enter(ui);
	setno = MD_UN2SET(un);

	/* open underlying device, if necessary */
	if (! md_unit_isopen(ui) || (md_oflags & MD_OFLG_PROBEDEV)) {
		if ((err = sp_open_dev(un, md_oflags)) != 0)
			goto out;

		if (MD_MNSET_SETNO(setno)) {
			/* For probe, don't incur the overhead of validate */
			if (!(md_oflags & MD_OFLG_PROBEDEV)) {
				/*
				 * Don't call sp_validate while
				 * unit_openclose lock is held.  So, actually
				 * open the device, drop openclose lock,
				 * call sp_validate, reacquire openclose lock,
				 * and close the device.  If sp_validate
				 * succeeds, then device will be re-opened.
				 */
				if ((err = md_unit_incopen(mnum, flag,
				    otyp)) != 0)
					goto out;

				mutex_enter(&ui->ui_mx);
				ui->ui_lock |= MD_UL_OPENINPROGRESS;
				mutex_exit(&ui->ui_mx);
				md_unit_openclose_exit(ui);
				if (otyp != OTYP_LYR)
					rw_exit(&md_unit_array_rw.lock);

				err = sp_validate(un);

				if (otyp != OTYP_LYR)
					rw_enter(&md_unit_array_rw.lock,
					    RW_READER);
				(void) md_unit_openclose_enter(ui);
				(void) md_unit_decopen(mnum, otyp);
				mutex_enter(&ui->ui_mx);
				ui->ui_lock &= ~MD_UL_OPENINPROGRESS;
				cv_broadcast(&ui->ui_cv);
				mutex_exit(&ui->ui_mx);
				/*
				 * Should be in the same state as before
				 * the sp_validate.
				 */
				if (err != 0) {
					/* close the device opened above */
					md_layered_close(un->un_dev, md_oflags);
					err = EIO;
					goto out;
				}
			}
			/*
			 * As we're a multi-owner metadevice we need to ensure
			 * that all nodes have the same idea of the status.
			 * sp_validate() will mark the device as errored (if
			 * it cannot read the watermark) or ok (if it was
			 * previously errored but the watermark is now valid).
			 * This code-path is only entered on the non-probe open
			 * so we will maintain the errored state during a probe
			 * call. This means the sys-admin must metarecover -m
			 * to reset the soft-partition error.
			 */
		} else {
			/* For probe, don't incur the overhead of validate */
			if (!(md_oflags & MD_OFLG_PROBEDEV) &&
			    (err = sp_validate(un)) != 0) {
				/* close the device opened above */
				md_layered_close(un->un_dev, md_oflags);
				err = EIO;
				goto out;
			} else {
				/*
				 * we succeeded in validating the on disk
				 * format versus the in core, so reset the
				 * status if it's in error
				 */
				if (un->un_status == MD_SP_ERR) {
					un->un_status = MD_SP_OK;
				}
			}
		}
	}

	/* count open */
	if ((err = md_unit_incopen(mnum, flag, otyp)) != 0)
		goto out;

out:
	md_unit_openclose_exit(ui);
	return (err);
}

/*
 * FUNCTION:	sp_close()
 * INPUT:	dev		- device to close.
 *		flag		- pass-through flag.
 *		otyp		- pass-through type.
 *		cred_p		- credentials.
 *		md_cflags	- close flags.
 * OUTPUT:	none.
 * RETURNS:	0		- success.
 *		non-zero	- err.
 * PURPOSE:	close a soft partition.
 */
/* ARGSUSED */
static int
sp_close(
	dev_t		dev,
	int		flag,
	int		otyp,
	cred_t		*cred_p,
	int		md_cflags
)
{
	minor_t		mnum = getminor(dev);
	mdi_unit_t	*ui = MDI_UNIT(mnum);
	mp_unit_t	*un;
	int		err = 0;

	/* grab necessary locks */
	un = (mp_unit_t *)md_unit_openclose_enter(ui);

	/* count closed */
	if ((err = md_unit_decopen(mnum, otyp)) != 0)
		goto out;

	/* close devices, if necessary */
	if (! md_unit_isopen(ui) || (md_cflags & MD_OFLG_PROBEDEV)) {
		md_layered_close(un->un_dev, md_cflags);
	}

	/*
	 * If a MN set and transient capabilities (eg ABR/DMR) are set,
	 * clear these capabilities if this is the last close in
	 * the cluster
	 */
	if (MD_MNSET_SETNO(MD_UN2SET(un)) &&
	    (ui->ui_tstate & MD_ABR_CAP)) {
		md_unit_openclose_exit(ui);
		mdmn_clear_all_capabilities(mnum);
		return (0);
	}
	/* unlock, return success */
out:
	md_unit_openclose_exit(ui);
	return (err);
}


/* used in sp_dump routine */
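/*
 * A static buf is used because sp_dump() runs in panic/dump context,
 * where allocating memory is not safe.
 */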
static struct buf dumpbuf;

/*
 * FUNCTION:	sp_dump()
 * INPUT:	dev	- device to dump to.
 *		addr	- address to dump.
 *		blkno	- blkno on device.
 *		nblk	- number of blocks to dump.
 * OUTPUT:	none.
 * RETURNS:	result from bdev_dump.
 * PURPOSE:  This routine dumps memory to the disk.  It assumes that
 *           the memory has already been mapped into mainbus space.
 *           It is called at disk interrupt priority when the system
 *           is in trouble.
 *           NOTE: this function is defined using 32-bit arguments,
 *           but soft partitioning is internally 64-bit.  Arguments
 *           are cast where appropriate.
 */
static int
sp_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk)
{
	mp_unit_t	*un;
	buf_t		*bp;
	sp_ext_length_t	nb;
	daddr_t		mapblk;
	int		result;
	int		more;
	int		saveresult = 0;

	/*
	 * We don't need to grab the unit lock: nothing else is supposed
	 * to be happening, and dump is not supposed to sleep.
	 */
	un = (mp_unit_t *)MD_UNIT(getminor(dev));

	if ((diskaddr_t)blkno >= un->c.un_total_blocks)
		return (EINVAL);

	if (((diskaddr_t)blkno + nblk) > un->c.un_total_blocks)
		return (EINVAL);

	bp = &dumpbuf;
	nb = (sp_ext_length_t)dbtob(nblk);
	do {
		bzero((caddr_t)bp, sizeof (*bp));
		more = sp_mapbuf(un, (sp_ext_offset_t)blkno, nb, bp);
		nblk = (int)(btodb(bp->b_bcount));
		mapblk = bp->b_blkno;
		result = bdev_dump(bp->b_edev, addr, mapblk, nblk);
		if (result)
			saveresult = result;

		nb -= bp->b_bcount;
		addr += bp->b_bcount;
		blkno += nblk;
	} while (more);

	return (saveresult);
}

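/*
 * FUNCTION:	sp_imp_set()
 * INPUT:	setno	- set number of the set being imported.
 * OUTPUT:	none.
 * RETURNS:	1	- soft partition records were claimed.
 *		0	- no soft partition records were claimed.
 * PURPOSE:	rewrite the self, parent and record ids of all soft
 *		partition records so that they belong to the newly
 *		imported set.
 */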
static int
sp_imp_set(
	set_t	setno
)
{
	mddb_recid_t	recid;
	int		gotsomething;
	mddb_type_t	rec_type;
	mddb_de_ic_t	*dep;
	mddb_rb32_t	*rbp;
	mp_unit_t	*un64;
	mp_unit32_od_t	*un32;
	md_dev64_t	self_devt;
	minor_t		*self_id;	/* minor needs to be updated */
	md_parent_t	*parent_id;	/* parent needs to be updated */
	mddb_recid_t	*record_id;	/* record id needs to be updated */

	gotsomething = 0;

	rec_type = (mddb_type_t)md_getshared_key(setno,
	    sp_md_ops.md_driver.md_drivername);
	recid = mddb_makerecid(setno, 0);

	while ((recid = mddb_getnextrec(recid, rec_type, 0)) > 0) {
		if (mddb_getrecprivate(recid) & MD_PRV_GOTIT)
			continue;

		dep = mddb_getrecdep(recid);
		rbp = dep->de_rb;

		switch (rbp->rb_revision) {
		case MDDB_REV_RB:
		case MDDB_REV_RBFN:
			/*
			 * Small device
			 */
			un32 = (mp_unit32_od_t *)mddb_getrecaddr(recid);
			self_id = &(un32->c.un_self_id);
			parent_id = &(un32->c.un_parent);
			record_id = &(un32->c.un_record_id);

			if (!md_update_minor(setno, mddb_getsidenum
			    (setno), un32->un_key))
				goto out;
			break;

		case MDDB_REV_RB64:
		case MDDB_REV_RB64FN:
			un64 = (mp_unit_t *)mddb_getrecaddr(recid);
			self_id = &(un64->c.un_self_id);
			parent_id = &(un64->c.un_parent);
			record_id = &(un64->c.un_record_id);

			if (!md_update_minor(setno, mddb_getsidenum
			    (setno), un64->un_key))
				goto out;
			break;
		}

		/*
		 * If this is a top level and a friendly name metadevice,
		 * update its minor in the namespace.
		 */
		if ((*parent_id == MD_NO_PARENT) &&
		    ((rbp->rb_revision == MDDB_REV_RBFN) ||
		    (rbp->rb_revision == MDDB_REV_RB64FN))) {

			self_devt = md_makedevice(md_major, *self_id);
			if (!md_update_top_device_minor(setno,
			    mddb_getsidenum(setno), self_devt))
				goto out;
		}

		/*
		 * Update unit with the imported setno
		 */
		mddb_setrecprivate(recid, MD_PRV_GOTIT);

		*self_id = MD_MKMIN(setno, MD_MIN2UNIT(*self_id));
		if (*parent_id != MD_NO_PARENT)
			*parent_id = MD_MKMIN(setno, MD_MIN2UNIT(*parent_id));
		*record_id = MAKERECID(setno, DBID(*record_id));

		gotsomething = 1;
	}

out:
	return (gotsomething);
}

static md_named_services_t sp_named_services[] = {
	{NULL,					0}
};

md_ops_t sp_md_ops = {
	sp_open,		/* open */
	sp_close,		/* close */
	md_sp_strategy,		/* strategy */
	NULL,			/* print */
	sp_dump,		/* dump */
	NULL,			/* read */
	NULL,			/* write */
	md_sp_ioctl,		/* ioctl */
	sp_snarf,		/* snarf */
	sp_halt,		/* halt */
	NULL,			/* aread */
	NULL,			/* awrite */
	sp_imp_set,		/* import set */
	sp_named_services
};

static void
init_init()
{
	sp_parent_cache = kmem_cache_create("md_softpart_parent",
	    sizeof (md_spps_t), 0, sp_parent_constructor,
	    sp_parent_destructor, sp_run_queue, NULL, NULL, 0);
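	/*
	 * The child structure embeds its buf as the last member, so the
	 * cache object size uses biosize() (which may exceed
	 * sizeof (buf_t)) to give the embedded buf its full size.
	 */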
	sp_child_cache = kmem_cache_create("md_softpart_child",
	    sizeof (md_spcs_t) - sizeof (buf_t) + biosize(), 0,
	    sp_child_constructor, sp_child_destructor, sp_run_queue,
	    NULL, NULL, 0);
}

static void
fini_uninit()
{
	kmem_cache_destroy(sp_parent_cache);
	kmem_cache_destroy(sp_child_cache);
	sp_parent_cache = sp_child_cache = NULL;
}

/* define the module linkage */
MD_PLUGIN_MISC_MODULE("soft partition module", init_init(), fini_uninit())
1847