xref: /titanic_41/usr/src/uts/common/io/lvm/softpart/sp.c (revision 08045defdf65ee890fef6e20510a093a17feb8fe)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * Soft partitioning metadevice driver (md_sp).
29  *
30  * This file contains the primary operations of the soft partitioning
31  * metadevice driver.  This includes all routines for normal operation
32  * (open/close/read/write).  Please see mdvar.h for a definition of
33  * metadevice operations vector (md_ops_t).  This driver is loosely
34  * based on the stripe driver (md_stripe).
35  *
36  * All metadevice administration is done through the use of ioctl's.
37  * As such, all administrative routines appear in sp_ioctl.c.
38  *
39  * Soft partitions are represented both in-core and in the metadb with a
40  * unit structure.  The soft partition-specific information in the unit
41  * structure includes the following information:
42  *	- Device information (md_dev64_t & md key) about the device on which
43  *	  the soft partition is built.
44  *	- Soft partition status information.
45  *	- The size of the soft partition and number of extents used to
46  *	  make up that size.
47  *	- An array of extents which define virtual/physical offset
48  *	  mappings and lengths for each extent.
49  *
50  * Typical soft partition operation proceeds as follows:
51  *	- The unit structure is fetched from the metadb and placed into
52  *	  an in-core array (as with other metadevices).  This operation
53  *	  is performed via sp_build_incore( ) and takes place during
54  *	  "snarfing" (when all metadevices are brought in-core at
55  *	  once) and when a new soft partition is created.
56  *	- A soft partition is opened via sp_open( ).  At open time the
57  *	  soft partition unit structure is verified with the soft
58  *	  partition on-disk structures.  Additionally, the soft partition
59  *	  status is checked (only soft partitions in the OK state may be
60  *	  opened).
61  *	- Soft partition I/O is performed via sp_strategy( ) which relies on
62  *	  a support routine, sp_mapbuf( ), to do most of the work.
63  *	  sp_mapbuf( ) maps a buffer to a particular extent via a binary
64  *	  search of the extent array in the soft partition unit structure.
65  *	  Once a translation has been performed, the I/O is passed down
66  *	  to the next layer, which may be another metadevice or a physical
67  *	  disk.  Since a soft partition may contain multiple, non-contiguous
68  *	  extents, a single I/O may have to be fragmented.
69  *	- Soft partitions are closed using sp_close.
70  *
71  */
72 
73 #include <sys/param.h>
74 #include <sys/systm.h>
75 #include <sys/conf.h>
76 #include <sys/file.h>
77 #include <sys/user.h>
78 #include <sys/uio.h>
79 #include <sys/t_lock.h>
80 #include <sys/buf.h>
81 #include <sys/dkio.h>
82 #include <sys/vtoc.h>
83 #include <sys/kmem.h>
84 #include <vm/page.h>
85 #include <sys/cmn_err.h>
86 #include <sys/sysmacros.h>
87 #include <sys/types.h>
88 #include <sys/mkdev.h>
89 #include <sys/stat.h>
90 #include <sys/open.h>
91 #include <sys/lvm/mdvar.h>
92 #include <sys/lvm/md_sp.h>
93 #include <sys/lvm/md_convert.h>
94 #include <sys/lvm/md_notify.h>
95 #include <sys/lvm/md_crc.h>
96 #include <sys/modctl.h>
97 #include <sys/ddi.h>
98 #include <sys/sunddi.h>
99 #include <sys/debug.h>
100 
101 #include <sys/sysevent/eventdefs.h>
102 #include <sys/sysevent/svm.h>
103 
104 md_ops_t		sp_md_ops;	/* md_ops_t instance for md_sp */
105 #ifndef	lint
106 char			_depends_on[] = "drv/md";
107 md_ops_t		*md_interface_ops = &sp_md_ops;
108 #endif
109 
/* Symbols declared extern here are defined outside this file. */
110 extern unit_t		md_nunits;
111 extern set_t		md_nsets;
112 extern md_set_t		md_set[];	/* indexed by set number */
113 
114 extern int		md_status;	/* tested for MD_GBL_DAEMONS_LIVE */
115 extern major_t		md_major;	/* compared with md_getmajor() */
116 extern mdq_anchor_t	md_done_daemon;	/* queue run by md_daemon() */
117 extern mdq_anchor_t	md_sp_daemon;	/* queue for sp_xmit_* requests */
118 extern kmutex_t		md_mx;		/* held while waiting on md_cv */
119 extern kcondvar_t	md_cv;		/* waited on while set is halted */
120 extern md_krwlock_t	md_unit_array_rw;
121 extern clock_t		md_hz;		/* delay() interval in this file */
122 
/* Caches from which md_spps_t / md_spcs_t structures are allocated. */
123 static kmem_cache_t	*sp_parent_cache = NULL;
124 static kmem_cache_t	*sp_child_cache = NULL;
/* Forward declarations; both are defined later in this file. */
125 static void		sp_send_stat_ok(mp_unit_t *);
126 static void		sp_send_stat_err(mp_unit_t *);
127 
128 /*
129  * FUNCTION:	sp_parent_constructor()
130  * INPUT:	none.
131  * OUTPUT:	ps	- parent save structure initialized.
132  * RETURNS:	void *	- ptr to initialized parent save structure.
133  * PURPOSE:	initialize parent save structure.
134  */
135 /*ARGSUSED1*/
136 static int
137 sp_parent_constructor(void *p, void *d1, int d2)
138 {
139 	mutex_init(&((md_spps_t *)p)->ps_mx,
140 	    NULL, MUTEX_DEFAULT, NULL);
141 	return (0);
142 }
143 
144 static void
145 sp_parent_init(md_spps_t *ps)
146 {
147 	bzero(ps, offsetof(md_spps_t, ps_mx));
148 }
149 
150 /*ARGSUSED1*/
151 static void
152 sp_parent_destructor(void *p, void *d)
153 {
154 	mutex_destroy(&((md_spps_t *)p)->ps_mx);
155 }
156 
157 /*
158  * FUNCTION:	sp_child_constructor()
159  * INPUT:	none.
160  * OUTPUT:	cs	- child save structure initialized.
161  * RETURNS:	void *	- ptr to initialized child save structure.
162  * PURPOSE:	initialize child save structure.
163  */
164 /*ARGSUSED1*/
165 static int
166 sp_child_constructor(void *p, void *d1, int d2)
167 {
168 	bioinit(&((md_spcs_t *)p)->cs_buf);
169 	return (0);
170 }
171 
172 static void
173 sp_child_init(md_spcs_t *cs)
174 {
175 	cs->cs_mdunit = 0;
176 	cs->cs_ps = NULL;
177 	md_bioreset(&cs->cs_buf);
178 }
179 
180 /*ARGSUSED1*/
181 static void
182 sp_child_destructor(void *p, void *d)
183 {
184 	biofini(&((md_spcs_t *)p)->cs_buf);
185 }
186 
187 /*
188  * FUNCTION:	sp_run_queue()
189  * INPUT:	none.
190  * OUTPUT:	none.
191  * RETURNS:	void.
192  * PURPOSE:	run the md_daemon to clean up memory pool.
193  */
194 /*ARGSUSED*/
195 static void
196 sp_run_queue(void *d)
197 {
198 	if (!(md_status & MD_GBL_DAEMONS_LIVE))
199 		md_daemon(1, &md_done_daemon);
200 }
201 
202 
203 /*
204  * FUNCTION:	sp_build_incore()
205  * INPUT:	p		- ptr to unit structure.
206  *		snarfing	- flag to tell us we are snarfing.
207  * OUTPUT:	non.
208  * RETURNS:	int	- 0 (always).
209  * PURPOSE:	place unit structure into in-core unit array (keyed from
210  *		minor number).
211  */
212 int
213 sp_build_incore(void *p, int snarfing)
214 {
215 	mp_unit_t	*un = (mp_unit_t *)p;
216 	minor_t		mnum;
217 	set_t		setno;
218 	md_dev64_t	tmpdev;
219 
220 	mnum = MD_SID(un);
221 
222 	if (MD_UNIT(mnum) != NULL)
223 		return (0);
224 
225 	MD_STATUS(un) = 0;
226 
227 	if (snarfing) {
228 		/*
229 		 * if we are snarfing, we get the device information
230 		 * from the metadb record (using the metadb key for
231 		 * that device).
232 		 */
233 		setno = MD_MIN2SET(mnum);
234 
235 		tmpdev = md_getdevnum(setno, mddb_getsidenum(setno),
236 		    un->un_key, MD_NOTRUST_DEVT);
237 		un->un_dev = tmpdev;
238 	}
239 
240 	/* place various information in the in-core data structures */
241 	md_nblocks_set(mnum, un->c.un_total_blocks);
242 	MD_UNIT(mnum) = un;
243 
244 	return (0);
245 }
246 
247 /*
248  * FUNCTION:	reset_sp()
249  * INPUT:	un		- unit structure to be reset/removed.
250  *		mnum		- minor number to be reset/removed.
251  *		removing	- flag to tell us if we are removing
252  *				  permanently or just reseting in-core
253  *				  structures.
254  * OUTPUT:	none.
255  * RETURNS:	void.
256  * PURPOSE:	used to either simply reset in-core structures or to
257  *		permanently remove metadevices from the metadb.
258  */
259 void
260 reset_sp(mp_unit_t *un, minor_t mnum, int removing)
261 {
262 	sv_dev_t	*sv;
263 	mddb_recid_t	vtoc_id;
264 
265 	/* clean up in-core structures */
266 	md_destroy_unit_incore(mnum, &sp_md_ops);
267 
268 	md_nblocks_set(mnum, -1ULL);
269 	MD_UNIT(mnum) = NULL;
270 
271 	/*
272 	 * Attempt release of minor node
273 	 */
274 	md_remove_minor_node(mnum);
275 
276 	if (!removing)
277 		return;
278 
279 	/* we are removing the soft partition from the metadb */
280 
281 	/*
282 	 * Save off device information so we can get to
283 	 * it after we do the mddb_deleterec().
284 	 */
285 	sv = (sv_dev_t *)kmem_alloc(sizeof (sv_dev_t), KM_SLEEP);
286 	sv->setno = MD_MIN2SET(mnum);
287 	sv->key = un->un_key;
288 	vtoc_id = un->c.un_vtoc_id;
289 
290 	/*
291 	 * Remove self from the namespace
292 	 */
293 	if (un->c.un_revision & MD_FN_META_DEV) {
294 		(void) md_rem_selfname(un->c.un_self_id);
295 	}
296 
297 	/* Remove the unit structure */
298 	mddb_deleterec_wrapper(un->c.un_record_id);
299 
300 	if (vtoc_id)
301 		mddb_deleterec_wrapper(vtoc_id);
302 
303 	SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_DELETE, TAG_METADEVICE,
304 	    MD_MIN2SET(mnum), MD_MIN2UNIT(mnum));
305 
306 	/*
307 	 * remove the underlying device name from the metadb.  if other
308 	 * soft partitions are built on this device, this will simply
309 	 * decrease the reference count for this device.  otherwise the
310 	 * name record for this device will be removed from the metadb.
311 	 */
312 	md_rem_names(sv, 1);
313 	kmem_free(sv, sizeof (sv_dev_t));
314 }
315 
316 /*
317  * FUNCTION:	sp_send_stat_msg
318  * INPUT:	un	- unit reference
319  *		status	- status to be sent to master node
320  *			MD_SP_OK - soft-partition is now OK
321  *			MD_SP_ERR	"	"	 errored
322  * OUTPUT:	none.
323  * RETURNS:	void.
324  * PURPOSE:	send a soft-partition status change to the master node. If the
325  *		message succeeds we simply return. If it fails we panic as the
326  *		cluster-wide view of the metadevices is now inconsistent.
327  * CALLING CONTEXT:
328  *	Blockable. No locks can be held.
329  */
330 static void
331 sp_send_stat_msg(mp_unit_t *un, sp_status_t status)
332 {
333 	md_mn_msg_sp_setstat_t	sp_msg;
334 	md_mn_kresult_t	*kres;
335 	set_t		setno = MD_UN2SET(un);
336 	int		rval;
337 	const char	*str = (status == MD_SP_ERR) ? "MD_SP_ERR" : "MD_SP_OK";
338 
339 	sp_msg.sp_setstat_mnum = MD_SID(un);
340 	sp_msg.sp_setstat_status = status;
341 
342 	kres = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
343 
344 	rval = mdmn_ksend_message(setno, MD_MN_MSG_SP_SETSTAT2, MD_MSGF_NO_LOG,
345 	    0, (char *)&sp_msg, sizeof (sp_msg), kres);
346 
347 	if (!MDMN_KSEND_MSG_OK(rval, kres)) {
348 		mdmn_ksend_show_error(rval, kres, "MD_MN_MSG_SP_SETSTAT2");
349 		/* If we're shutting down already, pause things here. */
350 		if (kres->kmmr_comm_state == MDMNE_RPC_FAIL) {
351 			while (!md_mn_is_commd_present()) {
352 				delay(md_hz);
353 			}
354 		}
355 		/*
356 		 * Panic as we are now in an inconsistent state.
357 		 */
358 		cmn_err(CE_PANIC, "md: %s: %s could not be set on all nodes\n",
359 		    md_shortname(MD_SID(un)), str);
360 	}
361 
362 	kmem_free(kres, sizeof (md_mn_kresult_t));
363 }
364 
365 /*
366  * FUNCTION:	sp_finish_error
367  * INPUT:	ps	- parent save structure for error-ed I/O.
368  *		lock_held	- set if the unit readerlock is held
369  * OUTPUT:	none.
370  * RETURNS:	void.
371  * PURPOSE:	report a driver error
372  */
373 static void
374 sp_finish_error(md_spps_t *ps, int lock_held)
375 {
376 	struct buf	*pb = ps->ps_bp;
377 	mdi_unit_t	*ui = ps->ps_ui;
378 	md_dev64_t	un_dev;			/* underlying device */
379 	md_dev64_t	md_dev = md_expldev(pb->b_edev); /* metadev in error */
380 	char		*str;
381 
382 	un_dev = md_expldev(ps->ps_un->un_dev);
383 	/* set error type */
384 	if (pb->b_flags & B_READ) {
385 		str = "read";
386 	} else {
387 		str = "write";
388 	}
389 
390 
391 	SPPS_FREE(sp_parent_cache, ps);
392 	pb->b_flags |= B_ERROR;
393 
394 	md_kstat_done(ui, pb, 0);
395 
396 	if (lock_held) {
397 		md_unit_readerexit(ui);
398 	}
399 	md_biodone(pb);
400 
401 	cmn_err(CE_WARN, "md: %s: %s error on %s",
402 	    md_shortname(md_getminor(md_dev)), str,
403 	    md_devname(MD_DEV2SET(md_dev), un_dev, NULL, 0));
404 }
405 
406 
407 /*
408  * FUNCTION:	sp_xmit_ok
409  * INPUT:	dq	- daemon queue referencing failing ps structure
410  * OUTPUT:	none.
411  * RETURNS:	void.
412  * PURPOSE:	send a message to the master node in a multi-owner diskset to
413  *		update all attached nodes view of the soft-part to be MD_SP_OK.
414  * CALLING CONTEXT:
415  *	Blockable. No unit lock held.
416  */
417 static void
418 sp_xmit_ok(daemon_queue_t *dq)
419 {
420 	md_spps_t	*ps = (md_spps_t *)dq;
421 
422 	/* Send a MD_MN_MSG_SP_SETSTAT to the master */
423 	sp_send_stat_msg(ps->ps_un, MD_SP_OK);
424 
425 	/*
426 	 * Successfully transmitted error state to all nodes, now release this
427 	 * parent structure.
428 	 */
429 	SPPS_FREE(sp_parent_cache, ps);
430 }
431 
432 /*
433  * FUNCTION:	sp_xmit_error
434  * INPUT:	dq	- daemon queue referencing failing ps structure
435  * OUTPUT:	none.
436  * RETURNS:	void.
437  * PURPOSE:	send a message to the master node in a multi-owner diskset to
438  *		update all attached nodes view of the soft-part to be MD_SP_ERR.
439  * CALLING CONTEXT:
440  *	Blockable. No unit lock held.
441  */
442 static void
443 sp_xmit_error(daemon_queue_t *dq)
444 {
445 	md_spps_t	*ps = (md_spps_t *)dq;
446 
447 	/* Send a MD_MN_MSG_SP_SETSTAT to the master */
448 	sp_send_stat_msg(ps->ps_un, MD_SP_ERR);
449 
450 	/*
451 	 * Successfully transmitted error state to all nodes, now release this
452 	 * parent structure.
453 	 */
454 	SPPS_FREE(sp_parent_cache, ps);
455 }
456 static void
457 sp_send_stat_ok(mp_unit_t *un)
458 {
459 	minor_t		mnum = MD_SID(un);
460 	md_spps_t	*ps;
461 
462 	ps = kmem_cache_alloc(sp_parent_cache, MD_ALLOCFLAGS);
463 	sp_parent_init(ps);
464 	ps->ps_un = un;
465 	ps->ps_ui = MDI_UNIT(mnum);
466 
467 	daemon_request(&md_sp_daemon, sp_xmit_ok, (daemon_queue_t *)ps,
468 	    REQ_OLD);
469 }
470 
471 static void
472 sp_send_stat_err(mp_unit_t *un)
473 {
474 	minor_t		mnum = MD_SID(un);
475 	md_spps_t	*ps;
476 
477 	ps = kmem_cache_alloc(sp_parent_cache, MD_ALLOCFLAGS);
478 	sp_parent_init(ps);
479 	ps->ps_un = un;
480 	ps->ps_ui = MDI_UNIT(mnum);
481 
482 	daemon_request(&md_sp_daemon, sp_xmit_error, (daemon_queue_t *)ps,
483 	    REQ_OLD);
484 }
485 
486 
487 /*
488  * FUNCTION:	sp_error()
489  * INPUT:	ps	- parent save structure for error-ed I/O.
490  * OUTPUT:	none.
491  * RETURNS:	void.
492  * PURPOSE:	report a driver error.
493  * CALLING CONTEXT:
494  *	Interrupt - non-blockable
495  */
496 static void
497 sp_error(md_spps_t *ps)
498 {
499 	set_t		setno = MD_UN2SET(ps->ps_un);
500 
501 	/*
502 	 * Drop the mutex associated with this request before (potentially)
503 	 * enqueuing the free onto a separate thread. We have to release the
504 	 * mutex before destroying the parent structure.
505 	 */
506 	if (!(ps->ps_flags & MD_SPPS_DONTFREE)) {
507 		if (MUTEX_HELD(&ps->ps_mx)) {
508 			mutex_exit(&ps->ps_mx);
509 		}
510 	} else {
511 		/*
512 		 * this should only ever happen if we are panicking,
513 		 * since DONTFREE is only set on the parent if panicstr
514 		 * is non-NULL.
515 		 */
516 		ASSERT(panicstr);
517 	}
518 
519 	/*
520 	 * For a multi-owner set we need to send a message to the master so that
521 	 * all nodes get the errored status when we first encounter it. To avoid
522 	 * deadlocking when multiple soft-partitions encounter an error on one
523 	 * physical unit we drop the unit readerlock before enqueueing the
524 	 * request. That way we can service any messages that require a
525 	 * writerlock to be held. Additionally, to avoid deadlocking when at
526 	 * the bottom of a metadevice stack and a higher level mirror has
527 	 * multiple requests outstanding on this soft-part, we clone the ps
528 	 * that failed and pass the error back up the stack to release the
529 	 * reference that this i/o may have in the higher-level metadevice.
530 	 * The other nodes in the cluster just have to modify the soft-part
531 	 * status and we do not need to block the i/o completion for this.
532 	 */
533 	if (MD_MNSET_SETNO(setno)) {
534 		md_spps_t	*err_ps;
535 		err_ps = kmem_cache_alloc(sp_parent_cache, MD_ALLOCFLAGS);
536 		sp_parent_init(err_ps);
537 
538 		err_ps->ps_un = ps->ps_un;
539 		err_ps->ps_ui = ps->ps_ui;
540 
541 		md_unit_readerexit(ps->ps_ui);
542 
543 		daemon_request(&md_sp_daemon, sp_xmit_error,
544 		    (daemon_queue_t *)err_ps, REQ_OLD);
545 
546 		sp_finish_error(ps, 0);
547 
548 		return;
549 	} else {
550 		ps->ps_un->un_status = MD_SP_ERR;
551 	}
552 
553 	/* Flag the error */
554 	sp_finish_error(ps, 1);
555 
556 }
557 
558 /*
559  * FUNCTION:	sp_mapbuf()
560  * INPUT:	un	- unit structure for soft partition we are doing
561  *			  I/O on.
562  *		voff	- virtual offset in soft partition to map.
563  *		bcount	- # of blocks in the I/O.
564  * OUTPUT:	bp	- translated buffer to be passed down to next layer.
565  * RETURNS:	1	- request must be fragmented, more work to do,
566  *		0	- request satisified, no more work to do
567  *		-1	- error
568  * PURPOSE:	Map the the virtual offset in the soft partition (passed
569  *		in via voff) to the "physical" offset on whatever the soft
570  *		partition is built on top of.  We do this by doing a binary
571  *		search of the extent array in the soft partition unit
572  *		structure.  Once the current extent is found, we do the
573  *		translation, determine if the I/O will cross extent
574  *		boundaries (if so, we have to fragment the I/O), then
575  *		fill in the buf structure to be passed down to the next layer.
576  */
577 static int
578 sp_mapbuf(
579 	mp_unit_t	*un,
580 	sp_ext_offset_t	voff,
581 	sp_ext_length_t	bcount,
582 	buf_t		*bp
583 )
584 {
585 	int		lo, mid, hi, found, more;
586 	size_t		new_bcount;
587 	sp_ext_offset_t new_blkno;
588 	sp_ext_offset_t	new_offset;
589 	sp_ext_offset_t	ext_endblk;
590 	md_dev64_t	new_edev;
591 	extern unsigned	md_maxphys;
592 
593 	found = 0;
594 	lo = 0;
595 	hi = un->un_numexts - 1;
596 
597 	/*
598 	 * do a binary search to find the extent that contains the
599 	 * starting offset.  after this loop, mid contains the index
600 	 * of the correct extent.
601 	 */
602 	while (lo <= hi && !found) {
603 		mid = (lo + hi) / 2;
604 		/* is the starting offset contained within the mid-ext? */
605 		if (voff >= un->un_ext[mid].un_voff &&
606 		    voff < un->un_ext[mid].un_voff + un->un_ext[mid].un_len)
607 			found = 1;
608 		else if (voff < un->un_ext[mid].un_voff)
609 			hi = mid - 1;
610 		else /* voff > un->un_ext[mid].un_voff + un->un_ext[mid].len */
611 			lo = mid + 1;
612 	}
613 
614 	if (!found) {
615 		cmn_err(CE_WARN, "sp_mapbuf: invalid offset %llu.\n", voff);
616 		return (-1);
617 	}
618 
619 	/* translate to underlying physical offset/device */
620 	new_offset = voff - un->un_ext[mid].un_voff;
621 	new_blkno = un->un_ext[mid].un_poff + new_offset;
622 	new_edev = un->un_dev;
623 
624 	/* determine if we need to break the I/O into fragments */
625 	ext_endblk = un->un_ext[mid].un_voff + un->un_ext[mid].un_len;
626 	if (voff + btodb(bcount) > ext_endblk) {
627 		new_bcount = dbtob(ext_endblk - voff);
628 		more = 1;
629 	} else {
630 		new_bcount = bcount;
631 		more = 0;
632 	}
633 
634 	/* only break up the I/O if we're not built on another metadevice */
635 	if ((md_getmajor(new_edev) != md_major) && (new_bcount > md_maxphys)) {
636 		new_bcount = md_maxphys;
637 		more = 1;
638 	}
639 	if (bp != (buf_t *)NULL) {
640 		/* do bp updates */
641 		bp->b_bcount = new_bcount;
642 		bp->b_lblkno = new_blkno;
643 		bp->b_edev = md_dev64_to_dev(new_edev);
644 	}
645 	return (more);
646 }
647 
648 /*
649  * FUNCTION:	sp_validate()
650  * INPUT:	un	- unit structure to be validated.
651  * OUTPUT:	none.
652  * RETURNS:	0	- soft partition ok.
653  *		-1	- error.
654  * PURPOSE:	called on open to sanity check the soft partition.  In
655  *		order to open a soft partition:
656  *		- it must have at least one extent
657  *		- the extent info in core and on disk must match
658  *		- it may not be in an intermediate state (which would
659  *		  imply that a two-phase commit was interrupted)
660  *
661  *		If the extent checking fails (B_ERROR returned from the read
662  *		strategy call) _and_ we're a multi-owner diskset, we send a
663  *		message to the master so that all nodes inherit the same view
664  *		of the soft partition.
665  *		If we are checking a soft-part that is marked as in error, and
666  *		we can actually read and validate the watermarks we send a
667  *		message to clear the error to the master node.
668  */
669 static int
670 sp_validate(mp_unit_t *un)
671 {
672 	uint_t		ext;
673 	struct buf	*buf;
674 	sp_ext_length_t	len;
675 	mp_watermark_t	*wm;
676 	set_t		setno;
677 	int		reset_error = 0;
678 
679 	setno = MD_UN2SET(un);
680 
681 	/* sanity check unit structure components ?? */
682 	if (un->un_status != MD_SP_OK) {
683 		if (un->un_status != MD_SP_ERR) {
684 			cmn_err(CE_WARN, "md: %s: open failed, soft partition "
685 			    "status is %u.",
686 			    md_shortname(MD_SID(un)),
687 			    un->un_status);
688 			return (-1);
689 		} else {
690 			cmn_err(CE_WARN, "md: %s: open of soft partition "
691 			    "in Errored state.",
692 			    md_shortname(MD_SID(un)));
693 			reset_error = 1;
694 		}
695 	}
696 
697 	if (un->un_numexts == 0) {
698 		cmn_err(CE_WARN, "md: %s: open failed, soft partition does "
699 		    "not have any extents.", md_shortname(MD_SID(un)));
700 		return (-1);
701 	}
702 
703 	len = 0LL;
704 	for (ext = 0; ext < un->un_numexts; ext++) {
705 
706 		/* tally extent lengths to check total size */
707 		len += un->un_ext[ext].un_len;
708 
709 		/* allocate buffer for watermark */
710 		buf = getrbuf(KM_SLEEP);
711 
712 		/* read watermark */
713 		buf->b_flags = B_READ;
714 		buf->b_edev = md_dev64_to_dev(un->un_dev);
715 		buf->b_iodone = NULL;
716 		buf->b_proc = NULL;
717 		buf->b_bcount = sizeof (mp_watermark_t);
718 		buf->b_lblkno = un->un_ext[ext].un_poff - 1;
719 		buf->b_bufsize = sizeof (mp_watermark_t);
720 		buf->b_un.b_addr = kmem_alloc(sizeof (mp_watermark_t),
721 		    KM_SLEEP);
722 
723 		/*
724 		 * make the call non-blocking so that it is not affected
725 		 * by a set take.
726 		 */
727 		md_call_strategy(buf, MD_STR_MAPPED|MD_NOBLOCK, NULL);
728 		(void) biowait(buf);
729 
730 		if (buf->b_flags & B_ERROR) {
731 			cmn_err(CE_WARN, "md: %s: open failed, could not "
732 			    "read watermark at block %llu for extent %u, "
733 			    "error %d.", md_shortname(MD_SID(un)),
734 			    buf->b_lblkno, ext, buf->b_error);
735 			kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t));
736 			freerbuf(buf);
737 
738 			/*
739 			 * If we're a multi-owner diskset we send a message
740 			 * indicating that this soft-part has an invalid
741 			 * extent to the master node. This ensures a consistent
742 			 * view of the soft-part across the cluster.
743 			 */
744 			if (MD_MNSET_SETNO(setno)) {
745 				sp_send_stat_err(un);
746 			}
747 			return (-1);
748 		}
749 
750 		wm = (mp_watermark_t *)buf->b_un.b_addr;
751 
752 		/* make sure the checksum is correct first */
753 		if (crcchk((uchar_t *)wm, (uint_t *)&wm->wm_checksum,
754 		    (uint_t)sizeof (mp_watermark_t), (uchar_t *)NULL)) {
755 			cmn_err(CE_WARN, "md: %s: open failed, watermark "
756 			    "at block %llu for extent %u does not have a "
757 			    "valid checksum 0x%08x.", md_shortname(MD_SID(un)),
758 			    buf->b_lblkno, ext, wm->wm_checksum);
759 			kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t));
760 			freerbuf(buf);
761 			return (-1);
762 		}
763 
764 		if (wm->wm_magic != MD_SP_MAGIC) {
765 			cmn_err(CE_WARN, "md: %s: open failed, watermark "
766 			    "at block %llu for extent %u does not have a "
767 			    "valid watermark magic number, expected 0x%x, "
768 			    "found 0x%x.", md_shortname(MD_SID(un)),
769 			    buf->b_lblkno, ext, MD_SP_MAGIC, wm->wm_magic);
770 			kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t));
771 			freerbuf(buf);
772 			return (-1);
773 		}
774 
775 		/* make sure sequence number matches the current extent */
776 		if (wm->wm_seq != ext) {
777 			cmn_err(CE_WARN, "md: %s: open failed, watermark "
778 			    "at block %llu for extent %u has invalid "
779 			    "sequence number %u.", md_shortname(MD_SID(un)),
780 			    buf->b_lblkno, ext, wm->wm_seq);
781 			kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t));
782 			freerbuf(buf);
783 			return (-1);
784 		}
785 
786 		/* make sure watermark length matches unit structure */
787 		if (wm->wm_length != un->un_ext[ext].un_len) {
788 			cmn_err(CE_WARN, "md: %s: open failed, watermark "
789 			    "at block %llu for extent %u has inconsistent "
790 			    "length, expected %llu, found %llu.",
791 			    md_shortname(MD_SID(un)), buf->b_lblkno,
792 			    ext, un->un_ext[ext].un_len,
793 			    (u_longlong_t)wm->wm_length);
794 			kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t));
795 			freerbuf(buf);
796 			return (-1);
797 		}
798 
799 		/*
800 		 * make sure the type is a valid soft partition and not
801 		 * a free extent or the end.
802 		 */
803 		if (wm->wm_type != EXTTYP_ALLOC) {
804 			cmn_err(CE_WARN, "md: %s: open failed, watermark "
805 			    "at block %llu for extent %u is not marked "
806 			    "as in-use, type = %u.", md_shortname(MD_SID(un)),
807 			    buf->b_lblkno, ext, wm->wm_type);
808 			kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t));
809 			freerbuf(buf);
810 			return (-1);
811 		}
812 		/* free up buffer */
813 		kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t));
814 		freerbuf(buf);
815 	}
816 
817 	if (len != un->un_length) {
818 		cmn_err(CE_WARN, "md: %s: open failed, computed length "
819 		    "%llu != expected length %llu.", md_shortname(MD_SID(un)),
820 		    len, un->un_length);
821 		return (-1);
822 	}
823 
824 	/*
825 	 * If we're a multi-owner set _and_ reset_error is set, we should clear
826 	 * the error condition on all nodes in the set. Use SP_SETSTAT2 with
827 	 * MD_SP_OK.
828 	 */
829 	if (MD_MNSET_SETNO(setno) && reset_error) {
830 		sp_send_stat_ok(un);
831 	}
832 	return (0);
833 }
834 
835 /*
836  * FUNCTION:	sp_done()
837  * INPUT:	child_buf	- buffer attached to child save structure.
838  *				  this is the buffer on which I/O has just
839  *				  completed.
840  * OUTPUT:	none.
841  * RETURNS:	0	- success.
842  *		1	- error.
843  * PURPOSE:	called on I/O completion.
844  */
845 static int
846 sp_done(struct buf *child_buf)
847 {
848 	struct buf	*parent_buf;
849 	mdi_unit_t	*ui;
850 	md_spps_t	*ps;
851 	md_spcs_t	*cs;
852 
853 	/* find the child save structure to which this buffer belongs */
854 	cs = (md_spcs_t *)((caddr_t)child_buf -
855 	    (sizeof (md_spcs_t) - sizeof (buf_t)));
856 	/* now get the parent save structure */
857 	ps = cs->cs_ps;
858 	parent_buf = ps->ps_bp;
859 
860 	mutex_enter(&ps->ps_mx);
861 	/* pass any errors back up to the parent */
862 	if (child_buf->b_flags & B_ERROR) {
863 		ps->ps_flags |= MD_SPPS_ERROR;
864 		parent_buf->b_error = child_buf->b_error;
865 	}
866 	/* mapout, if needed */
867 	if (child_buf->b_flags & B_REMAPPED)
868 		bp_mapout(child_buf);
869 
870 	ps->ps_frags--;
871 	if (ps->ps_frags != 0) {
872 		/*
873 		 * if this parent has more children, we just free the
874 		 * child and return.
875 		 */
876 		kmem_cache_free(sp_child_cache, cs);
877 		mutex_exit(&ps->ps_mx);
878 		return (1);
879 	}
880 	/* there are no more children */
881 	kmem_cache_free(sp_child_cache, cs);
882 	if (ps->ps_flags & MD_SPPS_ERROR) {
883 		sp_error(ps);
884 		return (1);
885 	}
886 	ui = ps->ps_ui;
887 	if (!(ps->ps_flags & MD_SPPS_DONTFREE)) {
888 		mutex_exit(&ps->ps_mx);
889 	} else {
890 		/*
891 		 * this should only ever happen if we are panicking,
892 		 * since DONTFREE is only set on the parent if panicstr
893 		 * is non-NULL.
894 		 */
895 		ASSERT(panicstr);
896 	}
897 	SPPS_FREE(sp_parent_cache, ps);
898 	md_kstat_done(ui, parent_buf, 0);
899 	md_unit_readerexit(ui);
900 	md_biodone(parent_buf);
901 	return (0);
902 }
903 
904 /*
905  * FUNCTION:	md_sp_strategy()
906  * INPUT:	parent_buf	- parent buffer
907  *		flag		- flags
908  *		private		- private data
909  * OUTPUT:	none.
910  * RETURNS:	void.
911  * PURPOSE:	Soft partitioning I/O strategy.  Performs the main work
912  *		needed to do I/O to a soft partition.  The basic
913  *		algorithm is as follows:
914  *			- Allocate a child save structure to keep track
915  *			  of the I/O we are going to pass down.
916  *			- Map the I/O to the correct extent in the soft
917  *			  partition (see sp_mapbuf()).
918  *			- bioclone() the buffer and pass it down the
919  *			  stack using md_call_strategy.
920  *			- If the I/O needs to split across extents,
921  *			  repeat the above steps until all fragments
922  *			  are finished.
923  */
924 static void
925 md_sp_strategy(buf_t *parent_buf, int flag, void *private)
926 {
927 	md_spps_t	*ps;
928 	md_spcs_t	*cs;
929 	int		more;
930 	mp_unit_t	*un;
931 	mdi_unit_t	*ui;
932 	size_t		current_count;
933 	off_t		current_offset;
934 	sp_ext_offset_t	current_blkno;
935 	buf_t		*child_buf;
936 	set_t		setno = MD_MIN2SET(getminor(parent_buf->b_edev));
937 	int		strat_flag = flag;
938 
939 	/*
940 	 * When doing IO to a multi owner meta device, check if set is halted.
941 	 * We do this check without the needed lock held, for performance
942 	 * reasons.
943 	 * If an IO just slips through while the set is locked via an
944 	 * MD_MN_SUSPEND_SET, we don't care about it.
945 	 * Only check for suspension if we are a top-level i/o request
946 	 * (MD_STR_NOTTOP is cleared in 'flag');
947 	 */
948 	if ((md_set[setno].s_status & (MD_SET_HALTED | MD_SET_MNSET)) ==
949 	    (MD_SET_HALTED | MD_SET_MNSET)) {
950 		if ((flag & MD_STR_NOTTOP) == 0) {
951 			mutex_enter(&md_mx);
952 			/* Here we loop until the set is no longer halted */
953 			while (md_set[setno].s_status & MD_SET_HALTED) {
954 				cv_wait(&md_cv, &md_mx);
955 			}
956 			mutex_exit(&md_mx);
957 		}
958 	}
959 
960 	ui = MDI_UNIT(getminor(parent_buf->b_edev));
961 
962 	md_kstat_waitq_enter(ui);
963 
964 	un = (mp_unit_t *)md_unit_readerlock(ui);
965 
966 	if ((flag & MD_NOBLOCK) == 0) {
967 		if (md_inc_iocount(setno) != 0) {
968 			parent_buf->b_flags |= B_ERROR;
969 			parent_buf->b_error = ENXIO;
970 			parent_buf->b_resid = parent_buf->b_bcount;
971 			md_kstat_waitq_exit(ui);
972 			md_unit_readerexit(ui);
973 			biodone(parent_buf);
974 			return;
975 		}
976 	} else {
977 		md_inc_iocount_noblock(setno);
978 	}
979 
980 	if (!(flag & MD_STR_NOTTOP)) {
981 		if (md_checkbuf(ui, (md_unit_t *)un, parent_buf) != 0) {
982 			md_kstat_waitq_exit(ui);
983 			return;
984 		}
985 	}
986 
987 	ps = kmem_cache_alloc(sp_parent_cache, MD_ALLOCFLAGS);
988 	sp_parent_init(ps);
989 
990 	/*
991 	 * Save essential information from the original buffhdr
992 	 * in the parent.
993 	 */
994 	ps->ps_un = un;
995 	ps->ps_ui = ui;
996 	ps->ps_bp = parent_buf;
997 	ps->ps_addr = parent_buf->b_un.b_addr;
998 
999 	current_count = parent_buf->b_bcount;
1000 	current_blkno = (sp_ext_offset_t)parent_buf->b_blkno;
1001 	current_offset  = 0;
1002 
1003 	/*
1004 	 * if we are at the top and we are panicking,
1005 	 * we don't free in order to save state.
1006 	 */
1007 	if (!(flag & MD_STR_NOTTOP) && (panicstr != NULL))
1008 		ps->ps_flags |= MD_SPPS_DONTFREE;
1009 
1010 	md_kstat_waitq_to_runq(ui);
1011 
1012 	ps->ps_frags++;
1013 
1014 	/*
1015 	 * Mark this i/o as MD_STR_ABR if we've had ABR enabled on this
1016 	 * metadevice.
1017 	 */
1018 	if (ui->ui_tstate & MD_ABR_CAP)
1019 		strat_flag |= MD_STR_ABR;
1020 
1021 	/*
1022 	 * this loop does the main work of an I/O.  we allocate a
1023 	 * a child save for each buf, do the logical to physical
1024 	 * mapping, decide if we need to frag the I/O, clone the
1025 	 * new I/O to pass down the stack.  repeat until we've
1026 	 * taken care of the entire buf that was passed to us.
1027 	 */
1028 	do {
1029 		cs = kmem_cache_alloc(sp_child_cache, MD_ALLOCFLAGS);
1030 		sp_child_init(cs);
1031 		child_buf = &cs->cs_buf;
1032 		cs->cs_ps = ps;
1033 
1034 		more = sp_mapbuf(un, current_blkno, current_count, child_buf);
1035 		if (more == -1) {
1036 			parent_buf->b_flags |= B_ERROR;
1037 			parent_buf->b_error = EIO;
1038 			md_kstat_done(ui, parent_buf, 0);
1039 			md_unit_readerexit(ui);
1040 			md_biodone(parent_buf);
1041 			kmem_cache_free(sp_parent_cache, ps);
1042 			return;
1043 		}
1044 
1045 		child_buf = md_bioclone(parent_buf, current_offset,
1046 		    child_buf->b_bcount, child_buf->b_edev,
1047 		    child_buf->b_blkno, sp_done, child_buf,
1048 		    KM_NOSLEEP);
1049 		/* calculate new offset, counts, etc... */
1050 		current_offset += child_buf->b_bcount;
1051 		current_count -=  child_buf->b_bcount;
1052 		current_blkno +=  (sp_ext_offset_t)(btodb(child_buf->b_bcount));
1053 
1054 		if (more) {
1055 			mutex_enter(&ps->ps_mx);
1056 			ps->ps_frags++;
1057 			mutex_exit(&ps->ps_mx);
1058 		}
1059 
1060 		md_call_strategy(child_buf, strat_flag, private);
1061 	} while (more);
1062 
1063 	if (!(flag & MD_STR_NOTTOP) && (panicstr != NULL)) {
1064 		while (!(ps->ps_flags & MD_SPPS_DONE)) {
1065 			md_daemon(1, &md_done_daemon);
1066 		}
1067 		kmem_cache_free(sp_parent_cache, ps);
1068 	}
1069 }
1070 
1071 /*
1072  * FUNCTION:	sp_directed_read()
1073  * INPUT:	mnum	- minor number
1074  *		vdr	- vol_directed_rd_t from user
1075  *		mode	- access mode for copying data out.
1076  * OUTPUT:	none.
1077  * RETURNS:	0	- success
1078  *		Exxxxx	- failure error-code
1079  * PURPOSE:	Construct the necessary sub-device i/o requests to perform the
1080  *		directed read as requested by the user. This is essentially the
1081  *		same as md_sp_strategy() with the exception being that the
1082  *		underlying 'md_call_strategy' is replaced with an ioctl call.
1083  */
1084 int
1085 sp_directed_read(minor_t mnum, vol_directed_rd_t *vdr, int mode)
1086 {
1087 	md_spps_t	*ps;
1088 	md_spcs_t	*cs;
1089 	int		more;
1090 	mp_unit_t	*un;
1091 	mdi_unit_t	*ui;
1092 	size_t		current_count;
1093 	off_t		current_offset;
1094 	sp_ext_offset_t	current_blkno;
1095 	buf_t		*child_buf, *parent_buf;
1096 	void		*kbuffer;
1097 	vol_directed_rd_t	cvdr;
1098 	caddr_t		userbuf;
1099 	offset_t	useroff;
1100 	int		ret = 0;
1101 
1102 	ui = MDI_UNIT(mnum);
1103 
1104 	md_kstat_waitq_enter(ui);
1105 
1106 	bzero(&cvdr, sizeof (cvdr));
1107 
1108 	un = (mp_unit_t *)md_unit_readerlock(ui);
1109 
1110 	/*
1111 	 * Construct a parent_buf header which reflects the user-supplied
1112 	 * request.
1113 	 */
1114 
1115 	kbuffer = kmem_alloc(vdr->vdr_nbytes, KM_NOSLEEP);
1116 	if (kbuffer == NULL) {
1117 		vdr->vdr_flags |= DKV_DMR_ERROR;
1118 		md_kstat_waitq_exit(ui);
1119 		md_unit_readerexit(ui);
1120 		return (ENOMEM);
1121 	}
1122 
1123 	parent_buf = getrbuf(KM_NOSLEEP);
1124 	if (parent_buf == NULL) {
1125 		vdr->vdr_flags |= DKV_DMR_ERROR;
1126 		md_kstat_waitq_exit(ui);
1127 		md_unit_readerexit(ui);
1128 		kmem_free(kbuffer, vdr->vdr_nbytes);
1129 		return (ENOMEM);
1130 	}
1131 	parent_buf->b_un.b_addr = kbuffer;
1132 	parent_buf->b_flags = B_READ;
1133 	parent_buf->b_bcount = vdr->vdr_nbytes;
1134 	parent_buf->b_lblkno = lbtodb(vdr->vdr_offset);
1135 	parent_buf->b_edev = un->un_dev;
1136 
1137 
1138 	ps = kmem_cache_alloc(sp_parent_cache, MD_ALLOCFLAGS);
1139 	sp_parent_init(ps);
1140 
1141 	/*
1142 	 * Save essential information from the original buffhdr
1143 	 * in the parent.
1144 	 */
1145 	ps->ps_un = un;
1146 	ps->ps_ui = ui;
1147 	ps->ps_bp = parent_buf;
1148 	ps->ps_addr = parent_buf->b_un.b_addr;
1149 
1150 	current_count = parent_buf->b_bcount;
1151 	current_blkno = (sp_ext_offset_t)parent_buf->b_lblkno;
1152 	current_offset  = 0;
1153 
1154 	md_kstat_waitq_to_runq(ui);
1155 
1156 	ps->ps_frags++;
1157 	vdr->vdr_bytesread = 0;
1158 
1159 	/*
1160 	 * this loop does the main work of an I/O.  we allocate a
1161 	 * a child save for each buf, do the logical to physical
1162 	 * mapping, decide if we need to frag the I/O, clone the
1163 	 * new I/O to pass down the stack.  repeat until we've
1164 	 * taken care of the entire buf that was passed to us.
1165 	 */
1166 	do {
1167 		cs = kmem_cache_alloc(sp_child_cache, MD_ALLOCFLAGS);
1168 		sp_child_init(cs);
1169 		child_buf = &cs->cs_buf;
1170 		cs->cs_ps = ps;
1171 
1172 		more = sp_mapbuf(un, current_blkno, current_count, child_buf);
1173 		if (more == -1) {
1174 			ret = EIO;
1175 			vdr->vdr_flags |= DKV_DMR_SHORT;
1176 			kmem_cache_free(sp_child_cache, cs);
1177 			goto err_out;
1178 		}
1179 
1180 		cvdr.vdr_flags = vdr->vdr_flags;
1181 		cvdr.vdr_side = vdr->vdr_side;
1182 		cvdr.vdr_nbytes = child_buf->b_bcount;
1183 		cvdr.vdr_offset = ldbtob(child_buf->b_lblkno);
1184 		/* Work out where we are in the allocated buffer */
1185 		useroff = (offset_t)(uintptr_t)kbuffer;
1186 		useroff = useroff + (offset_t)current_offset;
1187 		cvdr.vdr_data = (void *)(uintptr_t)useroff;
1188 		child_buf = md_bioclone(parent_buf, current_offset,
1189 		    child_buf->b_bcount, child_buf->b_edev,
1190 		    child_buf->b_blkno, NULL,
1191 		    child_buf, KM_NOSLEEP);
1192 		/* calculate new offset, counts, etc... */
1193 		current_offset += child_buf->b_bcount;
1194 		current_count -=  child_buf->b_bcount;
1195 		current_blkno +=  (sp_ext_offset_t)(btodb(child_buf->b_bcount));
1196 
1197 		if (more) {
1198 			mutex_enter(&ps->ps_mx);
1199 			ps->ps_frags++;
1200 			mutex_exit(&ps->ps_mx);
1201 		}
1202 
1203 		ret = md_call_ioctl(child_buf->b_edev, DKIOCDMR, &cvdr,
1204 		    (mode | FKIOCTL), NULL);
1205 
1206 		/*
1207 		 * Free the child structure as we've finished with it.
1208 		 * Normally this would be done by sp_done() but we're just
1209 		 * using md_bioclone() to segment the transfer and we never
1210 		 * issue a strategy request so the iodone will not be called.
1211 		 */
1212 		kmem_cache_free(sp_child_cache, cs);
1213 		if (ret == 0) {
1214 			/* copyout the returned data to vdr_data + offset */
1215 			userbuf = (caddr_t)kbuffer;
1216 			userbuf += (caddr_t)(cvdr.vdr_data) - (caddr_t)kbuffer;
1217 			if (ddi_copyout(userbuf, vdr->vdr_data,
1218 			    cvdr.vdr_bytesread, mode)) {
1219 				ret = EFAULT;
1220 				goto err_out;
1221 			}
1222 			vdr->vdr_bytesread += cvdr.vdr_bytesread;
1223 		} else {
1224 			goto err_out;
1225 		}
1226 	} while (more);
1227 
1228 	/*
1229 	 * Update the user-supplied vol_directed_rd_t structure with the
1230 	 * contents of the last issued child request.
1231 	 */
1232 	vdr->vdr_flags = cvdr.vdr_flags;
1233 	vdr->vdr_side = cvdr.vdr_side;
1234 	bcopy(cvdr.vdr_side_name, vdr->vdr_side_name, VOL_SIDENAME);
1235 
1236 err_out:
1237 	if (ret != 0) {
1238 		vdr->vdr_flags |= DKV_DMR_ERROR;
1239 	}
1240 	if (vdr->vdr_bytesread != vdr->vdr_nbytes) {
1241 		vdr->vdr_flags |= DKV_DMR_SHORT;
1242 	}
1243 	kmem_cache_free(sp_parent_cache, ps);
1244 	kmem_free(kbuffer, vdr->vdr_nbytes);
1245 	freerbuf(parent_buf);
1246 	md_unit_readerexit(ui);
1247 	return (ret);
1248 }
1249 
1250 /*
1251  * FUNCTION:	sp_snarf()
1252  * INPUT:	cmd	- snarf cmd.
1253  *		setno	- set number.
1254  * OUTPUT:	none.
1255  * RETURNS:	1	- soft partitions were snarfed.
1256  *		0	- no soft partitions were snarfed.
1257  * PURPOSE:	Snarf soft partition metadb records into their in-core
1258  *		structures.  This routine is called at "snarf time" when
1259  *		md loads and gets all metadevices records into memory.
1260  *		The basic algorithm is simply to walk the soft partition
1261  *		records in the metadb and call the soft partitioning
1262  *		build_incore routine to set up the in-core structures.
1263  */
1264 static int
1265 sp_snarf(md_snarfcmd_t cmd, set_t setno)
1266 {
1267 	mp_unit_t	*un;
1268 	mddb_recid_t	recid;
1269 	int		gotsomething;
1270 	int		all_sp_gotten;
1271 	mddb_type_t	rec_type;
1272 	mddb_de_ic_t	*dep;
1273 	mddb_rb32_t	*rbp;
1274 	mp_unit_t	*big_un;
1275 	mp_unit32_od_t	*small_un;
1276 	size_t		newreqsize;
1277 
1278 
1279 	if (cmd == MD_SNARF_CLEANUP)
1280 		return (0);
1281 
1282 	all_sp_gotten = 1;
1283 	gotsomething = 0;
1284 
1285 	/* get the record type */
1286 	rec_type = (mddb_type_t)md_getshared_key(setno,
1287 	    sp_md_ops.md_driver.md_drivername);
1288 	recid = mddb_makerecid(setno, 0);
1289 
1290 	/*
1291 	 * walk soft partition records in the metadb and call
1292 	 * sp_build_incore to build in-core structures.
1293 	 */
1294 	while ((recid = mddb_getnextrec(recid, rec_type, 0)) > 0) {
1295 		/* if we've already gotten this record, go to the next one */
1296 		if (mddb_getrecprivate(recid) & MD_PRV_GOTIT)
1297 			continue;
1298 
1299 
1300 		dep = mddb_getrecdep(recid);
1301 		dep->de_flags = MDDB_F_SOFTPART;
1302 		rbp = dep->de_rb;
1303 
1304 		switch (rbp->rb_revision) {
1305 		case MDDB_REV_RB:
1306 		case MDDB_REV_RBFN:
1307 			if ((rbp->rb_private & MD_PRV_CONVD) == 0) {
1308 				/*
1309 				 * This means, we have an old and small record.
1310 				 * And this record hasn't already been converted
1311 				 * :-o before we create an incore metadevice
1312 				 * from this we have to convert it to a big
1313 				 * record.
1314 				 */
1315 				small_un =
1316 				    (mp_unit32_od_t *)mddb_getrecaddr(recid);
1317 				newreqsize = sizeof (mp_unit_t) +
1318 				    ((small_un->un_numexts - 1) *
1319 				    sizeof (struct mp_ext));
1320 				big_un = (mp_unit_t *)kmem_zalloc(newreqsize,
1321 				    KM_SLEEP);
1322 				softpart_convert((caddr_t)small_un,
1323 				    (caddr_t)big_un, SMALL_2_BIG);
1324 				kmem_free(small_un, dep->de_reqsize);
1325 				dep->de_rb_userdata = big_un;
1326 				dep->de_reqsize = newreqsize;
1327 				rbp->rb_private |= MD_PRV_CONVD;
1328 				un = big_un;
1329 			} else {
1330 				/* Record has already been converted */
1331 				un = (mp_unit_t *)mddb_getrecaddr(recid);
1332 			}
1333 			un->c.un_revision &= ~MD_64BIT_META_DEV;
1334 			break;
1335 		case MDDB_REV_RB64:
1336 		case MDDB_REV_RB64FN:
1337 			/* Large device */
1338 			un = (mp_unit_t *)mddb_getrecaddr(recid);
1339 			un->c.un_revision |= MD_64BIT_META_DEV;
1340 			un->c.un_flag |= MD_EFILABEL;
1341 			break;
1342 		}
1343 		MDDB_NOTE_FN(rbp->rb_revision, un->c.un_revision);
1344 
1345 		/*
1346 		 * Create minor node for snarfed entry.
1347 		 */
1348 		(void) md_create_minor_node(MD_MIN2SET(MD_SID(un)), MD_SID(un));
1349 
1350 		if (MD_UNIT(MD_SID(un)) != NULL) {
1351 			/* unit is already in-core */
1352 			mddb_setrecprivate(recid, MD_PRV_PENDDEL);
1353 			continue;
1354 		}
1355 		all_sp_gotten = 0;
1356 		if (sp_build_incore((void *)un, 1) == 0) {
1357 			mddb_setrecprivate(recid, MD_PRV_GOTIT);
1358 			md_create_unit_incore(MD_SID(un), &sp_md_ops, 0);
1359 			gotsomething = 1;
1360 		}
1361 	}
1362 
1363 	if (!all_sp_gotten)
1364 		return (gotsomething);
1365 	/* double-check records */
1366 	recid = mddb_makerecid(setno, 0);
1367 	while ((recid = mddb_getnextrec(recid, rec_type, 0)) > 0)
1368 		if (!(mddb_getrecprivate(recid) & MD_PRV_GOTIT))
1369 			mddb_setrecprivate(recid, MD_PRV_PENDDEL);
1370 
1371 	return (0);
1372 }
1373 
1374 /*
1375  * FUNCTION:	sp_halt()
1376  * INPUT:	cmd	- halt cmd.
1377  *		setno	- set number.
1378  * RETURNS:	0	- success.
1379  *		1	- err.
1380  * PURPOSE:	Perform driver halt operations.  As with stripe, we
1381  *		support MD_HALT_CHECK and MD_HALT_DOIT.  The first
1382  *		does a check to see if halting can be done safely
1383  *		(no open soft partitions), the second cleans up and
1384  *		shuts down the driver.
1385  */
1386 static int
1387 sp_halt(md_haltcmd_t cmd, set_t setno)
1388 {
1389 	int		i;
1390 	mdi_unit_t	*ui;
1391 	minor_t		mnum;
1392 
1393 	if (cmd == MD_HALT_CLOSE)
1394 		return (0);
1395 
1396 	if (cmd == MD_HALT_OPEN)
1397 		return (0);
1398 
1399 	if (cmd == MD_HALT_UNLOAD)
1400 		return (0);
1401 
1402 	if (cmd == MD_HALT_CHECK) {
1403 		for (i = 0; i < md_nunits; i++) {
1404 			mnum = MD_MKMIN(setno, i);
1405 			if ((ui = MDI_UNIT(mnum)) == NULL)
1406 				continue;
1407 			if (ui->ui_opsindex != sp_md_ops.md_selfindex)
1408 				continue;
1409 			if (md_unit_isopen(ui))
1410 				return (1);
1411 		}
1412 		return (0);
1413 	}
1414 
1415 	if (cmd != MD_HALT_DOIT)
1416 		return (1);
1417 
1418 	for (i = 0; i < md_nunits; i++) {
1419 		mnum = MD_MKMIN(setno, i);
1420 		if ((ui = MDI_UNIT(mnum)) == NULL)
1421 			continue;
1422 		if (ui->ui_opsindex != sp_md_ops.md_selfindex)
1423 			continue;
1424 		reset_sp((mp_unit_t *)MD_UNIT(mnum), mnum, 0);
1425 	}
1426 
1427 	return (0);
1428 }
1429 
1430 /*
1431  * FUNCTION:	sp_open_dev()
1432  * INPUT:	un	- unit structure.
1433  *		oflags	- open flags.
1434  * OUTPUT:	none.
1435  * RETURNS:	0		- success.
1436  *		non-zero	- err.
1437  * PURPOSE:	open underlying device via md_layered_open.
1438  */
1439 static int
1440 sp_open_dev(mp_unit_t *un, int oflags)
1441 {
1442 	minor_t		mnum = MD_SID(un);
1443 	int		err;
1444 	md_dev64_t	tmpdev;
1445 	set_t		setno = MD_MIN2SET(MD_SID(un));
1446 	side_t		side = mddb_getsidenum(setno);
1447 
1448 	tmpdev = un->un_dev;
1449 	/*
1450 	 * Do the open by device id if underlying is regular
1451 	 */
1452 	if ((md_getmajor(tmpdev) != md_major) &&
1453 	    md_devid_found(setno, side, un->un_key) == 1) {
1454 		tmpdev = md_resolve_bydevid(mnum, tmpdev, un->un_key);
1455 	}
1456 	err = md_layered_open(mnum, &tmpdev, oflags);
1457 	un->un_dev = tmpdev;
1458 
1459 	if (err)
1460 		return (ENXIO);
1461 
1462 	return (0);
1463 }
1464 
1465 /*
1466  * FUNCTION:	sp_open()
1467  * INPUT:	dev		- device to open.
1468  *		flag		- pass-through flag.
1469  *		otyp		- pass-through open type.
1470  *		cred_p		- credentials.
1471  *		md_oflags	- open flags.
1472  * OUTPUT:	none.
1473  * RETURNS:	0		- success.
1474  *		non-zero	- err.
1475  * PURPOSE:	open a soft partition.
1476  */
1477 /* ARGSUSED */
1478 static int
1479 sp_open(
1480 	dev_t		*dev,
1481 	int		flag,
1482 	int		otyp,
1483 	cred_t		*cred_p,
1484 	int		md_oflags
1485 )
1486 {
1487 	minor_t		mnum = getminor(*dev);
1488 	mdi_unit_t	*ui = MDI_UNIT(mnum);
1489 	mp_unit_t	*un;
1490 	int		err = 0;
1491 	set_t		setno;
1492 
1493 	/*
1494 	 * When doing an open of a multi owner metadevice, check to see if this
1495 	 * node is a starting node and if a reconfig cycle is underway.
1496 	 * If so, the system isn't sufficiently set up enough to handle the
1497 	 * open (which involves I/O during sp_validate), so fail with ENXIO.
1498 	 */
1499 	setno = MD_MIN2SET(mnum);
1500 	if ((md_set[setno].s_status & (MD_SET_MNSET | MD_SET_MN_START_RC)) ==
1501 	    (MD_SET_MNSET | MD_SET_MN_START_RC)) {
1502 			return (ENXIO);
1503 	}
1504 
1505 	/* grab necessary locks */
1506 	un = (mp_unit_t *)md_unit_openclose_enter(ui);
1507 	setno = MD_UN2SET(un);
1508 
1509 	/* open underlying device, if necessary */
1510 	if (! md_unit_isopen(ui) || (md_oflags & MD_OFLG_PROBEDEV)) {
1511 		if ((err = sp_open_dev(un, md_oflags)) != 0)
1512 			goto out;
1513 
1514 		if (MD_MNSET_SETNO(setno)) {
1515 			/* For probe, don't incur the overhead of validate */
1516 			if (!(md_oflags & MD_OFLG_PROBEDEV)) {
1517 				/*
1518 				 * Don't call sp_validate while
1519 				 * unit_openclose lock is held.  So, actually
1520 				 * open the device, drop openclose lock,
1521 				 * call sp_validate, reacquire openclose lock,
1522 				 * and close the device.  If sp_validate
1523 				 * succeeds, then device will be re-opened.
1524 				 */
1525 				if ((err = md_unit_incopen(mnum, flag,
1526 				    otyp)) != 0)
1527 					goto out;
1528 
1529 				mutex_enter(&ui->ui_mx);
1530 				ui->ui_lock |= MD_UL_OPENINPROGRESS;
1531 				mutex_exit(&ui->ui_mx);
1532 				md_unit_openclose_exit(ui);
1533 				if (otyp != OTYP_LYR)
1534 					rw_exit(&md_unit_array_rw.lock);
1535 
1536 				err = sp_validate(un);
1537 
1538 				if (otyp != OTYP_LYR)
1539 					rw_enter(&md_unit_array_rw.lock,
1540 					    RW_READER);
1541 				(void) md_unit_openclose_enter(ui);
1542 				(void) md_unit_decopen(mnum, otyp);
1543 				mutex_enter(&ui->ui_mx);
1544 				ui->ui_lock &= ~MD_UL_OPENINPROGRESS;
1545 				cv_broadcast(&ui->ui_cv);
1546 				mutex_exit(&ui->ui_mx);
1547 				/*
1548 				 * Should be in the same state as before
1549 				 * the sp_validate.
1550 				 */
1551 				if (err != 0) {
1552 					/* close the device opened above */
1553 					md_layered_close(un->un_dev, md_oflags);
1554 					err = EIO;
1555 					goto out;
1556 				}
1557 			}
1558 			/*
1559 			 * As we're a multi-owner metadevice we need to ensure
1560 			 * that all nodes have the same idea of the status.
1561 			 * sp_validate() will mark the device as errored (if
1562 			 * it cannot read the watermark) or ok (if it was
1563 			 * previously errored but the watermark is now valid).
1564 			 * This code-path is only entered on the non-probe open
1565 			 * so we will maintain the errored state during a probe
1566 			 * call. This means the sys-admin must metarecover -m
1567 			 * to reset the soft-partition error.
1568 			 */
1569 		} else {
1570 			/* For probe, don't incur the overhead of validate */
1571 			if (!(md_oflags & MD_OFLG_PROBEDEV) &&
1572 			    (err = sp_validate(un)) != 0) {
1573 				/* close the device opened above */
1574 				md_layered_close(un->un_dev, md_oflags);
1575 				err = EIO;
1576 				goto out;
1577 			} else {
1578 				/*
1579 				 * we succeeded in validating the on disk
1580 				 * format versus the in core, so reset the
1581 				 * status if it's in error
1582 				 */
1583 				if (un->un_status == MD_SP_ERR) {
1584 					un->un_status = MD_SP_OK;
1585 				}
1586 			}
1587 		}
1588 	}
1589 
1590 	/* count open */
1591 	if ((err = md_unit_incopen(mnum, flag, otyp)) != 0)
1592 		goto out;
1593 
1594 out:
1595 	md_unit_openclose_exit(ui);
1596 	return (err);
1597 }
1598 
1599 /*
1600  * FUNCTION:	sp_close()
1601  * INPUT:	dev		- device to close.
1602  *		flag		- pass-through flag.
1603  *		otyp		- pass-through type.
1604  *		cred_p		- credentials.
1605  *		md_cflags	- close flags.
1606  * OUTPUT:	none.
1607  * RETURNS:	0		- success.
1608  *		non-zero	- err.
1609  * PURPOSE:	close a soft paritition.
1610  */
1611 /* ARGSUSED */
1612 static int
1613 sp_close(
1614 	dev_t		dev,
1615 	int		flag,
1616 	int		otyp,
1617 	cred_t		*cred_p,
1618 	int		md_cflags
1619 )
1620 {
1621 	minor_t		mnum = getminor(dev);
1622 	mdi_unit_t	*ui = MDI_UNIT(mnum);
1623 	mp_unit_t	*un;
1624 	int		err = 0;
1625 
1626 	/* grab necessary locks */
1627 	un = (mp_unit_t *)md_unit_openclose_enter(ui);
1628 
1629 	/* count closed */
1630 	if ((err = md_unit_decopen(mnum, otyp)) != 0)
1631 		goto out;
1632 
1633 	/* close devices, if necessary */
1634 	if (! md_unit_isopen(ui) || (md_cflags & MD_OFLG_PROBEDEV)) {
1635 		md_layered_close(un->un_dev, md_cflags);
1636 	}
1637 
1638 	/*
1639 	 * If a MN set and transient capabilities (eg ABR/DMR) are set,
1640 	 * clear these capabilities if this is the last close in
1641 	 * the cluster
1642 	 */
1643 	if (MD_MNSET_SETNO(MD_UN2SET(un)) &&
1644 	    (ui->ui_tstate & MD_ABR_CAP)) {
1645 		md_unit_openclose_exit(ui);
1646 		mdmn_clear_all_capabilities(mnum);
1647 		return (0);
1648 	}
1649 	/* unlock, return success */
1650 out:
1651 	md_unit_openclose_exit(ui);
1652 	return (err);
1653 }
1654 
1655 
1656 /* used in sp_dump routine */
1657 static struct buf dumpbuf;
1658 
1659 /*
1660  * FUNCTION:	sp_dump()
1661  * INPUT:	dev	- device to dump to.
1662  *		addr	- address to dump.
1663  *		blkno	- blkno on device.
1664  *		nblk	- number of blocks to dump.
1665  * OUTPUT:	none.
1666  * RETURNS:	result from bdev_dump.
1667  * PURPOSE:  This routine dumps memory to the disk.  It assumes that
1668  *           the memory has already been mapped into mainbus space.
1669  *           It is called at disk interrupt priority when the system
1670  *           is in trouble.
1671  *           NOTE: this function is defined using 32-bit arguments,
1672  *           but soft partitioning is internally 64-bit.  Arguments
1673  *           are casted where appropriate.
1674  */
1675 static int
1676 sp_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk)
1677 {
1678 	mp_unit_t	*un;
1679 	buf_t		*bp;
1680 	sp_ext_length_t	nb;
1681 	daddr_t		mapblk;
1682 	int		result;
1683 	int		more;
1684 	int		saveresult = 0;
1685 
1686 	/*
1687 	 * Don't need to grab the unit lock.
1688 	 * Cause nothing else is supposed to be happenning.
1689 	 * Also dump is not supposed to sleep.
1690 	 */
1691 	un = (mp_unit_t *)MD_UNIT(getminor(dev));
1692 
1693 	if ((diskaddr_t)blkno >= un->c.un_total_blocks)
1694 		return (EINVAL);
1695 
1696 	if (((diskaddr_t)blkno + nblk) > un->c.un_total_blocks)
1697 		return (EINVAL);
1698 
1699 	bp = &dumpbuf;
1700 	nb = (sp_ext_length_t)dbtob(nblk);
1701 	do {
1702 		bzero((caddr_t)bp, sizeof (*bp));
1703 		more = sp_mapbuf(un, (sp_ext_offset_t)blkno, nb, bp);
1704 		nblk = (int)(btodb(bp->b_bcount));
1705 		mapblk = bp->b_blkno;
1706 		result = bdev_dump(bp->b_edev, addr, mapblk, nblk);
1707 		if (result)
1708 			saveresult = result;
1709 
1710 		nb -= bp->b_bcount;
1711 		addr += bp->b_bcount;
1712 		blkno += nblk;
1713 	} while (more);
1714 
1715 	return (saveresult);
1716 }
1717 
1718 static int
1719 sp_imp_set(
1720 	set_t	setno
1721 )
1722 {
1723 	mddb_recid_t	recid;
1724 	int		gotsomething;
1725 	mddb_type_t	rec_type;
1726 	mddb_de_ic_t	*dep;
1727 	mddb_rb32_t	*rbp;
1728 	mp_unit_t	*un64;
1729 	mp_unit32_od_t	*un32;
1730 	md_dev64_t	self_devt;
1731 	minor_t		*self_id;	/* minor needs to be updated */
1732 	md_parent_t	*parent_id;	/* parent needs to be updated */
1733 	mddb_recid_t	*record_id;	/* record id needs to be updated */
1734 
1735 	gotsomething = 0;
1736 
1737 	rec_type = (mddb_type_t)md_getshared_key(setno,
1738 	    sp_md_ops.md_driver.md_drivername);
1739 	recid = mddb_makerecid(setno, 0);
1740 
1741 	while ((recid = mddb_getnextrec(recid, rec_type, 0)) > 0) {
1742 		if (mddb_getrecprivate(recid) & MD_PRV_GOTIT)
1743 			continue;
1744 
1745 		dep = mddb_getrecdep(recid);
1746 		rbp = dep->de_rb;
1747 
1748 		switch (rbp->rb_revision) {
1749 		case MDDB_REV_RB:
1750 		case MDDB_REV_RBFN:
1751 			/*
1752 			 * Small device
1753 			 */
1754 			un32 = (mp_unit32_od_t *)mddb_getrecaddr(recid);
1755 			self_id = &(un32->c.un_self_id);
1756 			parent_id = &(un32->c.un_parent);
1757 			record_id = &(un32->c.un_record_id);
1758 
1759 			if (!md_update_minor(setno, mddb_getsidenum
1760 			    (setno), un32->un_key))
1761 				goto out;
1762 			break;
1763 
1764 		case MDDB_REV_RB64:
1765 		case MDDB_REV_RB64FN:
1766 			un64 = (mp_unit_t *)mddb_getrecaddr(recid);
1767 			self_id = &(un64->c.un_self_id);
1768 			parent_id = &(un64->c.un_parent);
1769 			record_id = &(un64->c.un_record_id);
1770 
1771 			if (!md_update_minor(setno, mddb_getsidenum
1772 			    (setno), un64->un_key))
1773 				goto out;
1774 			break;
1775 		}
1776 
1777 		/*
1778 		 * If this is a top level and a friendly name metadevice,
1779 		 * update its minor in the namespace.
1780 		 */
1781 		if ((*parent_id == MD_NO_PARENT) &&
1782 		    ((rbp->rb_revision == MDDB_REV_RBFN) ||
1783 		    (rbp->rb_revision == MDDB_REV_RB64FN))) {
1784 
1785 			self_devt = md_makedevice(md_major, *self_id);
1786 			if (!md_update_top_device_minor(setno,
1787 			    mddb_getsidenum(setno), self_devt))
1788 				goto out;
1789 		}
1790 
1791 		/*
1792 		 * Update unit with the imported setno
1793 		 *
1794 		 */
1795 		mddb_setrecprivate(recid, MD_PRV_GOTIT);
1796 
1797 		*self_id = MD_MKMIN(setno, MD_MIN2UNIT(*self_id));
1798 		if (*parent_id != MD_NO_PARENT)
1799 			*parent_id = MD_MKMIN(setno, MD_MIN2UNIT(*parent_id));
1800 		*record_id = MAKERECID(setno, DBID(*record_id));
1801 
1802 		gotsomething = 1;
1803 	}
1804 
1805 out:
1806 	return (gotsomething);
1807 }
1808 
/*
 * Named services table referenced from the last slot of sp_md_ops.
 * It holds a single {NULL, 0} entry.
 * NOTE(review): presumably that entry is the list terminator, i.e. this
 * driver exports no named services -- confirm against the definition of
 * md_named_services_t.
 */
static md_named_services_t sp_named_services[] = {
	{NULL,					0}
};
1812 
/*
 * Metadevice operations vector for the soft partition driver (see mdvar.h
 * for the definition of md_ops_t).  The initializer is positional; each
 * slot is labelled below.  Slots set to NULL have no soft partition
 * specific routine.
 */
md_ops_t sp_md_ops = {
	sp_open,		/* open */
	sp_close,		/* close */
	md_sp_strategy,		/* strategy */
	NULL,			/* print */
	sp_dump,		/* dump */
	NULL,			/* read */
	NULL,			/* write */
	md_sp_ioctl,		/* ioctl */
	sp_snarf,		/* snarf */
	sp_halt,		/* halt */
	NULL,			/* aread */
	NULL,			/* awrite */
	sp_imp_set,		/* import set */
	sp_named_services	/* named services table */
};
1829 
1830 static void
1831 init_init()
1832 {
1833 	sp_parent_cache = kmem_cache_create("md_softpart_parent",
1834 	    sizeof (md_spps_t), 0, sp_parent_constructor,
1835 	    sp_parent_destructor, sp_run_queue, NULL, NULL, 0);
1836 	sp_child_cache = kmem_cache_create("md_softpart_child",
1837 	    sizeof (md_spcs_t) - sizeof (buf_t) + biosize(), 0,
1838 	    sp_child_constructor, sp_child_destructor, sp_run_queue,
1839 	    NULL, NULL, 0);
1840 }
1841 
1842 static void
1843 fini_uninit()
1844 {
1845 	kmem_cache_destroy(sp_parent_cache);
1846 	kmem_cache_destroy(sp_child_cache);
1847 	sp_parent_cache = sp_child_cache = NULL;
1848 }
1849 
/*
 * Define the module linkage.  init_init() and fini_uninit() are handed to
 * MD_PLUGIN_MISC_MODULE together with the module description string.
 * NOTE(review): presumably the macro runs them at module load and unload
 * respectively -- confirm against the macro definition.
 */
MD_PLUGIN_MISC_MODULE("soft partition module", init_init(), fini_uninit())
1852