xref: /titanic_41/usr/src/lib/lvm/libmeta/common/meta_sp.c (revision fa9e4066f08beec538e775443c5be79dd423fcab)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 /*
30  * Just in case we're not in a build environment, make sure that
31  * TEXT_DOMAIN gets set to something.
32  */
33 #if !defined(TEXT_DOMAIN)
34 #define	TEXT_DOMAIN "SYS_TEST"
35 #endif
36 
37 /*
38  * soft partition operations
39  *
40  * Soft Partitions provide a virtual disk mechanism which is used to
41  * divide a large volume into many small pieces, each appearing as a
42  * separate device.  A soft partition consists of a series of extents,
43  * each having an offset and a length.  The extents are logically
44  * contiguous, so where the first extent leaves off the second extent
45  * picks up.  Which extent a given "virtual offset" belongs to is
46  * dependent on the size of all the previous extents in the soft
47  * partition.
48  *
49  * Soft partitions are represented in memory by an extent node
50  * (sp_ext_node_t) which contains all of the information necessary to
51  * create a unit structure and update the on-disk format, called
52  * "watermarks".  These extent nodes are typically kept in a doubly
53  * linked list and are manipulated by list manipulation routines.  A
54  * list of extents may represent all of the soft partitions on a volume,
55  * a single soft partition, or perhaps just a set of extents that need
56  * to be updated.  Extent lists may be sorted by offset or by name/seq#,
57  * depending on which compare function is used.  Most of the routines
58  * require the list be sorted by offset to work, and that's the typical
59  * configuration.
60  *
61  * In order to do an allocation, knowledge of all soft partitions on the
62  * volume is required.  Then free space is determined from the space
63  * that is not allocated, and new allocations can be made from the free
64  * space.  Once the new allocations are made, a unit structure is created
65  * and the watermarks are updated.  The status is then changed to "okay"
66  * on the unit structure to commit the transaction.  If updating the
67  * watermarks fails, the unit structure is in an intermediate state and
68  * the driver will not allow access to the device.
69  *
70  * A typical sequence of events is:
71  *     1. Fetch the list of names for all soft partitions on a volume
72  *         meta_sp_get_by_component()
73  *     2. Construct an extent list from the name list
74  *         meta_sp_extlist_from_namelist()
75  *     3. Fill the gaps in the extent list with free extents
76  *         meta_sp_list_freefill()
77  *     4. Allocate from the free extents
78  *         meta_sp_alloc_by_len()
79  *         meta_sp_alloc_by_list()
80  *     5. Create the unit structure from the extent list
81  *         meta_sp_createunit()
82  *         meta_sp_updateunit()
83  *     6. Write out the watermarks
84  *         meta_sp_update_wm()
85  *     7. Set the status to "Okay"
86  *         meta_sp_setstatus()
87  *
88  */
89 
90 #include <stdio.h>
91 #include <meta.h>
92 #include "meta_repartition.h"
93 #include <sys/lvm/md_sp.h>
94 #include <sys/lvm/md_crc.h>
95 #include <strings.h>
96 #include <sys/lvm/md_mirror.h>
97 #include <sys/bitmap.h>
98 
99 extern int	md_in_daemon;
100 
/*
 * One node of a doubly linked extent list.  Nodes are allocated and
 * linked in sorted order by meta_sp_list_insert() and released by
 * meta_sp_list_free() / meta_sp_list_remove().
 */
101 typedef struct sp_ext_node {
102 	struct sp_ext_node	*ext_next;	/* next element, NULL at tail */
103 	struct sp_ext_node	*ext_prev;	/* previous element, NULL at head */
104 	sp_ext_type_t		ext_type;	/* type of extent */
105 	sp_ext_offset_t		ext_offset;	/* starting physical offset, in sectors */
106 	sp_ext_length_t		ext_length;	/* length of this node, in sectors */
107 	uint_t			ext_flags;	/* extent flags (EXTFLG_*) */
108 	uint32_t		ext_seq;	/* watermark seq no */
109 	mdname_t		*ext_namep;	/* name pointer, may be NULL */
110 	mdsetname_t		*ext_setp;	/* set pointer */
111 } sp_ext_node_t;
112 
113 /* extent flags (stored in sp_ext_node_t.ext_flags) */
114 #define	EXTFLG_UPDATE	(1)
115 
/*
 * Extent node compare function for list sorting.  Returns <0 when the
 * first node sorts before the second, >0 when it sorts after, and 0
 * when the two are equal (see meta_sp_cmp_by_offset() and
 * meta_sp_cmp_by_nameseq()).
 */
117 typedef int (*ext_cmpfunc_t)(sp_ext_node_t *, sp_ext_node_t *);
118 
119 
120 /* Function Prototypes */
121 
122 /* Debugging Functions */
123 static void meta_sp_debug(char *format, ...);
124 static void meta_sp_printunit(mp_unit_t *mp);
125 
126 /* Misc Support Functions */
127 int meta_sp_parsesize(char *s, sp_ext_length_t *szp);
128 static int meta_sp_parsesizestring(char *s, sp_ext_length_t *szp);
129 static int meta_sp_setgeom(mdname_t *np, mdname_t *compnp, mp_unit_t *mp,
130 	md_error_t *ep);
131 static int meta_sp_get_by_component(mdsetname_t *sp, mdname_t *compnp,
132     mdnamelist_t **nlpp, int force, md_error_t *ep);
133 static sp_ext_length_t meta_sp_get_default_alignment(mdsetname_t *sp,
134     mdname_t *compnp, md_error_t *ep);
135 
136 /* Extent List Manipulation Functions */
137 static int meta_sp_cmp_by_nameseq(sp_ext_node_t *e1, sp_ext_node_t *e2);
138 static int meta_sp_cmp_by_offset(sp_ext_node_t *e1, sp_ext_node_t *e2);
139 static void meta_sp_list_insert(mdsetname_t *sp, mdname_t *np,
140     sp_ext_node_t **head, sp_ext_offset_t offset, sp_ext_length_t length,
141     sp_ext_type_t type, uint_t seq, uint_t flags, ext_cmpfunc_t compare);
142 static void meta_sp_list_free(sp_ext_node_t **head);
143 static void meta_sp_list_remove(sp_ext_node_t **head, sp_ext_node_t *ext);
144 static sp_ext_length_t meta_sp_list_size(sp_ext_node_t *head,
145     sp_ext_type_t exttype, int exclude_wm);
146 static sp_ext_node_t *meta_sp_list_find(sp_ext_node_t *head,
147     sp_ext_offset_t offset);
148 static void meta_sp_list_freefill(sp_ext_node_t **extlist,
149     sp_ext_length_t size);
150 static void meta_sp_list_dump(sp_ext_node_t *head);
151 static int meta_sp_list_overlaps(sp_ext_node_t *head);
152 
153 /* Extent List Query Functions */
154 static boolean_t meta_sp_enough_space(int desired_number_of_sps,
155 	blkcnt_t desired_sp_size, sp_ext_node_t **extent_listpp,
156 	sp_ext_length_t alignment);
157 static boolean_t meta_sp_get_extent_list(mdsetname_t *mdsetnamep,
158 	mdname_t *device_mdnamep, sp_ext_node_t **extent_listpp,
159 	md_error_t *ep);
160 static boolean_t meta_sp_get_extent_list_for_drive(mdsetname_t *mdsetnamep,
161 	mddrivename_t *mddrivenamep, sp_ext_node_t **extent_listpp);
162 
163 
164 /* Extent Allocation Functions */
165 static void meta_sp_alloc_by_ext(mdsetname_t *sp, mdname_t *np,
166     sp_ext_node_t **extlist, sp_ext_node_t *free_ext,
167     sp_ext_offset_t alloc_offset, sp_ext_length_t alloc_length, uint_t seq);
168 static int meta_sp_alloc_by_len(mdsetname_t *sp, mdname_t *np,
169     sp_ext_node_t **extlist, sp_ext_length_t *lp,
170     sp_ext_offset_t last_off, sp_ext_length_t alignment);
171 static int meta_sp_alloc_by_list(mdsetname_t *sp, mdname_t *np,
172     sp_ext_node_t **extlist, sp_ext_node_t *oblist);
173 
174 /* Extent List Population Functions */
175 static int meta_sp_extlist_from_namelist(mdsetname_t *sp, mdnamelist_t *spnlp,
176     sp_ext_node_t **extlist, md_error_t *ep);
177 static int meta_sp_extlist_from_wm(mdsetname_t *sp, mdname_t *compnp,
178     sp_ext_node_t **extlist, ext_cmpfunc_t compare, md_error_t *ep);
179 
180 /* Print (metastat) Functions */
181 static int meta_sp_short_print(md_sp_t *msp, char *fname, FILE *fp,
182     mdprtopts_t options, md_error_t *ep);
183 static char *meta_sp_status_to_name(xsp_status_t xsp_status, uint_t tstate);
184 static int meta_sp_report(mdsetname_t *sp, md_sp_t *msp, mdnamelist_t **nlpp,
185     char *fname, FILE *fp, mdprtopts_t options, md_error_t *ep);
186 
187 /* Watermark Manipulation Functions */
188 static int meta_sp_update_wm(mdsetname_t *sp, md_sp_t *msp,
189     sp_ext_node_t *extlist, md_error_t *ep);
190 static int meta_sp_clear_wm(mdsetname_t *sp, md_sp_t *msp, md_error_t *ep);
191 static int meta_sp_read_wm(mdsetname_t *sp, mdname_t *compnp,
192     mp_watermark_t *wm, sp_ext_offset_t offset,  md_error_t *ep);
193 static diskaddr_t meta_sp_get_start(mdsetname_t *sp, mdname_t *compnp,
194     md_error_t *ep);
195 
196 /* Unit Structure Manipulation Functions */
197 static void meta_sp_fillextarray(mp_unit_t *mp, sp_ext_node_t *extlist);
198 static mp_unit_t *meta_sp_createunit(mdname_t *np, mdname_t *compnp,
199     sp_ext_node_t *extlist, int numexts, sp_ext_length_t len,
200     sp_status_t status, md_error_t *ep);
201 static mp_unit_t *meta_sp_updateunit(mdname_t *np,  mp_unit_t *old_un,
202     sp_ext_node_t *extlist, sp_ext_length_t grow_len, int numexts,
203     md_error_t *ep);
204 static int meta_create_sp(mdsetname_t *sp, md_sp_t *msp, sp_ext_node_t *oblist,
205     mdcmdopts_t options, sp_ext_length_t alignment, md_error_t *ep);
206 static int meta_check_sp(mdsetname_t *sp, md_sp_t *msp, mdcmdopts_t options,
207     int *repart_options, md_error_t *ep);
208 
209 /* Reset (metaclear) Functions */
210 static int meta_sp_reset_common(mdsetname_t *sp, mdname_t *np, md_sp_t *msp,
211     md_sp_reset_t reset_params, mdcmdopts_t options, md_error_t *ep);
212 
213 /* Recovery (metarecover) Functions */
214 static void meta_sp_display_exthdr(void);
215 static void meta_sp_display_ext(sp_ext_node_t *ext);
216 static int meta_sp_checkseq(sp_ext_node_t *extlist);
217 static int meta_sp_resolve_name_conflict(mdsetname_t *, mdname_t *,
218     mdname_t **, md_error_t *);
219 static int meta_sp_validate_wm(mdsetname_t *sp, mdname_t *np,
220     mdcmdopts_t options, md_error_t *ep);
221 static int meta_sp_validate_unit(mdsetname_t *sp, mdname_t *compnp,
222     mdcmdopts_t options, md_error_t *ep);
223 static int meta_sp_validate_wm_and_unit(mdsetname_t *sp, mdname_t *np,
224     mdcmdopts_t options, md_error_t *ep);
225 static int meta_sp_validate_exts(mdname_t *np, sp_ext_node_t *wmext,
226     sp_ext_node_t *unitext, md_error_t *ep);
227 static int meta_sp_recover_from_wm(mdsetname_t *sp, mdname_t *compnp,
228     mdcmdopts_t options, md_error_t *ep);
229 static int meta_sp_recover_from_unit(mdsetname_t *sp, mdname_t *np,
230     mdcmdopts_t options, md_error_t *ep);
231 
232 /*
233  * Private Constants
234  */
235 
/*
 * NOTE(review): presumably the "force" argument value for
 * meta_sp_get_by_component() (1 = reload the cached namelist); the
 * uses are outside this part of the file - confirm.
 */
236 static const int FORCE_RELOAD_CACHE = 1;
/* placeholder values for callers that have no flags/offset/sequence */
237 static const uint_t NO_FLAGS = 0;
238 static const sp_ext_offset_t NO_OFFSET = 0ULL;
239 static const uint_t NO_SEQUENCE_NUMBER = 0;
240 static const int ONE_SOFT_PARTITION = 1;
241 
/*
 * Bitmap with room for one bit per unit (MD_MAXUNITS bits).
 * NOTE(review): the name suggests it records which parent devices have
 * already been printed; the code using it is not visible here.
 */
242 static unsigned long sp_parent_printed[BT_BITOUL(MD_MAXUNITS)];
243 
244 #define	TEST_SOFT_PARTITION_NAMEP NULL
245 #define	TEST_SETNAMEP NULL
246 
/*
 * NOTE(review): presumably values for the exclude_wm argument of
 * meta_sp_list_size() - confirm against its definition.
 */
247 #define	EXCLUDE_WM	(1)
248 #define	INCLUDE_WM	(0)
249 
/* alignment value meaning "no default alignment" */
250 #define	SP_UNALIGNED	(0LL)
251 
252 /*
253  * **************************************************************************
254  *                          Debugging Functions                             *
255  * **************************************************************************
256  */
257 
258 /*PRINTFLIKE1*/
259 static void
260 meta_sp_debug(char *format, ...)
261 {
262 	static int debug;
263 	static int debug_set = 0;
264 	va_list ap;
265 
266 	if (!debug_set) {
267 		debug = getenv(META_SP_DEBUG) ? 1 : 0;
268 		debug_set = 1;
269 	}
270 
271 	if (debug) {
272 		va_start(ap, format);
273 		(void) vfprintf(stderr, format, ap);
274 		va_end(ap);
275 	}
276 }
277 
278 static void
279 meta_sp_printunit(mp_unit_t *mp)
280 {
281 	int i;
282 
283 	if (mp == NULL)
284 		return;
285 
286 	/* print the common fields we know about */
287 	(void) fprintf(stderr, "\tmp->c.un_type: %d\n", mp->c.un_type);
288 	(void) fprintf(stderr, "\tmp->c.un_size: %u\n", mp->c.un_size);
289 	(void) fprintf(stderr, "\tmp->c.un_self_id: %lu\n", MD_SID(mp));
290 
291 	/* sp-specific fields */
292 	(void) fprintf(stderr, "\tmp->un_status: %u\n", mp->un_status);
293 	(void) fprintf(stderr, "\tmp->un_numexts: %u\n", mp->un_numexts);
294 	(void) fprintf(stderr, "\tmp->un_length: %llu\n", mp->un_length);
295 	(void) fprintf(stderr, "\tmp->un_dev(32): 0x%llx\n", mp->un_dev);
296 	(void) fprintf(stderr, "\tmp->un_dev(64): 0x%llx\n", mp->un_dev);
297 	(void) fprintf(stderr, "\tmp->un_key: %d\n", mp->un_key);
298 
299 	/* print extent information */
300 	(void) fprintf(stderr, "\tExt#\tvoff\t\tpoff\t\tLen\n");
301 	for (i = 0; i < mp->un_numexts; i++) {
302 		(void) fprintf(stderr, "\t%d\t%llu\t\t%llu\t\t%llu\n", i,
303 		    mp->un_ext[i].un_voff, mp->un_ext[i].un_poff,
304 		    mp->un_ext[i].un_len);
305 	}
306 }
307 
308 /*
309  * FUNCTION:    meta_sp_parsesize()
310  * INPUT:       s       - the string to parse
311  * OUTPUT:      *szp    - disk block count (0 for "all")
312  * RETURNS:     -1 for error, 0 for success
313  * PURPOSE:     parses the command line parameter that specifies the
314  *              requested size of a soft partition.  The input string
315  *              is either the literal "all" or a numeric value
316  *              followed by a single character, b for disk blocks, k
317  *              for kilobytes, m for megabytes, g for gigabytes, or t
318  *              for terabytes.  p for petabytes and e for exabytes
319  *              have been added as undocumented features for future
320  *              expansion.  For example, 100m is 100 megabytes, while
321  *              50g is 50 gigabytes.  All values are rounded up to the
322  *              nearest block size.
323  */
324 int
325 meta_sp_parsesize(char *s, sp_ext_length_t *szp)
326 {
327 	if (s == NULL || szp == NULL) {
328 		return (-1);
329 	}
330 
331 	/* Check for literal "all" */
332 	if (strcasecmp(s, "all") == 0) {
333 		*szp = 0;
334 		return (0);
335 	}
336 
337 	return (meta_sp_parsesizestring(s, szp));
338 }
339 
340 /*
341  * FUNCTION:	meta_sp_parsesizestring()
342  * INPUT:	s	- the string to parse
343  * OUTPUT:	*szp	- disk block count
344  * RETURNS:	-1 for error, 0 for success
345  * PURPOSE:	parses a string that specifies size. The input string is a
346  *		numeric value followed by a single character, b for disk blocks,
347  *		k for kilobytes, m for megabytes, g for gigabytes, or t for
348  *		terabytes.  p for petabytes and e for exabytes have been added
349  *		as undocumented features for future expansion.  For example,
350  *		100m is 100 megabytes, while 50g is 50 gigabytes.  All values
351  *		are rounded up to the nearest block size.
352  */
353 static int
354 meta_sp_parsesizestring(char *s, sp_ext_length_t *szp)
355 {
356 	sp_ext_length_t	len = 0;
357 	char		len_type[2];
358 
359 	if (s == NULL || szp == NULL) {
360 		return (-1);
361 	}
362 
363 	/*
364 	 * make sure block offset does not overflow 2^64 bytes.
365 	 */
366 	if ((sscanf(s, "%llu%1[BbKkMmGgTt]", &len, len_type) != 2) ||
367 	    (len == 0LL) ||
368 	    (len > (1LL << (64 - DEV_BSHIFT))))
369 		return (-1);
370 
371 	switch (len_type[0]) {
372 	case 'B':
373 	case 'b':
374 		len = lbtodb(roundup(len * DEV_BSIZE, DEV_BSIZE));
375 		break;
376 	case 'K':
377 	case 'k':
378 		len = lbtodb(roundup(len * 1024ULL, DEV_BSIZE));
379 		break;
380 	case 'M':
381 	case 'm':
382 		len = lbtodb(roundup(len * 1024ULL*1024ULL, DEV_BSIZE));
383 		break;
384 	case 'g':
385 	case 'G':
386 		len = lbtodb(roundup(len * 1024ULL*1024ULL*1024ULL, DEV_BSIZE));
387 		break;
388 	case 't':
389 	case 'T':
390 		len = lbtodb(roundup(len * 1024ULL*1024ULL*1024ULL*1024ULL,
391 		    DEV_BSIZE));
392 		break;
393 	case 'p':
394 	case 'P':
395 		len = lbtodb(roundup(
396 		    len * 1024ULL*1024ULL*1024ULL*1024ULL*1024ULL,
397 		    DEV_BSIZE));
398 		break;
399 	case 'e':
400 	case 'E':
401 		len = lbtodb(roundup(
402 		    len * 1024ULL*1024ULL*1024ULL*1024ULL*1024ULL*1024ULL,
403 		    DEV_BSIZE));
404 		break;
405 	default:
406 		/* error */
407 		return (-1);
408 	}
409 
410 	*szp = len;
411 	return (0);
412 }
413 
414 /*
415  * FUNCTION:	meta_sp_setgeom()
416  * INPUT:	np      - the underlying device to setup geometry for
417  *		compnp	- the underlying device to setup geometry for
418  *		mp	- the unit structure to set the geometry for
419  * OUTPUT:	ep	- return error pointer
420  * RETURNS:	int	- -1 if error, 0 otherwise
421  * PURPOSE:	establishes geometry information for a device
422  */
423 static int
424 meta_sp_setgeom(
425 	mdname_t	*np,
426 	mdname_t	*compnp,
427 	mp_unit_t	*mp,
428 	md_error_t	*ep
429 )
430 {
431 	mdgeom_t	*geomp;
432 	uint_t		round_cyl = 0;
433 
434 	if ((geomp = metagetgeom(compnp, ep)) == NULL)
435 		return (-1);
436 	if (meta_setup_geom((md_unit_t *)mp, np, geomp, geomp->write_reinstruct,
437 	    geomp->read_reinstruct, round_cyl, ep) != 0)
438 		return (-1);
439 
440 	return (0);
441 }
442 
443 /*
444  * FUNCTION:	meta_sp_setstatus()
445  * INPUT:	sp	- the set name for the devices to set the status on
446  *		minors	- an array of minor numbers of devices to set status on
447  *		num_units - number of entries in the array
448  *		status	- status value to set all units to
449  * OUTPUT:	ep	- return error pointer
450  * RETURNS:	int	- -1 if error, 0 success
451  * PURPOSE:	sets the status of one or more soft partitions to the
452  *		requested value
453  */
454 int
455 meta_sp_setstatus(
456 	mdsetname_t	*sp,
457 	minor_t		*minors,
458 	int		num_units,
459 	sp_status_t	status,
460 	md_error_t	*ep
461 )
462 {
463 	md_sp_statusset_t	status_params;
464 
465 	assert(minors != NULL);
466 
467 	/* update status of all soft partitions to the status passed in */
468 	(void) memset(&status_params, 0, sizeof (status_params));
469 	status_params.num_units = num_units;
470 	status_params.new_status = status;
471 	status_params.size = num_units * sizeof (minor_t);
472 	status_params.minors = (uintptr_t)minors;
473 	MD_SETDRIVERNAME(&status_params, MD_SP, sp->setno);
474 	if (metaioctl(MD_IOC_SPSTATUS, &status_params, &status_params.mde,
475 	    NULL) != 0) {
476 		(void) mdstealerror(ep, &status_params.mde);
477 		return (-1);
478 	}
479 	return (0);
480 }
481 
482 /*
483  * FUNCTION:	meta_get_sp_names()
484  * INPUT:	sp	- the set name to get soft partitions from
485  *		options	- options from the command line
486  * OUTPUT:	nlpp	- list of all soft partition names
487  *		ep	- return error pointer
488  * RETURNS:	int	- -1 if error, 0 success
489  * PURPOSE:	returns a list of all soft partitions in the metadb
490  *		for all devices in the specified set
491  */
492 int
493 meta_get_sp_names(
494 	mdsetname_t	*sp,
495 	mdnamelist_t	**nlpp,
496 	int		options,
497 	md_error_t	*ep
498 )
499 {
500 	return (meta_get_names(MD_SP, sp, nlpp, options, ep));
501 }
502 
503 /*
504  * FUNCTION:	meta_get_by_component()
505  * INPUT:	sp	- the set name to get soft partitions from
506  *		compnp	- the name of the device containing the soft
507  *			  partitions that will be returned
508  *		force	- 0 - reads cached namelist if available,
509  *			  1 - reloads cached namelist, frees old namelist
510  * OUTPUT:	nlpp	- list of all soft partition names
511  *		ep	- return error pointer
512  * RETURNS:	int	- -1 error, otherwise the number of soft partitions
513  *			  found on the component (0 = none found).
514  * PURPOSE:	returns a list of all soft partitions on a given device
515  *		from the metadb information
516  */
517 static int
518 meta_sp_get_by_component(
519 	mdsetname_t	*sp,
520 	mdname_t	*compnp,
521 	mdnamelist_t	**nlpp,
522 	int		force,
523 	md_error_t	*ep
524 )
525 {
526 	static mdnamelist_t	*cached_list = NULL;	/* cached namelist */
527 	static int		cached_count = 0;	/* cached count */
528 	mdnamelist_t		*spnlp = NULL;		/* all sp names */
529 	mdnamelist_t		*namep;			/* list iterator */
530 	mdnamelist_t		**tailpp = nlpp;	/* namelist tail */
531 	mdnamelist_t		**cachetailpp;		/* cache tail */
532 	md_sp_t			*msp;			/* unit structure */
533 	int			count = 0;		/* count of sp's */
534 	int			err;
535 	mdname_t		*curnp;
536 
537 	if ((cached_list != NULL) && (!force)) {
538 		/* return a copy of the cached list */
539 		for (namep = cached_list; namep != NULL; namep = namep->next)
540 			tailpp = meta_namelist_append_wrapper(tailpp,
541 			    namep->namep);
542 		return (cached_count);
543 	}
544 
545 	/* free the cache and reset values to zeros to prepare for a new list */
546 	metafreenamelist(cached_list);
547 	cached_count = 0;
548 	cached_list = NULL;
549 	cachetailpp = &cached_list;
550 	*nlpp = NULL;
551 
552 	/* get all the softpartitions first of all */
553 	if (meta_get_sp_names(sp, &spnlp, 0, ep) < 0)
554 		return (-1);
555 
556 	/*
557 	 * Now for each sp, see if it resides on the component we
558 	 * are interested in, if so then add it to our list
559 	 */
560 	for (namep = spnlp; namep != NULL; namep = namep->next) {
561 		curnp = namep->namep;
562 
563 		/* get the unit structure */
564 		if ((msp = meta_get_sp_common(sp, curnp, 0, ep)) == NULL)
565 			continue;
566 
567 		/*
568 		 * If the current soft partition is not on the same
569 		 * component, continue the search.  If it is on the same
570 		 * component, add it to our namelist.
571 		 */
572 		err = meta_check_samedrive(compnp, msp->compnamep, ep);
573 		if (err <= 0) {
574 			/* not on the same device, check the next one */
575 			continue;
576 		}
577 
578 		/* it's on the same drive */
579 
580 		/*
581 		 * Check for overlapping partitions if the component is not
582 		 * a metadevice.
583 		 */
584 		if (!metaismeta(msp->compnamep)) {
585 			/*
586 			 * if they're on the same drive, neither
587 			 * should be a metadevice if one isn't
588 			 */
589 			assert(!metaismeta(compnp));
590 
591 			if (meta_check_overlap(msp->compnamep->cname,
592 			    compnp, 0, -1, msp->compnamep, 0, -1, ep) == 0)
593 				continue;
594 
595 			/* in this case it's not an error for them to overlap */
596 			mdclrerror(ep);
597 		}
598 
599 		/* Component is on the same device, add to the used list */
600 		tailpp = meta_namelist_append_wrapper(tailpp, curnp);
601 		cachetailpp = meta_namelist_append_wrapper(cachetailpp,
602 		    curnp);
603 
604 		++count;
605 		++cached_count;
606 	}
607 
608 	assert(count == cached_count);
609 	return (count);
610 
611 out:
612 	metafreenamelist(*nlpp);
613 	*nlpp = NULL;
614 	return (-1);
615 }
616 
617 /*
618  * FUNCTION:    meta_sp_get_default_alignment()
619  * INPUT:       sp      - the pertinent set name
620  *              compnp  - the name of the underlying component
621  * OUTPUT:      ep      - return error pointer
622  * RETURNS:     sp_ext_length_t =0: no default alignment
623  *                              >0: default alignment
624  * PURPOSE:     returns the default alignment for soft partitions to
625  *              be built on top of the specified component or
626  *              metadevice
627  */
628 static sp_ext_length_t
629 meta_sp_get_default_alignment(
630 	mdsetname_t	*sp,
631 	mdname_t	*compnp,
632 	md_error_t	*ep
633 )
634 {
635 	sp_ext_length_t	a = SP_UNALIGNED;
636 	char		*mname;
637 
638 	assert(compnp != NULL);
639 
640 	/*
641 	 * We treat raw devices as opaque, and assume nothing about
642 	 * their alignment requirements.
643 	 */
644 	if (!metaismeta(compnp))
645 		return (SP_UNALIGNED);
646 
647 	/*
648 	 * We already know it's a metadevice from the previous test;
649 	 * metagetmiscname() will tell us which metadevice type we
650 	 * have
651 	 */
652 	mname = metagetmiscname(compnp, ep);
653 	if (mname == NULL)
654 		goto out;
655 
656 	/*
657 	 * For a mirror, we want to deal with the stripe that is the
658 	 * primary side.  If it happens to be asymmetrically
659 	 * configured, there is no simple way to fake a universal
660 	 * alignment.  There's a chance that the least common
661 	 * denominator of the set of interlaces from all stripes of
662 	 * all submirrors would do it, but nobody that really cared
663 	 * that much about this issue would create an asymmetric
664 	 * config to start with.
665 	 *
666 	 * If the component underlying the soft partition is a mirror,
667 	 * then at the exit of this loop, compnp will have been
668 	 * updated to describe the first active submirror.
669 	 */
670 	if (strcmp(mname, MD_MIRROR) == 0) {
671 		md_mirror_t	*mp;
672 		int		smi;
673 		md_submirror_t	*smp;
674 
675 		mp = meta_get_mirror(sp, compnp, ep);
676 		if (mp == NULL)
677 			goto out;
678 
679 		for (smi = 0; smi < NMIRROR; smi++) {
680 
681 			smp = &mp->submirrors[smi];
682 			if (smp->state == SMS_UNUSED)
683 				continue;
684 
685 			compnp = smp->submirnamep;
686 			assert(compnp != NULL);
687 
688 			mname = metagetmiscname(compnp, ep);
689 			if (mname == NULL)
690 				goto out;
691 
692 			break;
693 		}
694 
695 		if (smi == NMIRROR)
696 			goto out;
697 	}
698 
699 	/*
700 	 * Handle stripes and submirrors identically; just return the
701 	 * interlace of the first row.
702 	 */
703 	if (strcmp(mname, MD_STRIPE) == 0) {
704 		md_stripe_t	*stp;
705 
706 		stp = meta_get_stripe(sp, compnp, ep);
707 		if (stp == NULL)
708 			goto out;
709 
710 		a = stp->rows.rows_val[0].interlace;
711 		goto out;
712 	}
713 
714 	/*
715 	 * Raid is even more straightforward; the interlace applies to
716 	 * the entire device.
717 	 */
718 	if (strcmp(mname, MD_RAID) == 0) {
719 		md_raid_t	*rp;
720 
721 		rp = meta_get_raid(sp, compnp, ep);
722 		if (rp == NULL)
723 			goto out;
724 
725 		a = rp->interlace;
726 		goto out;
727 	}
728 
729 	/*
730 	 * If we have arrived here with the alignment still not set,
731 	 * then we expect the error to have been set by one of the
732 	 * routines we called.  If neither is the case, something has
733 	 * really gone wrong above.  (Probably the submirror walk
734 	 * failed to produce a valid submirror, but that would be
735 	 * really bad...)
736 	 */
737 out:
738 	meta_sp_debug("meta_sp_get_default_alignment: miscname %s, "
739 	    "alignment %lld\n", (mname == NULL) ? "NULL" : mname, a);
740 
741 	if (getenv(META_SP_DEBUG) && !mdisok(ep)) {
742 		mde_perror(ep, NULL);
743 	}
744 
745 	assert((a > 0) || (!mdisok(ep)));
746 
747 	return (a);
748 }
749 
750 
751 
752 /*
753  * FUNCTION:	meta_check_insp()
754  * INPUT:	sp	- the set name for the device to check
755  *		np	- the name of the device to check
756  *		slblk	- the starting offset of the device to check
757  *		nblks	- the number of blocks in the device to check
758  * OUTPUT:	ep	- return error pointer
759  * RETURNS:	int	-  0 - device contains soft partitions
760  *			  -1 - device does not contain soft partitions
761  * PURPOSE:	determines whether a device contains any soft partitions
762  */
763 /* ARGSUSED */
764 int
765 meta_check_insp(
766 	mdsetname_t	*sp,
767 	mdname_t	*np,
768 	diskaddr_t	slblk,
769 	diskaddr_t	nblks,
770 	md_error_t	*ep
771 )
772 {
773 	mdnamelist_t	*spnlp = NULL;	/* soft partition name list */
774 	int		count;
775 	int		rval;
776 
777 	/* check set pointer */
778 	assert(sp != NULL);
779 
780 	/* find all soft partitions on the component */
781 	count = meta_sp_get_by_component(sp, np, &spnlp, 0, ep);
782 
783 	if (count == -1) {
784 		rval = -1;
785 	} else if (count > 0) {
786 		rval = mduseerror(ep, MDE_ALREADY, np->dev,
787 		    spnlp->namep->cname, np->cname);
788 	} else {
789 		rval = 0;
790 	}
791 
792 	metafreenamelist(spnlp);
793 	return (rval);
794 }
795 
796 /*
797  * **************************************************************************
798  *                    Extent List Manipulation Functions                    *
799  * **************************************************************************
800  */
801 
802 /*
803  * FUNCTION:	meta_sp_cmp_by_nameseq()
804  * INPUT:	e1	- first node to compare
805  *		e2	- second node to compare
806  * OUTPUT:	none
807  * RETURNS:	int	- =0 - nodes are equal
808  *			  <0 - e1 should go before e2
809  *			  >0 - e1 should go after e2
810  * PURPOSE:	used for sorted list inserts to build a list sorted by
811  *		name first and sequence number second.
812  */
813 static int
814 meta_sp_cmp_by_nameseq(sp_ext_node_t *e1, sp_ext_node_t *e2)
815 {
816 	int rval;
817 
818 	if (e1->ext_namep == NULL)
819 		return (1);
820 	if (e2->ext_namep == NULL)
821 		return (-1);
822 	if ((rval = strcmp(e1->ext_namep->cname, e2->ext_namep->cname)) != 0)
823 		return (rval);
824 
825 	/* the names are equal, compare sequence numbers */
826 	if (e1->ext_seq > e2->ext_seq)
827 		return (1);
828 	if (e1->ext_seq < e2->ext_seq)
829 		return (-1);
830 	/* sequence numbers are also equal */
831 	return (0);
832 }
833 
834 /*
835  * FUNCTION:	meta_sp_cmp_by_offset()
836  * INPUT:	e1	- first node to compare
837  *		e2	- second node to compare
838  * OUTPUT:	none
839  * RETURNS:	int	- =0 - nodes are equal
840  *			  <0 - e1 should go before e2
841  *			  >0 - e1 should go after e2
842  * PURPOSE:	used for sorted list inserts to build a list sorted by offset
843  */
844 static int
845 meta_sp_cmp_by_offset(sp_ext_node_t *e1, sp_ext_node_t *e2)
846 {
847 	if (e1->ext_offset > e2->ext_offset)
848 		return (1);
849 	if (e1->ext_offset < e2->ext_offset)
850 		return (-1);
851 	/* offsets are equal */
852 	return (0);
853 }
854 
855 /*
856  * FUNCTION:	meta_sp_list_insert()
857  * INPUT:	sp	- the set name for the device the node belongs to
858  *		np	- the name of the device the node belongs to
859  *		head	- the head of the list, must be NULL for empty list
860  *		offset	- the physical offset of this extent in sectors
861  *		length	- the length of this extent in sectors
862  *		type	- the type of the extent being inserted
863  *		seq	- the sequence number of the extent being inserted
864  *		flags	- extent flags (eg. whether it needs to be updated)
865  *		compare	- the compare function to use
866  * OUTPUT:	head	- points to the new head if a node was inserted
867  *			  at the beginning
868  * RETURNS:	void
869  * PURPOSE:	inserts an extent node into a sorted doubly linked list.
870  *		The sort order is determined by the compare function.
871  *		Memory is allocated for the node in this function and it
872  *		is up to the caller to free it, possibly using
873  *		meta_sp_list_free().  If a node is inserted at the
874  *		beginning of the list, the head pointer is updated to
875  *		point to the new first node.
876  */
877 static void
878 meta_sp_list_insert(
879 	mdsetname_t	*sp,
880 	mdname_t	*np,
881 	sp_ext_node_t	**head,
882 	sp_ext_offset_t	offset,
883 	sp_ext_length_t	length,
884 	sp_ext_type_t	type,
885 	uint_t		seq,
886 	uint_t		flags,
887 	ext_cmpfunc_t	compare
888 )
889 {
890 	sp_ext_node_t	*newext;
891 	sp_ext_node_t	*curext;
892 
893 	assert(head != NULL);
894 
895 	/* Don't bother adding zero length nodes */
896 	if (length == 0ULL)
897 		return;
898 
899 	/* allocate and fill in new ext_node */
900 	newext = Zalloc(sizeof (sp_ext_node_t));
901 
902 	newext->ext_offset = offset;
903 	newext->ext_length = length;
904 	newext->ext_flags = flags;
905 	newext->ext_type = type;
906 	newext->ext_seq = seq;
907 	newext->ext_setp = sp;
908 	newext->ext_namep = np;
909 
910 	/* first node in the list */
911 	if (*head == NULL) {
912 		newext->ext_next = newext->ext_prev = NULL;
913 		*head = newext;
914 	} else if ((*compare)(*head, newext) >= 0) {
915 		/* the first node has a bigger offset, so insert before it */
916 		assert((*head)->ext_prev == NULL);
917 
918 		newext->ext_prev = NULL;
919 		newext->ext_next = *head;
920 		(*head)->ext_prev = newext;
921 		*head = newext;
922 	} else {
923 		/*
924 		 * find the next node whose offset is greater than
925 		 * the one we want to insert, or the end of the list.
926 		 */
927 		for (curext = *head;
928 		    (curext->ext_next != NULL) &&
929 		    ((*compare)(curext->ext_next, newext) < 0);
930 		    (curext = curext->ext_next))
931 			;
932 
933 		/* link the new node in after the current node */
934 		newext->ext_next = curext->ext_next;
935 		newext->ext_prev = curext;
936 
937 		if (curext->ext_next != NULL)
938 			curext->ext_next->ext_prev = newext;
939 
940 		curext->ext_next = newext;
941 	}
942 }
943 
944 /*
945  * FUNCTION:	meta_sp_list_free()
946  * INPUT:	head	- the head of the list, must be NULL for empty list
947  * OUTPUT:	head	- points to NULL on return
948  * RETURNS:	void
949  * PURPOSE:	walks a double linked extent list and frees each node
950  */
951 static void
952 meta_sp_list_free(sp_ext_node_t **head)
953 {
954 	sp_ext_node_t	*ext;
955 	sp_ext_node_t	*next;
956 
957 	assert(head != NULL);
958 
959 	ext = *head;
960 	while (ext) {
961 		next = ext->ext_next;
962 		Free(ext);
963 		ext = next;
964 	}
965 	*head = NULL;
966 }
967 
968 /*
969  * FUNCTION:	meta_sp_list_remove()
970  * INPUT:	head	- the head of the list, must be NULL for empty list
971  *		ext	- the extent to remove, must be a member of the list
972  * OUTPUT:	head	- points to the new head of the list
973  * RETURNS:	void
974  * PURPOSE:	unlinks the node specified by ext from the list and
975  *		frees it, possibly moving the head pointer forward if
976  *		the head is the node being removed.
977  */
978 static void
979 meta_sp_list_remove(sp_ext_node_t **head, sp_ext_node_t *ext)
980 {
981 	assert(head != NULL);
982 	assert(*head != NULL);
983 
984 	if (*head == ext)
985 		*head = ext->ext_next;
986 
987 	if (ext->ext_prev != NULL)
988 		ext->ext_prev->ext_next = ext->ext_next;
989 	if (ext->ext_next != NULL)
990 		ext->ext_next->ext_prev = ext->ext_prev;
991 	Free(ext);
992 }
993 
994 /*
995  * FUNCTION:	meta_sp_list_size()
996  * INPUT:	head	- the head of the list, must be NULL for empty list
997  *		exttype	- the type of the extents to sum
998  *		exclude_wm - subtract space for extent headers from total
999  * OUTPUT:	none
1000  * RETURNS:	sp_ext_length_t	- the sum of all of the lengths
1001  * PURPOSE:	sums the lengths of all extents in the list matching the
1002  *		specified type.  This could be used for computing the
1003  *		amount of free or used space, for example.
1004  */
1005 static sp_ext_length_t
1006 meta_sp_list_size(sp_ext_node_t *head, sp_ext_type_t exttype, int exclude_wm)
1007 {
1008 	sp_ext_node_t	*ext;
1009 	sp_ext_length_t	size = 0LL;
1010 
1011 	for (ext = head; ext != NULL; ext = ext->ext_next)
1012 		if (ext->ext_type == exttype)
1013 			size += ext->ext_length -
1014 			    ((exclude_wm) ? MD_SP_WMSIZE : 0);
1015 
1016 	return (size);
1017 }
1018 
1019 /*
1020  * FUNCTION:	meta_sp_list_find()
1021  * INPUT:	head	- the head of the list, must be NULL for empty list
1022  *		offset	- the offset contained by the node to find
1023  * OUTPUT:	none
1024  * RETURNS:	sp_ext_node_t *	- the node containing the requested offset
1025  *				  or NULL if no such nodes were found.
1026  * PURPOSE:	finds a node in a list containing the requested offset
1027  *		(inclusive).  If multiple nodes contain this offset then
1028  *		only the first will be returned, though typically these
1029  *		lists are managed with non-overlapping nodes.
1030  *
1031  *		*The list MUST be sorted by offset for this function to work.*
1032  */
1033 static sp_ext_node_t *
1034 meta_sp_list_find(
1035 	sp_ext_node_t	*head,
1036 	sp_ext_offset_t	offset
1037 )
1038 {
1039 	sp_ext_node_t	*ext;
1040 
1041 	for (ext = head; ext != NULL; ext = ext->ext_next) {
1042 		/* check if the offset lies within this extent */
1043 		if ((offset >= ext->ext_offset) &&
1044 		    (offset < ext->ext_offset + ext->ext_length)) {
1045 			/*
1046 			 * the requested extent should always be a
1047 			 * subset of an extent in the list.
1048 			 */
1049 			return (ext);
1050 		}
1051 	}
1052 	return (NULL);
1053 }
1054 
1055 /*
1056  * FUNCTION:	meta_sp_list_freefill()
1057  * INPUT:	head	- the head of the list, must be NULL for empty list
1058  *		size	- the size of the volume this extent list is
1059  *			  representing
1060  * OUTPUT:	head	- the new head of the list
1061  * RETURNS:	void
1062  * PURPOSE:	finds gaps in the extent list and fills them with a free
1063  *		node.  If there is a gap at the beginning the head
1064  *		pointer will be changed to point to the new free node.
1065  *		If there is free space at the end, the last free extent
1066  *		will extend all the way out to the size specified.
1067  *
1068  *		*The list MUST be sorted by offset for this function to work.*
1069  */
1070 static void
1071 meta_sp_list_freefill(
1072 	sp_ext_node_t	**head,
1073 	sp_ext_length_t	size
1074 )
1075 {
1076 	sp_ext_node_t	*ext;
1077 	sp_ext_offset_t	curoff = 0LL;
1078 
1079 	for (ext = *head; ext != NULL; ext = ext->ext_next) {
1080 		if (curoff < ext->ext_offset)
1081 			meta_sp_list_insert(NULL, NULL, head,
1082 			    curoff, ext->ext_offset - curoff,
1083 			    EXTTYP_FREE, 0, 0, meta_sp_cmp_by_offset);
1084 		curoff = ext->ext_offset + ext->ext_length;
1085 	}
1086 
1087 	/* pad inverse list out to the end */
1088 	if (curoff < size)
1089 		meta_sp_list_insert(NULL, NULL, head, curoff, size - curoff,
1090 		    EXTTYP_FREE, 0, 0, meta_sp_cmp_by_offset);
1091 
1092 	if (getenv(META_SP_DEBUG)) {
1093 		meta_sp_debug("meta_sp_list_freefill: Extent list with "
1094 		    "holes freefilled:\n");
1095 		meta_sp_list_dump(*head);
1096 	}
1097 }
1098 
1099 /*
1100  * FUNCTION:	meta_sp_list_dump()
1101  * INPUT:	head	- the head of the list, must be NULL for empty list
1102  * OUTPUT:	none
1103  * RETURNS:	void
1104  * PURPOSE:	dumps the entire extent list to stdout for easy debugging
1105  */
1106 static void
1107 meta_sp_list_dump(sp_ext_node_t *head)
1108 {
1109 	sp_ext_node_t	*ext;
1110 
1111 	meta_sp_debug("meta_sp_list_dump: dumping extent list:\n");
1112 	meta_sp_debug("%5s %10s %5s %7s %10s %10s %5s %10s %10s\n", "Name",
1113 	    "Addr", "Seq#", "Type", "Offset", "Length", "Flags", "Prev",
1114 	    "Next");
1115 	for (ext = head; ext != NULL; ext = ext->ext_next) {
1116 		if (ext->ext_namep != NULL)
1117 			meta_sp_debug("%5s", ext->ext_namep->cname);
1118 		else
1119 			meta_sp_debug("%5s", "NONE");
1120 
1121 		meta_sp_debug("%10p %5u ", (void *) ext, ext->ext_seq);
1122 		switch (ext->ext_type) {
1123 		case EXTTYP_ALLOC:
1124 			meta_sp_debug("%7s ", "ALLOC");
1125 			break;
1126 		case EXTTYP_FREE:
1127 			meta_sp_debug("%7s ", "FREE");
1128 			break;
1129 		case EXTTYP_END:
1130 			meta_sp_debug("%7s ", "END");
1131 			break;
1132 		case EXTTYP_RESERVED:
1133 			meta_sp_debug("%7s ", "RESV");
1134 			break;
1135 		default:
1136 			meta_sp_debug("%7s ", "INVLD");
1137 			break;
1138 		}
1139 
1140 		meta_sp_debug("%10llu %10llu %5u %10p %10p\n",
1141 		    ext->ext_offset, ext->ext_length,
1142 		    ext->ext_flags, (void *) ext->ext_prev,
1143 		    (void *) ext->ext_next);
1144 	}
1145 	meta_sp_debug("\n");
1146 }
1147 
1148 /*
1149  * FUNCTION:	meta_sp_list_overlaps()
1150  * INPUT:	head	- the head of the list, must be NULL for empty list
1151  * OUTPUT:	none
1152  * RETURNS:	int	- 1 if extents overlap, 0 if ok
1153  * PURPOSE:	checks a list for overlaps.  The list MUST be sorted by
1154  *		offset for this function to work properly.
1155  */
1156 static int
1157 meta_sp_list_overlaps(sp_ext_node_t *head)
1158 {
1159 	sp_ext_node_t	*ext;
1160 
1161 	for (ext = head; ext->ext_next != NULL; ext = ext->ext_next) {
1162 		if (ext->ext_offset + ext->ext_length >
1163 		    ext->ext_next->ext_offset)
1164 			return (1);
1165 	}
1166 	return (0);
1167 }
1168 
1169 /*
1170  * **************************************************************************
1171  *                        Extent Allocation Functions                       *
1172  * **************************************************************************
1173  */
1174 
1175 /*
1176  * FUNCTION:	meta_sp_alloc_by_ext()
1177  * INPUT:	sp	- the set name for the device the node belongs to
1178  *		np	- the name of the device the node belongs to
1179  *		head	- the head of the list, must be NULL for empty list
1180  *		free_ext	- the free extent being allocated from
1181  *		alloc_offset	- the offset of the allocation
1182  *		alloc_len	- the length of the allocation
1183  *		seq		- the sequence number of the allocation
1184  * OUTPUT:	head	- the new head pointer
1185  * RETURNS:	void
1186  * PURPOSE:	allocates a portion of the free extent free_ext.  The
1187  *		allocated portion starts at alloc_offset and is
1188  *		alloc_length long.  Both (alloc_offset) and (alloc_offset +
1189  *		alloc_length) must be contained within the free extent.
1190  *
1191  *		The free extent is split into as many as 3 pieces - a
1192  *		free extent containing [ free_offset .. alloc_offset ), an
1193  *		allocated extent containing the range [ alloc_offset ..
1194  *		alloc_end ], and another free extent containing the
1195  *		range ( alloc_end .. free_end ].  If either of the two
1196  *		new free extents would be zero length, they are not created.
1197  *
1198  *		Finally, the original free extent is removed.  All newly
1199  *		created extents have the EXTFLG_UPDATE flag set.
1200  */
1201 static void
1202 meta_sp_alloc_by_ext(
1203 	mdsetname_t	*sp,
1204 	mdname_t	*np,
1205 	sp_ext_node_t	**head,
1206 	sp_ext_node_t	*free_ext,
1207 	sp_ext_offset_t	alloc_offset,
1208 	sp_ext_length_t	alloc_length,
1209 	uint_t		seq
1210 )
1211 {
1212 	sp_ext_offset_t	free_offset = free_ext->ext_offset;
1213 	sp_ext_length_t	free_length = free_ext->ext_length;
1214 
1215 	sp_ext_offset_t	alloc_end = alloc_offset + alloc_length;
1216 	sp_ext_offset_t	free_end  = free_offset  + free_length;
1217 
1218 	/* allocated extent must be a subset of the free extent */
1219 	assert(free_offset <= alloc_offset);
1220 	assert(free_end >= alloc_end);
1221 
1222 	meta_sp_list_remove(head, free_ext);
1223 
1224 	if (free_offset < alloc_offset) {
1225 		meta_sp_list_insert(NULL, NULL, head, free_offset,
1226 		    (alloc_offset - free_offset), EXTTYP_FREE, 0,
1227 		    EXTFLG_UPDATE, meta_sp_cmp_by_offset);
1228 	}
1229 
1230 	if (free_end > alloc_end) {
1231 		meta_sp_list_insert(NULL, NULL, head, alloc_end,
1232 		    (free_end - alloc_end), EXTTYP_FREE, 0, EXTFLG_UPDATE,
1233 		    meta_sp_cmp_by_offset);
1234 	}
1235 
1236 	meta_sp_list_insert(sp, np, head, alloc_offset, alloc_length,
1237 	    EXTTYP_ALLOC, seq, EXTFLG_UPDATE, meta_sp_cmp_by_offset);
1238 
1239 	if (getenv(META_SP_DEBUG)) {
1240 		meta_sp_debug("meta_sp_alloc_by_ext: extent list:\n");
1241 		meta_sp_list_dump(*head);
1242 	}
1243 }
1244 
1245 /*
1246  * FUNCTION:	meta_sp_alloc_by_len()
1247  * INPUT:	sp	- the set name for the device the node belongs to
1248  *		np	- the name of the device the node belongs to
1249  *		head	- the head of the list, must be NULL for empty list
1250  *		*lp	- the requested length to allocate
1251  *		last_off	- the last offset already allocated.
1252  *		alignment	- the desired extent alignmeent
1253  * OUTPUT:	head	- the new head pointer
1254  *		*lp	- the length allocated
1255  * RETURNS:	int	- -1 if error, the number of new extents on success
1256  * PURPOSE:	allocates extents from free space to satisfy the requested
1257  *		length.  If requested length is zero, allocates all
1258  *		remaining free space.  This function provides the meat
1259  *		of the extent allocation algorithm.  Allocation is a
1260  *		three tier process:
1261  *
1262  *		1. If last_off is nonzero and there is free space following
1263  *		   that node, then it is extended to allocate as much of that
1264  *		   free space as possible.  This is useful for metattach.
1265  *		2. If a free extent can be found to satisfy the remaining
1266  *		   requested space, then satisfy the rest of the request
1267  *		   from that extent.
1268  *		3. Start allocating space from any remaining free extents until
1269  *		   the remainder of the request is satisified.
1270  *
1271  *              If alignment is non-zero, then every extent modified
1272  *              or newly allocated will be aligned modulo alignment,
1273  *              with a length that is an integer multiple of
1274  *              alignment.
1275  *
1276  *		The EXTFLG_UPDATE flag is set for all nodes (free and
1277  *		allocated) that require updated watermarks.
1278  *
1279  *		This algorithm may have a negative impact on fragmentation
1280  *		in pathological cases and may be improved if it turns out
1281  *		to be a problem.  This may be exacerbated by particularly
1282  *		large alignments.
1283  *
1284  * NOTE:	It's confusing, so it demands an explanation:
1285  *		- len is used to represent requested data space; it
1286  *		  does not include room for a watermark.  On each full
1287  *		  or partial allocation, len will be decremented by
1288  *		  alloc_len (see next paragraph) until it reaches
1289  *		  zero.
1290  *		- alloc_len is used to represent data space allocated
1291  *		  from a particular extent; it does not include space
1292  *		  for a watermark.  In the rare event that a_length
1293  *		  (see next paragraph) is equal to MD_SP_WMSIZE,
1294  *		  alloc_len will be zero and the resulting MD_SP_WMSIZE
1295  *		  fragment of space will be utterly unusable.
1296  *		- a_length is used to represent all space to be
1297  *		  allocated from a particular extent; it DOES include
1298  *		  space for a watermark.
1299  */
1300 static int
1301 meta_sp_alloc_by_len(
1302 	mdsetname_t	*sp,
1303 	mdname_t	*np,
1304 	sp_ext_node_t	**head,
1305 	sp_ext_length_t	*lp,
1306 	sp_ext_offset_t	last_off,
1307 	sp_ext_offset_t	alignment
1308 )
1309 {
1310 	sp_ext_node_t	*free_ext;
1311 	sp_ext_node_t	*alloc_ext;
1312 	uint_t		last_seq = 0;
1313 	uint_t		numexts = 0;
1314 	sp_ext_length_t	freespace;
1315 	sp_ext_length_t	alloc_len;
1316 	sp_ext_length_t	len;
1317 
1318 	/* We're DOA if we can't read *lp */
1319 	assert(lp != NULL);
1320 	len = *lp;
1321 
1322 	/*
1323 	 * Process the nominal case first: we've been given an actual
1324 	 * size argument, rather than the literal "all"
1325 	 */
1326 
1327 	if (len != 0) {
1328 
1329 		/*
1330 		 * Short circuit the check for free space.  This may
1331 		 * tell us we have enough space when we really don't
1332 		 * because each extent loses space to a watermark, but
1333 		 * it will always tell us there isn't enough space
1334 		 * correctly.  Worst case we do some extra work.
1335 		 */
1336 		freespace = meta_sp_list_size(*head, EXTTYP_FREE,
1337 		    INCLUDE_WM);
1338 
1339 		if (freespace < len)
1340 			return (-1);
1341 
1342 		/*
1343 		 * First see if we can extend the last extent for an
1344 		 * attach.
1345 		 */
1346 		if (last_off != 0LL) {
1347 			int align = 0;
1348 
1349 			alloc_ext =
1350 			    meta_sp_list_find(*head, last_off);
1351 			assert(alloc_ext != NULL);
1352 
1353 			/*
1354 			 * The offset test reflects the
1355 			 * inclusion of the watermark in the extent
1356 			 */
1357 			align = (alignment > 0) &&
1358 			    (((alloc_ext->ext_offset + MD_SP_WMSIZE) %
1359 				alignment) == 0);
1360 
1361 			/*
1362 			 * If we decided not to align here, we should
1363 			 * also reset "alignment" so we don't bother
1364 			 * later, either.
1365 			 */
1366 			if (!align) {
1367 				alignment = 0;
1368 			}
1369 
1370 			last_seq = alloc_ext->ext_seq;
1371 
1372 			free_ext = meta_sp_list_find(*head,
1373 			    alloc_ext->ext_offset +
1374 			    alloc_ext->ext_length);
1375 
1376 			/*
1377 			 * If a free extent follows our last allocated
1378 			 * extent, then remove the last allocated
1379 			 * extent and increase the size of the free
1380 			 * extent to overlap it, then allocate the
1381 			 * total space from the new free extent.
1382 			 */
1383 			if (free_ext != NULL &&
1384 			    free_ext->ext_type == EXTTYP_FREE) {
1385 				assert(free_ext->ext_offset ==
1386 				    alloc_ext->ext_offset +
1387 				    alloc_ext->ext_length);
1388 
1389 				alloc_len =
1390 				    MIN(len, free_ext->ext_length);
1391 
1392 				if (align && (alloc_len < len)) {
1393 					/* No watermark space needed */
1394 					alloc_len -= alloc_len % alignment;
1395 				}
1396 
1397 				if (alloc_len > 0) {
1398 					free_ext->ext_offset -=
1399 					    alloc_ext->ext_length;
1400 					free_ext->ext_length +=
1401 					    alloc_ext->ext_length;
1402 
1403 					meta_sp_alloc_by_ext(sp, np, head,
1404 					    free_ext, free_ext->ext_offset,
1405 					    alloc_ext->ext_length + alloc_len,
1406 					    last_seq);
1407 
1408 					/*
1409 					 * now remove the original allocated
1410 					 * node.  We may have overlapping
1411 					 * extents for a short time before
1412 					 * this node is removed.
1413 					 */
1414 					meta_sp_list_remove(head, alloc_ext);
1415 					len -= alloc_len;
1416 				}
1417 			}
1418 			last_seq++;
1419 		}
1420 
1421 		if (len == 0LL)
1422 			goto out;
1423 
1424 		/*
1425 		 * Next, see if we can find a single allocation for
1426 		 * the remainder.  This may make fragmentation worse
1427 		 * in some cases, but there's no good way to allocate
1428 		 * that doesn't have a highly fragmented corner case.
1429 		 */
1430 		for (free_ext = *head; free_ext != NULL;
1431 			free_ext = free_ext->ext_next) {
1432 			sp_ext_offset_t	a_offset;
1433 			sp_ext_offset_t	a_length;
1434 
1435 			if (free_ext->ext_type != EXTTYP_FREE)
1436 				continue;
1437 
1438 			/*
1439 			 * The length test should include space for
1440 			 * the watermark
1441 			 */
1442 
1443 			a_offset = free_ext->ext_offset;
1444 			a_length = free_ext->ext_length;
1445 
1446 			if (alignment > 0) {
1447 
1448 				/*
1449 				 * Shortcut for extents that have been
1450 				 * previously added to pad out the
1451 				 * data space
1452 				 */
1453 				if (a_length < alignment) {
1454 					continue;
1455 				}
1456 
1457 				/*
1458 				 * Round up so the data space begins
1459 				 * on a properly aligned boundary.
1460 				 */
1461 				a_offset += alignment -
1462 				    (a_offset % alignment) - MD_SP_WMSIZE;
1463 
1464 				/*
1465 				 * This is only necessary in case the
1466 				 * watermark size is ever greater than
1467 				 * one.  It'll never happen, of
1468 				 * course; we'll get rid of watermarks
1469 				 * before we make 'em bigger.
1470 				 */
1471 				if (a_offset < free_ext->ext_offset) {
1472 					a_offset += alignment;
1473 				}
1474 
1475 				/*
1476 				 * Adjust the length to account for
1477 				 * the space lost above (if any)
1478 				 */
1479 				a_length -=
1480 					(a_offset - free_ext->ext_offset);
1481 			}
1482 
1483 			if (a_length >= len + MD_SP_WMSIZE) {
1484 				meta_sp_alloc_by_ext(sp, np, head,
1485 					free_ext, a_offset,
1486 					len + MD_SP_WMSIZE, last_seq);
1487 
1488 				len = 0LL;
1489 				numexts++;
1490 				break;
1491 			}
1492 		}
1493 
1494 		if (len == 0LL)
1495 			goto out;
1496 
1497 
1498 		/*
1499 		 * If the request could not be satisfied by extending
1500 		 * the last extent or by a single extent, then put
1501 		 * multiple smaller extents together until the request
1502 		 * is satisfied.
1503 		 */
1504 		for (free_ext = *head; (free_ext != NULL) && (len > 0);
1505 			free_ext = free_ext->ext_next) {
1506 			sp_ext_offset_t a_offset;
1507 			sp_ext_length_t a_length;
1508 
1509 			if (free_ext->ext_type != EXTTYP_FREE)
1510 				continue;
1511 
1512 			a_offset = free_ext->ext_offset;
1513 			a_length = free_ext->ext_length;
1514 
1515 			if (alignment > 0) {
1516 
1517 				/*
1518 				 * Shortcut for extents that have been
1519 				 * previously added to pad out the
1520 				 * data space
1521 				 */
1522 				if (a_length < alignment) {
1523 					continue;
1524 				}
1525 
1526 				/*
1527 				 * Round up so the data space begins
1528 				 * on a properly aligned boundary.
1529 				 */
1530 				a_offset += alignment -
1531 					(a_offset % alignment) - MD_SP_WMSIZE;
1532 
1533 				/*
1534 				 * This is only necessary in case the
1535 				 * watermark size is ever greater than
1536 				 * one.  It'll never happen, of
1537 				 * course; we'll get rid of watermarks
1538 				 * before we make 'em bigger.
1539 				 */
1540 				if (a_offset < free_ext->ext_offset) {
1541 					a_offset += alignment;
1542 				}
1543 
1544 				/*
1545 				 * Adjust the length to account for
1546 				 * the space lost above (if any)
1547 				 */
1548 				a_length -=
1549 					(a_offset - free_ext->ext_offset);
1550 
1551 				/*
1552 				 * Adjust the length to be properly
1553 				 * aligned if it is NOT to be the
1554 				 * last extent in the soft partition.
1555 				 */
1556 				if ((a_length - MD_SP_WMSIZE) < len)
1557 					a_length -=
1558 						(a_length - MD_SP_WMSIZE)
1559 						% alignment;
1560 			}
1561 
1562 			alloc_len = MIN(len, a_length - MD_SP_WMSIZE);
1563 			if (alloc_len == 0)
1564 				continue;
1565 
1566 			/*
1567 			 * meta_sp_alloc_by_ext() expects the
1568 			 * allocation length to include the watermark
1569 			 * size, which is why we don't simply pass in
1570 			 * alloc_len here.
1571 			 */
1572 			meta_sp_alloc_by_ext(sp, np, head, free_ext,
1573 				a_offset, MIN(len + MD_SP_WMSIZE, a_length),
1574 				last_seq);
1575 
1576 			len -= alloc_len;
1577 			numexts++;
1578 			last_seq++;
1579 		}
1580 
1581 
1582 		/*
1583 		 * If there was not enough space we can throw it all
1584 		 * away since no real work has been done yet.
1585 		 */
1586 		if (len != 0) {
1587 			meta_sp_list_free(head);
1588 			return (-1);
1589 		}
1590 	}
1591 
1592 	/*
1593 	 * Otherwise, the literal "all" was specified: allocate all
1594 	 * available free space.  Don't bother with alignment.
1595 	 */
1596 	else {
1597 		/* First, extend the last extent if this is a grow */
1598 		if (last_off != 0LL) {
1599 			alloc_ext =
1600 				meta_sp_list_find(*head, last_off);
1601 			assert(alloc_ext != NULL);
1602 
1603 			last_seq = alloc_ext->ext_seq;
1604 
1605 			free_ext = meta_sp_list_find(*head,
1606 				alloc_ext->ext_offset +
1607 				alloc_ext->ext_length);
1608 
1609 			/*
1610 			 * If a free extent follows our last allocated
1611 			 * extent, then remove the last allocated
1612 			 * extent and increase the size of the free
1613 			 * extent to overlap it, then allocate the
1614 			 * total space from the new free extent.
1615 			 */
1616 			if (free_ext != NULL &&
1617 			    free_ext->ext_type == EXTTYP_FREE) {
1618 				assert(free_ext->ext_offset ==
1619 				    alloc_ext->ext_offset +
1620 				    alloc_ext->ext_length);
1621 
1622 				len = alloc_len =
1623 				    free_ext->ext_length;
1624 
1625 				free_ext->ext_offset -=
1626 				    alloc_ext->ext_length;
1627 				free_ext->ext_length +=
1628 				    alloc_ext->ext_length;
1629 
1630 				meta_sp_alloc_by_ext(sp, np, head,
1631 				    free_ext, free_ext->ext_offset,
1632 				    alloc_ext->ext_length + alloc_len,
1633 				    last_seq);
1634 
1635 				/*
1636 				 * now remove the original allocated
1637 				 * node.  We may have overlapping
1638 				 * extents for a short time before
1639 				 * this node is removed.
1640 				 */
1641 				meta_sp_list_remove(head, alloc_ext);
1642 			}
1643 
1644 			last_seq++;
1645 		}
1646 
1647 		/* Next, grab all remaining free space */
1648 		for (free_ext = *head; free_ext != NULL;
1649 			free_ext = free_ext->ext_next) {
1650 
1651 			if (free_ext->ext_type == EXTTYP_FREE) {
1652 				alloc_len =
1653 				    free_ext->ext_length - MD_SP_WMSIZE;
1654 				if (alloc_len == 0)
1655 					continue;
1656 
1657 				/*
1658 				 * meta_sp_alloc_by_ext() expects the
1659 				 * allocation length to include the
1660 				 * watermark size, which is why we
1661 				 * don't simply pass in alloc_len
1662 				 * here.
1663 				 */
1664 				meta_sp_alloc_by_ext(sp, np, head,
1665 				    free_ext, free_ext->ext_offset,
1666 				    free_ext->ext_length,
1667 				    last_seq);
1668 
1669 				len += alloc_len;
1670 				numexts++;
1671 				last_seq++;
1672 			}
1673 		}
1674 	}
1675 
1676 out:
1677 	if (getenv(META_SP_DEBUG)) {
1678 		meta_sp_debug("meta_sp_alloc_by_len: Extent list after "
1679 		    "allocation:\n");
1680 		meta_sp_list_dump(*head);
1681 	}
1682 
1683 	if (*lp == 0) {
1684 		*lp = len;
1685 
1686 		/*
1687 		 * Make sure the callers hit a no space error if we
1688 		 * didn't actually find anything.
1689 		 */
1690 		if (len == 0) {
1691 			return (-1);
1692 		}
1693 	}
1694 
1695 	return (numexts);
1696 }
1697 
1698 /*
1699  * FUNCTION:	meta_sp_alloc_by_list()
1700  * INPUT:	sp	- the set name for the device the node belongs to
1701  *		np	- the name of the device the node belongs to
1702  *		head	- the head of the list, must be NULL for empty list
1703  *		oblist	- an extent list containing requested nodes to allocate
1704  * OUTPUT:	head	- the new head pointer
1705  * RETURNS:	int	- -1 if error, the number of new extents on success
1706  * PURPOSE:	allocates extents from free space to satisfy the requested
1707  *		extent list.  This is primarily used for the -o/-b options
1708  *		where the user may specifically request extents to allocate.
1709  *		Each extent in the oblist must be a subset (inclusive) of a
1710  *		free extent and may not overlap each other.  This
1711  *		function sets the EXTFLG_UPDATE flag for each node that
1712  *		requires a watermark update after allocating.
1713  */
1714 static int
1715 meta_sp_alloc_by_list(
1716 	mdsetname_t	*sp,
1717 	mdname_t	*np,
1718 	sp_ext_node_t	**head,
1719 	sp_ext_node_t	*oblist
1720 )
1721 {
1722 	sp_ext_node_t	*ext;
1723 	sp_ext_node_t	*free_ext;
1724 	uint_t		numexts = 0;
1725 
1726 	for (ext = oblist; ext != NULL; ext = ext->ext_next) {
1727 
1728 		free_ext = meta_sp_list_find(*head,
1729 		    ext->ext_offset - MD_SP_WMSIZE);
1730 
1731 		/* Make sure the allocation is within the free extent */
1732 		if ((free_ext == NULL) ||
1733 		    (ext->ext_offset + ext->ext_length >
1734 		    free_ext->ext_offset + free_ext->ext_length) ||
1735 		    (free_ext->ext_type != EXTTYP_FREE))
1736 			return (-1);
1737 
1738 		meta_sp_alloc_by_ext(sp, np, head, free_ext,
1739 		    ext->ext_offset - MD_SP_WMSIZE,
1740 		    ext->ext_length + MD_SP_WMSIZE, ext->ext_seq);
1741 
1742 		numexts++;
1743 	}
1744 
1745 	assert(meta_sp_list_overlaps(*head) == 0);
1746 
1747 	if (getenv(META_SP_DEBUG)) {
1748 		meta_sp_debug("meta_sp_alloc_by_list: Extent list after "
1749 		    "allocation:\n");
1750 		meta_sp_list_dump(*head);
1751 	}
1752 
1753 	return (numexts);
1754 }
1755 
1756 /*
1757  * **************************************************************************
1758  *                     Extent List Population Functions                     *
1759  * **************************************************************************
1760  */
1761 
1762 /*
1763  * FUNCTION:	meta_sp_extlist_from_namelist()
1764  * INPUT:	sp	- the set name for the device the node belongs to
1765  *		spnplp	- the namelist of soft partitions to build a list from
1766  * OUTPUT:	extlist	- the extent list built from the SPs in the namelist
1767  *		ep	- return error pointer
1768  * RETURNS:	int	- -1 if error, 0 on success
1769  * PURPOSE:	builds an extent list representing the soft partitions
1770  *		specified in the namelist.  Each extent in each soft
1771  *		partition is added to the list with the type EXTTYP_ALLOC.
1772  *		The EXTFLG_UPDATE flag is not set on any nodes.  Each
1773  *		extent in the list includes the space occupied by the
1774  *		watermark, which is not included in the unit structures.
1775  */
1776 static int
1777 meta_sp_extlist_from_namelist(
1778 	mdsetname_t	*sp,
1779 	mdnamelist_t	*spnlp,
1780 	sp_ext_node_t	**extlist,
1781 	md_error_t	*ep
1782 )
1783 {
1784 	int		extn;
1785 	md_sp_t		*msp;		/* unit structure of the sp's */
1786 	mdnamelist_t	*namep;
1787 
1788 	assert(sp != NULL);
1789 
1790 	/*
1791 	 * Now go through the soft partitions and add a node to the used
1792 	 * list for each allocated extent.
1793 	 */
1794 	for (namep = spnlp; namep != NULL; namep = namep->next) {
1795 		mdname_t	*curnp = namep->namep;
1796 
1797 		/* get the unit structure */
1798 		if ((msp = meta_get_sp_common(sp, curnp, 0, ep)) == NULL)
1799 			return (-1);
1800 
1801 		for (extn = 0; (extn < msp->ext.ext_len); extn++) {
1802 			md_sp_ext_t	*extp = &msp->ext.ext_val[extn];
1803 
1804 			/*
1805 			 * subtract from offset and add to the length
1806 			 * to account for the watermark, which is not
1807 			 * contained in the extents in the unit structure.
1808 			 */
1809 			meta_sp_list_insert(sp, curnp, extlist,
1810 			    extp->poff - MD_SP_WMSIZE, extp->len + MD_SP_WMSIZE,
1811 			    EXTTYP_ALLOC, extn, 0, meta_sp_cmp_by_offset);
1812 		}
1813 	}
1814 	return (0);
1815 }
1816 
1817 /*
1818  * FUNCTION:	meta_sp_extlist_from_wm()
1819  * INPUT:	sp	- the set name for the device the node belongs to
1820  *		compnp	- the name of the device to scan watermarks on
1821  * OUTPUT:	extlist	- the extent list built from the SPs in the namelist
1822  *		ep	- return error pointer
1823  * RETURNS:	int	- -1 if error, 0 on success
1824  * PURPOSE:	builds an extent list representing the soft partitions
1825  *		specified in the namelist.  Each extent in each soft
1826  *		partition is added to the list with the type EXTTYP_ALLOC.
1827  *		The EXTFLG_UPDATE flag is not set on any nodes.  Each
1828  *		extent in the list includes the space occupied by the
1829  *		watermark, which is not included in the unit structures.
1830  */
1831 static int
1832 meta_sp_extlist_from_wm(
1833 	mdsetname_t	*sp,
1834 	mdname_t	*compnp,
1835 	sp_ext_node_t	**extlist,
1836 	ext_cmpfunc_t	compare,
1837 	md_error_t	*ep
1838 )
1839 {
1840 	mp_watermark_t	wm;
1841 	mdname_t	*np = NULL;
1842 	mdsetname_t	*spsetp = NULL;
1843 	sp_ext_offset_t	cur_off;
1844 
1845 	if ((cur_off = meta_sp_get_start(sp, compnp, ep)) == MD_DISKADDR_ERROR)
1846 		return (-1);
1847 
1848 	for (;;) {
1849 		if (meta_sp_read_wm(sp, compnp, &wm, cur_off, ep) != 0) {
1850 			return (-1);
1851 		}
1852 
1853 		/* get the set and name pointers */
1854 		if (strcmp(wm.wm_setname, MD_SP_LOCALSETNAME) != 0) {
1855 			if ((spsetp = metasetname(wm.wm_setname, ep)) == NULL) {
1856 				return (-1);
1857 			}
1858 		}
1859 
1860 		if (strcmp(wm.wm_mdname, MD_SP_FREEWMNAME) != 0) {
1861 			if (meta_init_make_device(&sp, wm.wm_mdname, ep) != 0)
1862 				return (-1);
1863 			np = metaname(&spsetp, wm.wm_mdname, ep);
1864 			if (np == NULL) {
1865 				return (-1);
1866 			}
1867 		}
1868 
1869 		/* insert watermark into extent list */
1870 		meta_sp_list_insert(spsetp, np, extlist, cur_off,
1871 		    wm.wm_length + MD_SP_WMSIZE, wm.wm_type, wm.wm_seq,
1872 		    EXTFLG_UPDATE, compare);
1873 
1874 		/* if we see the end watermark, we're done */
1875 		if (wm.wm_type == EXTTYP_END)
1876 			break;
1877 
1878 		cur_off += wm.wm_length + 1;
1879 
1880 		/* clear out set and name pointers for next iteration */
1881 		np = NULL;
1882 		spsetp = NULL;
1883 	}
1884 
1885 	return (0);
1886 }
1887 
1888 /*
1889  * **************************************************************************
1890  *                        Print (metastat) Functions                        *
1891  * **************************************************************************
1892  */
1893 
1894 /*
1895  * FUNCTION:	meta_sp_short_print()
1896  * INPUT:	msp	- the unit structure to display
1897  *		fp	- the file pointer to send output to
1898  *		options	- print options from the command line processor
1899  * OUTPUT:	ep	- return error pointer
1900  * RETURNS:	int	- -1 if error, 0 on success
1901  * PURPOSE:	display a short report of the soft partition in md.tab
1902  *		form, primarily used for metastat -p.
1903  */
1904 static int
1905 meta_sp_short_print(
1906 	md_sp_t		*msp,
1907 	char		*fname,
1908 	FILE		*fp,
1909 	mdprtopts_t	options,
1910 	md_error_t	*ep
1911 )
1912 {
1913 	int	extn;
1914 
1915 	if (options & PRINT_LARGEDEVICES) {
1916 		if (msp->common.revision != MD_64BIT_META_DEV)
1917 			return (0);
1918 	}
1919 
1920 	/* print name and -p */
1921 	if (fprintf(fp, "%s -p", msp->common.namep->cname) == EOF)
1922 		return (mdsyserror(ep, errno, fname));
1923 
1924 	/* print the component */
1925 	/*
1926 	 * If the path is our standard /dev/rdsk or /dev/md/rdsk
1927 	 * then just print out the cxtxdxsx or the dx, metainit
1928 	 * will assume the default, otherwise we need the full
1929 	 * pathname to make sure this works as we intend.
1930 	 */
1931 	if ((strstr(msp->compnamep->rname, "/dev/rdsk") == NULL) &&
1932 	    (strstr(msp->compnamep->rname, "/dev/md/rdsk") == NULL) &&
1933 	    (strstr(msp->compnamep->rname, "/dev/td/") == NULL)) {
1934 		/* not standard path so print full pathname */
1935 		if (fprintf(fp, " %s", msp->compnamep->rname) == EOF)
1936 			return (mdsyserror(ep, errno, fname));
1937 	} else {
1938 		/* standard path so print ctds or d number */
1939 		if (fprintf(fp, " %s", msp->compnamep->cname) == EOF)
1940 			return (mdsyserror(ep, errno, fname));
1941 	}
1942 
1943 	/* print out each extent */
1944 	for (extn = 0; (extn < msp->ext.ext_len); extn++) {
1945 		md_sp_ext_t	*extp = &msp->ext.ext_val[extn];
1946 		if (fprintf(fp, " -o %llu -b %llu ", extp->poff,
1947 		    extp->len) == EOF)
1948 			return (mdsyserror(ep, errno, fname));
1949 	}
1950 
1951 	if (fprintf(fp, "\n") == EOF)
1952 		return (mdsyserror(ep, errno, fname));
1953 
1954 	/* success */
1955 	return (0);
1956 }
1957 
1958 /*
1959  * FUNCTION:	meta_sp_status_to_name()
1960  * INPUT:	xsp_status	- the status value to convert to a string
1961  *		tstate		- transient errored device state. If set the
1962  *				  device is Unavailable
1963  * OUTPUT:	none
1964  * RETURNS:	char *	- a pointer to the string representing the status value
1965  * PURPOSE:	return an internationalized string representing the
1966  *		status value for a soft partition.  The strings are
1967  *		strdup'd and must be freed by the caller.
1968  */
1969 static char *
1970 meta_sp_status_to_name(
1971 	xsp_status_t	xsp_status,
1972 	uint_t		tstate
1973 )
1974 {
1975 	char *rval = NULL;
1976 
1977 	/*
1978 	 * Check to see if we have MD_INACCESSIBLE set. This is the only valid
1979 	 * value for an 'Unavailable' return. tstate can be set because of
1980 	 * other multi-node reasons (e.g. ABR being set)
1981 	 */
1982 	if (tstate & MD_INACCESSIBLE) {
1983 		return (Strdup(dgettext(TEXT_DOMAIN, "Unavailable")));
1984 	}
1985 
1986 	switch (xsp_status) {
1987 	case MD_SP_CREATEPEND:
1988 		rval = Strdup(dgettext(TEXT_DOMAIN, "Creating"));
1989 		break;
1990 	case MD_SP_GROWPEND:
1991 		rval = Strdup(dgettext(TEXT_DOMAIN, "Growing"));
1992 		break;
1993 	case MD_SP_DELPEND:
1994 		rval = Strdup(dgettext(TEXT_DOMAIN, "Deleting"));
1995 		break;
1996 	case MD_SP_OK:
1997 		rval = Strdup(dgettext(TEXT_DOMAIN, "Okay"));
1998 		break;
1999 	case MD_SP_ERR:
2000 		rval = Strdup(dgettext(TEXT_DOMAIN, "Errored"));
2001 		break;
2002 	case MD_SP_RECOVER:
2003 		rval = Strdup(dgettext(TEXT_DOMAIN, "Recovering"));
2004 		break;
2005 	}
2006 
2007 	if (rval == NULL)
2008 		rval = Strdup(dgettext(TEXT_DOMAIN, "Invalid"));
2009 
2010 	return (rval);
2011 }
2012 
2013 /*
2014  * FUNCTION:	meta_sp_report()
2015  * INPUT:	sp	- the set name for the unit being displayed
2016  *		msp	- the unit structure to display
2017  *		nlpp	- pass back the large devs
2018  *		fp	- the file pointer to send output to
2019  *		options	- print options from the command line processor
2020  * OUTPUT:	ep	- return error pointer
2021  * RETURNS:	int	- -1 if error, 0 on success
2022  * PURPOSE:	print a full report of the device specified
2023  */
2024 static int
2025 meta_sp_report(
2026 	mdsetname_t	*sp,
2027 	md_sp_t		*msp,
2028 	mdnamelist_t	**nlpp,
2029 	char		*fname,
2030 	FILE		*fp,
2031 	mdprtopts_t	options,
2032 	md_error_t	*ep
2033 )
2034 {
2035 	uint_t		extn;
2036 	char		*status;
2037 	char		*devid = "";
2038 	mdname_t	*didnp = NULL;
2039 	ddi_devid_t	dtp;
2040 	int		len;
2041 	uint_t		tstate = 0;
2042 
2043 	if (options & PRINT_LARGEDEVICES) {
2044 		if (msp->common.revision != MD_64BIT_META_DEV) {
2045 			return (0);
2046 		} else {
2047 			if (meta_getdevs(sp, msp->common.namep, nlpp, ep) != 0)
2048 				return (-1);
2049 		}
2050 	}
2051 
2052 	if (options & PRINT_HEADER) {
2053 		if (fprintf(fp, dgettext(TEXT_DOMAIN, "%s: Soft Partition\n"),
2054 		    msp->common.namep->cname) == EOF)
2055 			return (mdsyserror(ep, errno, fname));
2056 	}
2057 
2058 	if (fprintf(fp, dgettext(TEXT_DOMAIN, "    Device: %s\n"),
2059 	    msp->compnamep->cname) == EOF)
2060 		return (mdsyserror(ep, errno, fname));
2061 
2062 	/* Determine if device is available before displaying status */
2063 	if (metaismeta(msp->common.namep)) {
2064 		if (meta_get_tstate(msp->common.namep->dev, &tstate, ep) != 0)
2065 			return (-1);
2066 	}
2067 	status = meta_sp_status_to_name(msp->status, tstate & MD_DEV_ERRORED);
2068 
2069 	/* print out "State" to be consistent with other metadevices */
2070 	if (tstate & MD_ABR_CAP) {
2071 		if (fprintf(fp, dgettext(TEXT_DOMAIN,
2072 		    "    State: %s - Application Based Recovery (ABR)\n"),
2073 		    status) == EOF) {
2074 			Free(status);
2075 			return (mdsyserror(ep, errno, fname));
2076 		}
2077 	} else {
2078 		if (fprintf(fp, dgettext(TEXT_DOMAIN,
2079 		    "    State: %s\n"), status) == EOF) {
2080 			Free(status);
2081 			return (mdsyserror(ep, errno, fname));
2082 		}
2083 	}
2084 	free(status);
2085 
2086 	if (fprintf(fp, dgettext(TEXT_DOMAIN, "    Size: %llu blocks (%s)\n"),
2087 	    msp->common.size,
2088 	    meta_number_to_string(msp->common.size, DEV_BSIZE)) == EOF)
2089 		return (mdsyserror(ep, errno, fname));
2090 
2091 	/* print component details */
2092 	if (! metaismeta(msp->compnamep)) {
2093 		diskaddr_t	start_blk;
2094 		int		has_mddb;
2095 		char		*has_mddb_str;
2096 
2097 		/* print header */
2098 		/*
2099 		 * Building a format string on the fly that will
2100 		 * be used in (f)printf. This allows the length
2101 		 * of the ctd to vary from small to large without
2102 		 * looking horrible.
2103 		 */
2104 		len = strlen(msp->compnamep->cname);
2105 		len = max(len, strlen(dgettext(TEXT_DOMAIN, "Device")));
2106 		len += 2;
2107 		if (fprintf(fp,
2108 		    "\t%-*.*s %-12.12s %-5.5s %s\n",
2109 		    len, len,
2110 		    dgettext(TEXT_DOMAIN, "Device"),
2111 		    dgettext(TEXT_DOMAIN, "Start Block"),
2112 		    dgettext(TEXT_DOMAIN, "Dbase"),
2113 		    dgettext(TEXT_DOMAIN, "Reloc")) == EOF) {
2114 			return (mdsyserror(ep, errno, fname));
2115 		}
2116 
2117 
2118 		/* get info */
2119 		if ((start_blk = meta_sp_get_start(sp, msp->compnamep, ep)) ==
2120 		    MD_DISKADDR_ERROR)
2121 			return (-1);
2122 
2123 		if ((has_mddb = metahasmddb(sp, msp->compnamep, ep)) < 0)
2124 			return (-1);
2125 
2126 		if (has_mddb)
2127 			has_mddb_str = dgettext(TEXT_DOMAIN, "Yes");
2128 		else
2129 			has_mddb_str = dgettext(TEXT_DOMAIN, "No");
2130 
2131 		/* populate the key in the name_p structure */
2132 		didnp = metadevname(&sp, msp->compnamep->dev, ep);
2133 		if (didnp == NULL) {
2134 			return (-1);
2135 		}
2136 
2137 		/* determine if devid does NOT exist */
2138 		if (options & PRINT_DEVID) {
2139 		    if ((dtp = meta_getdidbykey(sp->setno, getmyside(sp, ep),
2140 					didnp->key, ep)) == NULL)
2141 				devid = dgettext(TEXT_DOMAIN, "No ");
2142 			else {
2143 				devid = dgettext(TEXT_DOMAIN, "Yes");
2144 				free(dtp);
2145 			}
2146 		}
2147 
2148 		/* print info */
2149 		/*
2150 		 * This allows the length
2151 		 * of the ctd to vary from small to large without
2152 		 * looking horrible.
2153 		 */
2154 		if (fprintf(fp, "\t%-*s %8lld     %-5.5s %s\n",
2155 		    len, msp->compnamep->cname,
2156 		    start_blk, has_mddb_str, devid) == EOF) {
2157 			return (mdsyserror(ep, errno, fname));
2158 		}
2159 		(void) fprintf(fp, "\n");
2160 	}
2161 
2162 
2163 	/* print the headers */
2164 	if (fprintf(fp, "\t%6.6s %24.24s %24.24s\n",
2165 	    dgettext(TEXT_DOMAIN, "Extent"),
2166 	    dgettext(TEXT_DOMAIN, "Start Block"),
2167 	    dgettext(TEXT_DOMAIN, "Block count")) == EOF)
2168 		return (mdsyserror(ep, errno, fname));
2169 
2170 	/* print out each extent */
2171 	for (extn = 0; (extn < msp->ext.ext_len); extn++) {
2172 		md_sp_ext_t	*extp = &msp->ext.ext_val[extn];
2173 
2174 		/* If PRINT_TIMES option is ever supported, add output here */
2175 		if (fprintf(fp, "\t%6u %24llu %24llu\n",
2176 		    extn, extp->poff, extp->len) == EOF)
2177 			return (mdsyserror(ep, errno, fname));
2178 	}
2179 
2180 	/* separate records with a newline */
2181 	(void) fprintf(fp, "\n");
2182 	return (0);
2183 }
2184 
2185 /*
2186  * FUNCTION:	meta_sp_print()
2187  * INPUT:	sp	- the set name for the unit being displayed
2188  *		np	- the name of the device to print
2189  *		fname	- ??? not used
2190  *		fp	- the file pointer to send output to
2191  *		options	- print options from the command line processor
2192  * OUTPUT:	ep	- return error pointer
2193  * RETURNS:	int	- -1 if error, 0 on success
2194  * PURPOSE:	print a full report of the device specified by metastat.
2195  *		This is the main entry point for printing.
2196  */
2197 int
2198 meta_sp_print(
2199 	mdsetname_t	*sp,
2200 	mdname_t	*np,
2201 	mdnamelist_t	**nlpp,
2202 	char		*fname,
2203 	FILE		*fp,
2204 	mdprtopts_t	options,
2205 	md_error_t	*ep
2206 )
2207 {
2208 	md_sp_t		*msp;
2209 	md_unit_t	*mdp;
2210 	int		rval = 0;
2211 
2212 	/* should always have the same set */
2213 	assert(sp != NULL);
2214 
2215 	/* print all the soft partitions */
2216 	if (np == NULL) {
2217 		mdnamelist_t	*nlp = NULL;
2218 		mdnamelist_t	*p;
2219 		int		cnt;
2220 
2221 		if ((cnt = meta_get_sp_names(sp, &nlp, options, ep)) < 0)
2222 			return (-1);
2223 		else if (cnt == 0)
2224 			return (0);
2225 
2226 		/* recusively print them out */
2227 		for (p = nlp; (p != NULL); p = p->next) {
2228 			mdname_t	*curnp = p->namep;
2229 
2230 			/*
2231 			 * one problem with the rval of -1 here is that
2232 			 * the error gets "lost" when the next device is
2233 			 * printed, but we want to print them all anyway.
2234 			 */
2235 			rval = meta_sp_print(sp, curnp, nlpp, fname, fp,
2236 			    options, ep);
2237 		}
2238 
2239 		/* clean up, return success */
2240 		metafreenamelist(nlp);
2241 		return (rval);
2242 	}
2243 
2244 	/* get the unit structure */
2245 	if ((msp = meta_get_sp_common(sp, np,
2246 	    ((options & PRINT_FAST) ? 1 : 0), ep)) == NULL)
2247 		return (-1);
2248 
2249 	/* check for parented */
2250 	if ((! (options & PRINT_SUBDEVS)) &&
2251 	    (MD_HAS_PARENT(msp->common.parent))) {
2252 		return (0);
2253 	}
2254 
2255 	/* print appropriate detail */
2256 	if (options & PRINT_SHORT) {
2257 		if (meta_sp_short_print(msp, fname, fp, options, ep) != 0)
2258 			return (-1);
2259 	} else {
2260 		if (meta_sp_report(sp, msp, nlpp, fname, fp, options, ep) != 0)
2261 			return (-1);
2262 	}
2263 
2264 	/*
2265 	 * Print underlying metadevices if they are parented to us and
2266 	 * if the info for the underlying metadevice has not been printed.
2267 	 */
2268 	if (metaismeta(msp->compnamep)) {
2269 		/* get the unit structure for the subdevice */
2270 		if ((mdp = meta_get_mdunit(sp, msp->compnamep, ep)) == NULL)
2271 			return (-1);
2272 
2273 		/* If info not already printed, recurse */
2274 		if (!BT_TEST(sp_parent_printed, MD_MIN2UNIT(MD_SID(mdp)))) {
2275 			if (meta_print_name(sp, msp->compnamep, nlpp, fname, fp,
2276 			    (options | PRINT_HEADER | PRINT_SUBDEVS),
2277 			    NULL, ep) != 0) {
2278 				return (-1);
2279 			}
2280 			BT_SET(sp_parent_printed, MD_MIN2UNIT(MD_SID(mdp)));
2281 		}
2282 	}
2283 	return (0);
2284 }
2285 
2286 /*
2287  * **************************************************************************
2288  *                     Watermark Manipulation Functions                     *
2289  * **************************************************************************
2290  */
2291 
2292 /*
2293  * FUNCTION:	meta_sp_get_start()
2294  * INPUT:	sp	- the operating set
2295  *		np 	- device upon which the sp is being built
2296  * OUTPUT:	ep	- return error pointer
2297  * RETURNS:	daddr_t	- -1 if error, otherwise the start block
2298  * PURPOSE:	Encapsulate the determination of the start block of the
2299  *		device upon which the sp is built or being built.
2300  *		This is done to hide the ugliness of the algorithm.  In
2301  *		the case where a sp is being built upon a stripe of > 1
2302  *		TB that is made up of a set of disks in which the first
2303  *		has a VTOC label the result returned from the call to
2304  *		metagetstart is incorrect.  The reason being that a > 1
2305  *		TB metadevice will manufacture an EFI label in which the
2306  *		start address is zero.  This is irrespective of the underlying
2307  *		devices.  The long term fix for this is to fix
2308  *		meta_efi_to_mdvtoc and meta_efi_to mdgeom so that they return
2309  *		values that are indicative of the first underlying device in
2310  *		metadevice.
2311  */
2312 static diskaddr_t
2313 meta_sp_get_start(
2314 	mdsetname_t	*sp,
2315 	mdname_t	*np,
2316 	md_error_t	*ep
2317 )
2318 {
2319 	daddr_t		start_block;
2320 
2321 	if ((start_block = metagetstart(sp, np, ep)) != MD_DISKADDR_ERROR) {
2322 		start_block += MD_SP_START;
2323 		/*
2324 		 * In the case that the device upon which the sp is being
2325 		 * created is a metadevice then ensure that in the case that
2326 		 * the first underlying device has a vtoc label that it is
2327 		 * not overwritten with a watermark by setting the start block
2328 		 * to point just past the vtoc label
2329 		 */
2330 		if (start_block < VTOC_SIZE && metaismeta(np))
2331 			start_block = VTOC_SIZE;
2332 	}
2333 
2334 	return (start_block);
2335 }
2336 
2337 /*
2338  * FUNCTION:	meta_sp_update_wm()
2339  * INPUT:	sp	- the operating set
2340  *		msp	- a pointer to the XDR unit structure
2341  *		extlist	- the extent list specifying watermarks to update
2342  * OUTPUT:	ep	- return error pointer
2343  * RETURNS:	int	- -1 if error, 0 on success
2344  * PURPOSE:	steps backwards through the extent list updating
2345  *		watermarks for all extents with the EXTFLG_UPDATE flag
2346  *		set.  Writing the watermarks guarantees consistency when
2347  *		extents must be broken into pieces since the original
2348  *		watermark will be the last to be updated, and will be
2349  *		changed to point to a new watermark that is already
2350  *		known to be consistent.  If one of the writes fails, the
2351  *		original watermark stays intact and none of the changes
2352  *		are realized.
2353  */
2354 static int
2355 meta_sp_update_wm(
2356 	mdsetname_t	*sp,
2357 	md_sp_t		*msp,
2358 	sp_ext_node_t	*extlist,
2359 	md_error_t	*ep
2360 )
2361 {
2362 	sp_ext_node_t	*ext;
2363 	sp_ext_node_t	*tail;
2364 	mp_watermark_t	*wmp, *watermarks;
2365 	xsp_offset_t	*osp, *offsets;
2366 	int		update_count = 0;
2367 	int		rval = 0;
2368 	md_unit_t	*mdp;
2369 	md_sp_update_wm_t	update_params;
2370 
2371 	if (getenv(META_SP_DEBUG)) {
2372 		meta_sp_debug("meta_sp_update_wm: Updating watermarks:\n");
2373 		meta_sp_list_dump(extlist);
2374 	}
2375 
2376 	/*
2377 	 * find the last node so we can write the watermarks backwards
2378 	 * and count watermarks to update so we can allocate space
2379 	 */
2380 	for (ext = extlist; ext != NULL; ext = ext->ext_next) {
2381 		if ((ext->ext_flags & EXTFLG_UPDATE) != 0) {
2382 			update_count++;
2383 		}
2384 
2385 		if (ext->ext_next == NULL) {
2386 			tail = ext;
2387 		}
2388 	}
2389 	ext = tail;
2390 
2391 	wmp = watermarks =
2392 	    Zalloc(update_count * sizeof (mp_watermark_t));
2393 	osp = offsets =
2394 	    Zalloc(update_count * sizeof (sp_ext_offset_t));
2395 
2396 	while (ext != NULL) {
2397 		if ((ext->ext_flags & EXTFLG_UPDATE) != 0) {
2398 			/* update watermark */
2399 			wmp->wm_magic = MD_SP_MAGIC;
2400 			wmp->wm_version = MD_SP_VERSION;
2401 			wmp->wm_type = ext->ext_type;
2402 			wmp->wm_seq = ext->ext_seq;
2403 			wmp->wm_length = ext->ext_length - MD_SP_WMSIZE;
2404 
2405 			/* fill in the volume name and set name */
2406 			if (ext->ext_namep != NULL)
2407 				(void) strcpy(wmp->wm_mdname,
2408 				    ext->ext_namep->cname);
2409 			else
2410 				(void) strcpy(wmp->wm_mdname, MD_SP_FREEWMNAME);
2411 			if (ext->ext_setp != NULL &&
2412 			    ext->ext_setp->setno != MD_LOCAL_SET)
2413 				(void) strcpy(wmp->wm_setname,
2414 				    ext->ext_setp->setname);
2415 			else
2416 				(void) strcpy(wmp->wm_setname,
2417 				    MD_SP_LOCALSETNAME);
2418 
2419 			/* Generate the checksum */
2420 			wmp->wm_checksum = 0;
2421 			crcgen((uchar_t *)wmp, (uint_t *)&wmp->wm_checksum,
2422 			    sizeof (*wmp), NULL);
2423 
2424 			/* record the extent offset */
2425 			*osp = ext->ext_offset;
2426 
2427 			/* Advance the placeholders */
2428 			osp++; wmp++;
2429 		}
2430 		ext = ext->ext_prev;
2431 	}
2432 
2433 	mdp = meta_get_mdunit(sp, msp->common.namep, ep);
2434 	if (mdp == NULL) {
2435 		rval = -1;
2436 		goto out;
2437 	}
2438 
2439 	(void) memset(&update_params, 0, sizeof (update_params));
2440 	update_params.mnum = MD_SID(mdp);
2441 	update_params.count = update_count;
2442 	update_params.wmp = (uintptr_t)watermarks;
2443 	update_params.osp = (uintptr_t)offsets;
2444 	MD_SETDRIVERNAME(&update_params, MD_SP,
2445 	    MD_MIN2SET(update_params.mnum));
2446 
2447 	if (metaioctl(MD_IOC_SPUPDATEWM, &update_params,
2448 	    &update_params.mde, msp->common.namep->cname) != 0) {
2449 		(void) mdstealerror(ep, &update_params.mde);
2450 		rval = -1;
2451 		goto out;
2452 	}
2453 
2454 out:
2455 	Free(watermarks);
2456 	Free(offsets);
2457 
2458 	return (rval);
2459 }
2460 
2461 /*
2462  * FUNCTION:	meta_sp_clear_wm()
2463  * INPUT:	sp	- the operating set
2464  *		msp	- the unit structure for the soft partition to clear
2465  * OUTPUT:	ep	- return error pointer
2466  * RETURNS:	int	- -1 if error, 0 on success
2467  * PURPOSE:	steps through the extents for a soft partition unit and
2468  *		creates an extent list designed to mark all of the
2469  *		watermarks for those extents as free.  The extent list
2470  *		is then passed to meta_sp_update_wm() to actually write
2471  *		the watermarks out.
2472  */
2473 static int
2474 meta_sp_clear_wm(
2475 	mdsetname_t	*sp,
2476 	md_sp_t		*msp,
2477 	md_error_t	*ep
2478 )
2479 {
2480 	sp_ext_node_t	*extlist = NULL;
2481 	int		numexts = msp->ext.ext_len;
2482 	uint_t		i;
2483 	int		rval = 0;
2484 
2485 	/* for each watermark must set the flag to SP_FREE */
2486 	for (i = 0; i < numexts; i++) {
2487 		md_sp_ext_t	*extp = &msp->ext.ext_val[i];
2488 
2489 		meta_sp_list_insert(NULL, NULL, &extlist,
2490 		    extp->poff - MD_SP_WMSIZE, extp->len + MD_SP_WMSIZE,
2491 		    EXTTYP_FREE, 0, EXTFLG_UPDATE, meta_sp_cmp_by_offset);
2492 	}
2493 
2494 	/* update watermarks */
2495 	rval = meta_sp_update_wm(sp, msp, extlist, ep);
2496 
2497 	meta_sp_list_free(&extlist);
2498 	return (rval);
2499 }
2500 
2501 /*
2502  * FUNCTION:	meta_sp_read_wm()
2503  * INPUT:	sp	- setname for component
2504  *		compnp	- mdname_t for component
2505  *		offset	- the offset of the watermark to read (sectors)
2506  * OUTPUT:	wm	- the watermark structure to read into
2507  *		ep	- return error pointer
2508  * RETURNS:	int	- -1 if error, 0 on success
2509  * PURPOSE:	seeks out to the requested offset and reads a watermark.
2510  *		It then verifies that the magic number is correct and
2511  *		that the checksum is valid, returning an error if either
2512  *		is wrong.
2513  */
2514 static int
2515 meta_sp_read_wm(
2516 	mdsetname_t	*sp,
2517 	mdname_t	*compnp,
2518 	mp_watermark_t	*wm,
2519 	sp_ext_offset_t	offset,
2520 	md_error_t	*ep
2521 )
2522 {
2523 	md_sp_read_wm_t	read_params;
2524 
2525 	/*
2526 	 * make sure block offset does not overflow 2^64 bytes and it's a
2527 	 * multiple of the block size.
2528 	 */
2529 	assert(offset <= (1LL << (64 - DEV_BSHIFT)));
2530 	/* LINTED */
2531 	assert((sizeof (*wm) % DEV_BSIZE) == 0);
2532 
2533 	(void) memset(wm, 0, sizeof (*wm));
2534 
2535 	(void) memset(&read_params, 0, sizeof (read_params));
2536 	read_params.rdev = compnp->dev;
2537 	read_params.wmp = (uintptr_t)wm;
2538 	read_params.offset = offset;
2539 	MD_SETDRIVERNAME(&read_params, MD_SP, sp->setno);
2540 
2541 	if (metaioctl(MD_IOC_SPREADWM, &read_params,
2542 	    &read_params.mde, compnp->cname) != 0) {
2543 
2544 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
2545 		    "Extent header read failed, block %llu.\n"), offset);
2546 		return (mdstealerror(ep, &read_params.mde));
2547 	}
2548 
2549 	/* make sure magic number is correct */
2550 	if (wm->wm_magic != MD_SP_MAGIC) {
2551 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
2552 		    "found incorrect magic number %x, expected %x.\n"),
2553 		    wm->wm_magic, MD_SP_MAGIC);
2554 		/*
2555 		 * Pass NULL for the device name as we don't have
2556 		 * valid watermark contents.
2557 		 */
2558 		return (mdmderror(ep, MDE_SP_BADWMMAGIC, 0, NULL));
2559 	}
2560 
2561 	if (crcchk((uchar_t *)wm, (uint_t *)&wm->wm_checksum,
2562 	    sizeof (*wm), NULL)) {
2563 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
2564 		    "found incorrect checksum %x.\n"),
2565 		    wm->wm_checksum);
2566 		return (mdmderror(ep, MDE_SP_BADWMCRC, 0, wm->wm_mdname));
2567 	}
2568 
2569 	return (0);
2570 }
2571 
2572 /*
2573  * **************************************************************************
2574  *                  Query Functions
2575  * **************************************************************************
2576  */
2577 
2578 /*
2579  * IMPORTANT NOTE: This is a static function that assumes that
2580  *		   its input parameters have been checked and
2581  *		   have valid values that lie within acceptable
2582  *		   ranges.
2583  *
2584  * FUNCTION:	meta_sp_enough_space()
2585  * INPUT:	desired_number_of_sps - the number of soft partitions desired;
2586  *					must be > 0
2587  *		desired_sp_size - the desired soft partition size in blocks;
2588  *				  must be > 0
2589  *		extent_listpp - a reference to a reference to an extent
2590  *				list that lists the extents on a device;
2591  *				must be a reference to a reference to a
2592  *				valid extent list
2593  *		alignment - the desired data space alignment for the sp's
2594  * OUTPUT:	boolean_t return value
2595  * RETURNS:	boolean_t - B_TRUE if there's enough space in the extent
2596  *			    list to create the desired soft partitions,
2597  *			    B_FALSE if there's not enough space
2598  * PURPOSE:	determines whether there's enough free space in an extent
2599  *		list to allow creation of a set of soft partitions
2600  */
2601 static boolean_t
2602 meta_sp_enough_space(
2603 	int		desired_number_of_sps,
2604 	blkcnt_t	desired_sp_size,
2605 	sp_ext_node_t	**extent_listpp,
2606 	sp_ext_length_t	alignment
2607 )
2608 {
2609 	boolean_t		enough_space;
2610 	int			number_of_sps;
2611 	int			number_of_extents_used;
2612 	sp_ext_length_t		desired_ext_length = desired_sp_size;
2613 
2614 	enough_space = B_TRUE;
2615 	number_of_sps = 0;
2616 	while ((enough_space == B_TRUE) &&
2617 		(number_of_sps < desired_number_of_sps)) {
2618 		/*
2619 		 * Use the extent allocation algorithm implemented by
2620 		 * meta_sp_alloc_by_len() to test whether the free
2621 		 * extents in the extent list referenced by *extent_listpp
2622 		 * contain enough space to accomodate a soft partition
2623 		 * of size desired_ext_length.
2624 		 *
2625 		 * Repeat the test <desired_number_of_sps> times
2626 		 * or until it fails, whichever comes first,
2627 		 * each time allocating the extents required to
2628 		 * create the soft partition without actually
2629 		 * creating the soft partition.
2630 		 */
2631 		number_of_extents_used = meta_sp_alloc_by_len(
2632 						TEST_SETNAMEP,
2633 						TEST_SOFT_PARTITION_NAMEP,
2634 						extent_listpp,
2635 						&desired_ext_length,
2636 						NO_OFFSET,
2637 						alignment);
2638 		if (number_of_extents_used == -1) {
2639 			enough_space = B_FALSE;
2640 		} else {
2641 			number_of_sps++;
2642 		}
2643 	}
2644 	return (enough_space);
2645 }
2646 
2647 /*
2648  * IMPORTANT NOTE: This is a static function that calls other functions
2649  *		   that check its mdsetnamep and device_mdnamep
2650  *		   input parameters, but expects extent_listpp to
2651  *		   be a initialized to a valid address to which
2652  *		   it can write a reference to the extent list that
2653  *		   it creates.
2654  *
2655  * FUNCTION:	meta_sp_get_extent_list()
2656  * INPUT:	mdsetnamep - a reference to the mdsetname_t structure
2657  *			     for the set containing the device for
2658  *			     which the extents are to be listed
2659  *		device_mdnamep - a reference to the mdname_t structure
2660  *				 for the device for which the extents
2661  *				 are to be listed
2662  * OUTPUT:	*extent_listpp - a reference to the extent list for
2663  *				 the device; NULL if the function fails
2664  *		*ep - the libmeta error encountered, if any
2665  * RETURNS:	boolean_t - B_TRUE if the function call was successful,
2666  *			    B_FALSE if not
2667  * PURPOSE:	gets the extent list for a device
2668  */
2669 static boolean_t
2670 meta_sp_get_extent_list(
2671 	mdsetname_t	*mdsetnamep,
2672 	mdname_t	*device_mdnamep,
2673 	sp_ext_node_t	**extent_listpp,
2674 	md_error_t	*ep
2675 )
2676 {
2677 	diskaddr_t		device_size_in_blocks;
2678 	mdnamelist_t		*sp_name_listp;
2679 	diskaddr_t		start_block_address_in_blocks;
2680 
2681 	*extent_listpp = NULL;
2682 	sp_name_listp = NULL;
2683 
2684 	start_block_address_in_blocks = meta_sp_get_start(mdsetnamep,
2685 						device_mdnamep,
2686 						ep);
2687 	if (start_block_address_in_blocks == MD_DISKADDR_ERROR) {
2688 	    if (getenv(META_SP_DEBUG)) {
2689 		mde_perror(ep, "meta_sp_get_extent_list:meta_sp_get_start");
2690 	    }
2691 	    return (B_FALSE);
2692 	}
2693 
2694 	device_size_in_blocks = metagetsize(device_mdnamep, ep);
2695 	if (device_size_in_blocks == MD_DISKADDR_ERROR) {
2696 	    if (getenv(META_SP_DEBUG)) {
2697 		mde_perror(ep,
2698 		    "meta_sp_get_extent_list:metagetsize");
2699 	    }
2700 	    return (B_FALSE);
2701 	}
2702 
2703 	/*
2704 	 * Sanity check: the start block will have skipped an integer
2705 	 * number of cylinders, C.  C will usually be zero.  If (C > 0),
2706 	 * and the disk slice happens to only be C cylinders in total
2707 	 * size, we'll fail this check.
2708 	 */
2709 	if (device_size_in_blocks <=
2710 	    (start_block_address_in_blocks + MD_SP_WMSIZE)) {
2711 	    (void) mdmderror(ep, MDE_SP_NOSPACE, 0, device_mdnamep->cname);
2712 	    return (B_FALSE);
2713 	}
2714 
2715 	/*
2716 	 * After this point, we will have allocated resources, so any
2717 	 * failure returns must be through the supplied "fail" label
2718 	 * to properly deallocate things.
2719 	 */
2720 
2721 	/*
2722 	 * Create an empty extent list that starts one watermark past
2723 	 * the start block of the device and ends one watermark before
2724 	 * the end of the device.
2725 	 */
2726 	meta_sp_list_insert(TEST_SETNAMEP,
2727 			    TEST_SOFT_PARTITION_NAMEP,
2728 			    extent_listpp,
2729 			    NO_OFFSET,
2730 			    (sp_ext_length_t)start_block_address_in_blocks,
2731 			    EXTTYP_RESERVED,
2732 			    NO_SEQUENCE_NUMBER,
2733 			    NO_FLAGS,
2734 			    meta_sp_cmp_by_offset);
2735 	meta_sp_list_insert(TEST_SETNAMEP,
2736 			    TEST_SOFT_PARTITION_NAMEP,
2737 			    extent_listpp,
2738 			    (sp_ext_offset_t)(device_size_in_blocks -
2739 				MD_SP_WMSIZE),
2740 			    MD_SP_WMSIZE,
2741 			    EXTTYP_END,
2742 			    NO_SEQUENCE_NUMBER,
2743 			    NO_FLAGS,
2744 			    meta_sp_cmp_by_offset);
2745 
2746 	/*
2747 	 * Get the list of soft partitions that are already on the
2748 	 * device.
2749 	 */
2750 	if (meta_sp_get_by_component(mdsetnamep, device_mdnamep,
2751 	    &sp_name_listp, FORCE_RELOAD_CACHE, ep) < 1) {
2752 		if (getenv(META_SP_DEBUG)) {
2753 			mde_perror(ep,
2754 			    "meta_sp_get_extent_list:meta_sp_get_by_component");
2755 		}
2756 		goto fail;
2757 	}
2758 
2759 	if (sp_name_listp != NULL) {
2760 		/*
2761 		 * If there are soft partitions on the device, add the
2762 		 * extents used in them to the extent list.
2763 		 */
2764 		if (meta_sp_extlist_from_namelist(mdsetnamep, sp_name_listp,
2765 		    extent_listpp, ep) == -1) {
2766 			if (getenv(META_SP_DEBUG)) {
2767 				mde_perror(ep, "meta_sp_get_extent_list:"
2768 				    "meta_sp_extlist_from_namelist");
2769 			}
2770 			goto fail;
2771 		}
2772 		metafreenamelist(sp_name_listp);
2773 	}
2774 
2775 	/*
2776 	 * Add free extents to the extent list to represent
2777 	 * the remaining regions of free space on the
2778 	 * device.
2779 	 */
2780 	meta_sp_list_freefill(extent_listpp, device_size_in_blocks);
2781 	return (B_TRUE);
2782 
2783 fail:
2784 	if (sp_name_listp != NULL) {
2785 		metafreenamelist(sp_name_listp);
2786 	}
2787 
2788 	if (*extent_listpp != NULL) {
2789 		/*
2790 		 * meta_sp_list_free sets *extent_listpp to NULL.
2791 		 */
2792 		meta_sp_list_free(extent_listpp);
2793 	}
2794 	return (B_FALSE);
2795 }
2796 
2797 /*
2798  * IMPORTANT NOTE: This is a static function that calls other functions
2799  *		   that check its mdsetnamep and mddrivenamep
2800  *		   input parameters, but expects extent_listpp to
 *		   be initialized to a valid address to which
2802  *		   it can write a reference to the extent list that
2803  *		   it creates.
2804  *
2805  * FUNCTION:	meta_sp_get_extent_list_for_drive()
2806  * INPUT:	mdsetnamep - a reference to the mdsetname_t structure
2807  *			     for the set containing the drive for
2808  *			     which the extents are to be listed
2809  *		mddrivenamep   - a reference to the mddrivename_t structure
2810  *				 for the drive for which the extents
2811  *				 are to be listed
2812  * OUTPUT:	*extent_listpp - a reference to the extent list for
2813  *				 the drive; NULL if the function fails
2814  * RETURNS:	boolean_t - B_TRUE if the function call was successful,
2815  *			    B_FALSE if not
2816  * PURPOSE:	gets the extent list for a drive when the entire drive
2817  *		is to be soft partitioned
2818  */
2819 static boolean_t
2820 meta_sp_get_extent_list_for_drive(
2821 	mdsetname_t	*mdsetnamep,
2822 	mddrivename_t	*mddrivenamep,
2823 	sp_ext_node_t	**extent_listpp
2824 )
2825 {
2826 	boolean_t		can_use;
2827 	diskaddr_t		free_space;
2828 	md_error_t		mderror;
2829 	mdvtoc_t		proposed_vtoc;
2830 	int			repartition_options;
2831 	int			return_value;
2832 	md_sp_t			test_sp_struct;
2833 
2834 	can_use = B_TRUE;
2835 	*extent_listpp = NULL;
2836 	mderror = mdnullerror;
2837 	test_sp_struct.compnamep = metaslicename(mddrivenamep, MD_SLICE0,
2838 					&mderror);
2839 	if (test_sp_struct.compnamep == NULL) {
2840 		can_use = B_FALSE;
2841 	}
2842 
2843 	if (can_use == B_TRUE) {
2844 		mderror = mdnullerror;
2845 		repartition_options = 0;
2846 		return_value = meta_check_sp(mdsetnamep, &test_sp_struct,
2847 				MDCMD_USE_WHOLE_DISK, &repartition_options,
2848 				&mderror);
2849 		if (return_value != 0) {
2850 			can_use = B_FALSE;
2851 		}
2852 	}
2853 
2854 	if (can_use == B_TRUE) {
2855 		mderror = mdnullerror;
2856 		repartition_options = repartition_options |
2857 			(MD_REPART_FORCE | MD_REPART_DONT_LABEL);
2858 		return_value = meta_repartition_drive(mdsetnamep, mddrivenamep,
2859 				repartition_options, &proposed_vtoc, &mderror);
2860 		if (return_value != 0) {
2861 			can_use = B_FALSE;
2862 		}
2863 	}
2864 
2865 	if (can_use == B_TRUE) {
2866 		free_space = proposed_vtoc.parts[MD_SLICE0].size;
2867 		if (free_space <= (MD_SP_START + MD_SP_WMSIZE)) {
2868 			can_use = B_FALSE;
2869 		}
2870 	}
2871 
2872 	if (can_use == B_TRUE) {
2873 		/*
2874 		 * Create an extent list that starts with
2875 		 * a reserved extent that ends at the start
2876 		 * of the usable space on slice zero of the
2877 		 * proposed VTOC, ends with an extent that
2878 		 * reserves space for a watermark at the end
2879 		 * of slice zero, and contains a single free
2880 		 * extent that occupies the rest of the space
2881 		 * on the slice.
2882 		 *
2883 		 * NOTE:
2884 		 *
2885 		 * Don't use metagetstart() or metagetsize() to
2886 		 * find the usable space.  They query the mdname_t
2887 		 * structure that represents an actual device to
2888 		 * determine the amount of space on the device that
2889 		 * contains metadata and the total amount of space
2890 		 * on the device.  Since this function creates a
2891 		 * proposed extent list that doesn't reflect the
2892 		 * state of an actual device, there's no mdname_t
2893 		 * structure to be queried.
2894 		 *
2895 		 * When a drive is reformatted to prepare for
2896 		 * soft partitioning, all of slice seven is
2897 		 * reserved for metadata, all of slice zero is
2898 		 * available for soft partitioning, and all other
2899 		 * slices on the drive are empty.  The proposed
2900 		 * extent list for the drive therefore contains
2901 		 * only three extents: a reserved extent that ends
2902 		 * at the start of the usable space on slice zero,
2903 		 * a single free extent that occupies all the usable
2904 		 * space on slice zero, and an ending extent that
2905 		 * reserves space for a watermark at the end of
2906 		 * slice zero.
2907 		 */
2908 		meta_sp_list_insert(TEST_SETNAMEP,
2909 			TEST_SOFT_PARTITION_NAMEP,
2910 			extent_listpp,
2911 			NO_OFFSET,
2912 			(sp_ext_length_t)(MD_SP_START),
2913 			EXTTYP_RESERVED,
2914 			NO_SEQUENCE_NUMBER,
2915 			NO_FLAGS,
2916 			meta_sp_cmp_by_offset);
2917 		meta_sp_list_insert(TEST_SETNAMEP,
2918 			TEST_SOFT_PARTITION_NAMEP,
2919 			extent_listpp,
2920 			(sp_ext_offset_t)(free_space - MD_SP_WMSIZE),
2921 			MD_SP_WMSIZE,
2922 			EXTTYP_END,
2923 			NO_SEQUENCE_NUMBER,
2924 			NO_FLAGS,
2925 			meta_sp_cmp_by_offset);
2926 		meta_sp_list_freefill(extent_listpp, free_space);
2927 	}
2928 	return (can_use);
2929 }
2930 
2931 /*
2932  * FUNCTION:	meta_sp_can_create_sps()
2933  * INPUT:	mdsetnamep - a reference to the mdsetname_t structure
2934  *			     for the set containing the device for
2935  *			     which the extents are to be listed
2936  *		mdnamep - a reference to the mdname_t of the device
2937  *			  on which the soft parititions are to be created
2938  *		number_of_sps - the desired number of soft partitions
2939  *		sp_size - the desired soft partition size
2940  * OUTPUT:	boolean_t return value
2941  * RETURNS:	boolean_t - B_TRUE if the soft partitionns can be created,
2942  *			    B_FALSE if not
2943  * PURPOSE:	determines whether a set of soft partitions can be created
2944  *		on a device
2945  */
2946 boolean_t
2947 meta_sp_can_create_sps(
2948 	mdsetname_t	*mdsetnamep,
2949 	mdname_t	*mdnamep,
2950 	int		number_of_sps,
2951 	blkcnt_t	sp_size
2952 )
2953 {
2954 	sp_ext_node_t	*extent_listp;
2955 	boolean_t	succeeded;
2956 	md_error_t	mde;
2957 
2958 	if ((number_of_sps > 0) && (sp_size > 0)) {
2959 		succeeded = meta_sp_get_extent_list(mdsetnamep, mdnamep,
2960 						    &extent_listp, &mde);
2961 	} else {
2962 		succeeded = B_FALSE;
2963 	}
2964 
2965 	/*
2966 	 * We don't really care about an error return from the
2967 	 * alignment call; that will just result in passing zero,
2968 	 * which will be interpreted as no alignment.
2969 	 */
2970 
2971 	if (succeeded == B_TRUE) {
2972 		succeeded = meta_sp_enough_space(number_of_sps,
2973 		    sp_size, &extent_listp,
2974 		    meta_sp_get_default_alignment(mdsetnamep, mdnamep, &mde));
2975 		meta_sp_list_free(&extent_listp);
2976 	}
2977 	return (succeeded);
2978 }
2979 
2980 /*
2981  * FUNCTION:	meta_sp_can_create_sps_on_drive()
2982  * INPUT:	mdsetnamep - a reference to the mdsetname_t structure
2983  *			     for the set containing the drive for
2984  *			     which the extents are to be listed
2985  *		mddrivenamep - a reference to the mddrivename_t of the drive
2986  *			       on which the soft parititions are to be created
2987  *		number_of_sps - the desired number of soft partitions
2988  *		sp_size - the desired soft partition size
2989  * OUTPUT:	boolean_t return value
2990  * RETURNS:	boolean_t - B_TRUE if the soft partitionns can be created,
2991  *			    B_FALSE if not
2992  * PURPOSE:	determines whether a set of soft partitions can be created
2993  *		on a drive if the entire drive is soft partitioned
2994  */
2995 boolean_t
2996 meta_sp_can_create_sps_on_drive(
2997 	mdsetname_t	*mdsetnamep,
2998 	mddrivename_t	*mddrivenamep,
2999 	int		number_of_sps,
3000 	blkcnt_t	sp_size
3001 )
3002 {
3003 	sp_ext_node_t	*extent_listp;
3004 	boolean_t	succeeded;
3005 
3006 	if ((number_of_sps > 0) && (sp_size > 0)) {
3007 		succeeded = meta_sp_get_extent_list_for_drive(mdsetnamep,
3008 							mddrivenamep,
3009 							&extent_listp);
3010 	} else {
3011 		succeeded = B_FALSE;
3012 	}
3013 
3014 	/*
3015 	 * We don't care about alignment on the space call because
3016 	 * we're specifically dealing with a drive, which will have no
3017 	 * inherent alignment.
3018 	 */
3019 
3020 	if (succeeded == B_TRUE) {
3021 		succeeded = meta_sp_enough_space(number_of_sps, sp_size,
3022 		    &extent_listp, SP_UNALIGNED);
3023 		meta_sp_list_free(&extent_listp);
3024 	}
3025 	return (succeeded);
3026 }
3027 
3028 /*
3029  * FUNCTION:	meta_sp_get_free_space()
3030  * INPUT:	mdsetnamep - a reference to the mdsetname_t structure
3031  *			     for the set containing the device for
3032  *			     which the free space is to be returned
3033  *		mdnamep - a reference to the mdname_t of the device
3034  *			  for which the free space is to be returned
3035  * OUTPUT:	blkcnt_t return value
3036  * RETURNS:	blkcnt_t - the number of blocks of free space on the device
3037  * PURPOSE:	returns the number of blocks of free space on a device
3038  */
3039 blkcnt_t
3040 meta_sp_get_free_space(
3041 	mdsetname_t	*mdsetnamep,
3042 	mdname_t	*mdnamep
3043 )
3044 {
3045 	sp_ext_node_t		*extent_listp;
3046 	sp_ext_length_t		free_blocks;
3047 	boolean_t		succeeded;
3048 	md_error_t		mde;
3049 
3050 	extent_listp = NULL;
3051 	free_blocks = 0;
3052 	succeeded = meta_sp_get_extent_list(mdsetnamep, mdnamep,
3053 					    &extent_listp, &mde);
3054 	if (succeeded == B_TRUE) {
3055 		free_blocks = meta_sp_list_size(extent_listp,
3056 		    EXTTYP_FREE, INCLUDE_WM);
3057 		meta_sp_list_free(&extent_listp);
3058 		if (free_blocks > (10 * MD_SP_WMSIZE)) {
3059 			/*
3060 			 * Subtract a safety margin for watermarks when
3061 			 * computing the number of blocks available for
3062 			 * use.  The actual number of watermarks can't
3063 			 * be calculated without knowing the exact numbers
3064 			 * and sizes of both the free extents and the soft
3065 			 * partitions to be created.  The calculation is
3066 			 * highly complex and error-prone even if those
3067 			 * quantities are known.  The approximate value
3068 			 * 10 * MD_SP_WMSIZE is within a few blocks of the
3069 			 * correct value in all practical cases.
3070 			 */
3071 			free_blocks = free_blocks - (10 * MD_SP_WMSIZE);
3072 		} else {
3073 			free_blocks = 0;
3074 		}
3075 	} else {
3076 	    mdclrerror(&mde);
3077 	}
3078 
3079 	return (free_blocks);
3080 }
3081 
3082 /*
3083  * FUNCTION:	meta_sp_get_free_space_on_drive()
3084  * INPUT:	mdsetnamep - a reference to the mdsetname_t structure
3085  *			     for the set containing the drive for
3086  *			     which the free space is to be returned
3087  *		mddrivenamep - a reference to the mddrivename_t of the drive
3088  *			       for which the free space is to be returned
3089  * OUTPUT:	blkcnt_t return value
3090  * RETURNS:	blkcnt_t - the number of blocks of free space on the drive
3091  * PURPOSE:	returns the number of blocks of space usable for soft
3092  *		partitions on an entire drive, if the entire drive is
3093  *		soft partitioned
3094  */
3095 blkcnt_t
3096 meta_sp_get_free_space_on_drive(
3097 	mdsetname_t	*mdsetnamep,
3098 	mddrivename_t	*mddrivenamep
3099 )
3100 {
3101 	sp_ext_node_t		*extent_listp;
3102 	sp_ext_length_t		free_blocks;
3103 	boolean_t		succeeded;
3104 
3105 	extent_listp = NULL;
3106 	free_blocks = 0;
3107 	succeeded = meta_sp_get_extent_list_for_drive(mdsetnamep,
3108 			mddrivenamep, &extent_listp);
3109 	if (succeeded == B_TRUE) {
3110 		free_blocks = meta_sp_list_size(extent_listp,
3111 		    EXTTYP_FREE, INCLUDE_WM);
3112 		meta_sp_list_free(&extent_listp);
3113 		if (free_blocks > (10 * MD_SP_WMSIZE)) {
3114 			/*
3115 			 * Subtract a safety margin for watermarks when
3116 			 * computing the number of blocks available for
3117 			 * use.  The actual number of watermarks can't
3118 			 * be calculated without knowing the exact numbers
3119 			 * and sizes of both the free extents and the soft
3120 			 * partitions to be created.  The calculation is
3121 			 * highly complex and error-prone even if those
3122 			 * quantities are known.  The approximate value
3123 			 * 10 * MD_SP_WMSIZE is within a few blocks of the
3124 			 * correct value in all practical cases.
3125 			 */
3126 			free_blocks = free_blocks - (10 * MD_SP_WMSIZE);
3127 		} else {
3128 			free_blocks = 0;
3129 		}
3130 	}
3131 	return (free_blocks);
3132 }
3133 
3134 /*
3135  * FUNCTION:	meta_sp_get_number_of_possible_sps()
3136  * INPUT:	mdsetnamep - a reference to the mdsetname_t structure
3137  *			     for the set containing the device for
3138  *			     which the number of possible soft partitions
3139  *			     is to be returned
3140  *		mdnamep - a reference to the mdname_t of the device
3141  *			  for which the number of possible soft partitions
3142  *			  is to be returned
3143  * OUTPUT:	int return value
3144  * RETURNS:	int - the number of soft partitions of the desired size
3145  *		      that can be created on the device
3146  * PURPOSE:	returns the number of soft partitions of a given size
3147  *		that can be created on a device
3148  */
3149 int
3150 meta_sp_get_number_of_possible_sps(
3151 	mdsetname_t	*mdsetnamep,
3152 	mdname_t	*mdnamep,
3153 	blkcnt_t	sp_size
3154 )
3155 {
3156 	sp_ext_node_t	*extent_listp;
3157 	int		number_of_possible_sps;
3158 	boolean_t	succeeded;
3159 	md_error_t	mde;
3160 	sp_ext_length_t	alignment;
3161 
3162 	extent_listp = NULL;
3163 	number_of_possible_sps = 0;
3164 	if (sp_size > 0) {
3165 	    if ((succeeded = meta_sp_get_extent_list(mdsetnamep,
3166 		mdnamep, &extent_listp, &mde)) == B_FALSE)
3167 		mdclrerror(&mde);
3168 	} else {
3169 		succeeded = B_FALSE;
3170 	}
3171 
3172 	if (succeeded == B_TRUE) {
3173 		alignment = meta_sp_get_default_alignment(mdsetnamep,
3174 		    mdnamep, &mde);
3175 	}
3176 
3177 	while (succeeded == B_TRUE) {
3178 		/*
3179 		 * Keep allocating space from the extent list
3180 		 * for soft partitions of the desired size until
3181 		 * there's not enough free space left in the list
3182 		 * for another soft partiition of that size.
3183 		 * Add one to the number of possible soft partitions
3184 		 * for each soft partition for which there is
3185 		 * enough free space left.
3186 		 */
3187 		succeeded = meta_sp_enough_space(ONE_SOFT_PARTITION,
3188 		    sp_size, &extent_listp, alignment);
3189 		if (succeeded == B_TRUE) {
3190 			number_of_possible_sps++;
3191 		}
3192 	}
3193 	if (extent_listp != NULL) {
3194 		meta_sp_list_free(&extent_listp);
3195 	}
3196 	return (number_of_possible_sps);
3197 }
3198 
3199 /*
3200  * FUNCTION:	meta_sp_get_number_of_possible_sps_on_drive()
3201  * INPUT:	mdsetnamep - a reference to the mdsetname_t structure
3202  *			     for the set containing the drive for
3203  *			     which the number of possible soft partitions
3204  *			     is to be returned
3205  *		mddrivenamep - a reference to the mddrivename_t of the drive
3206  *			       for which the number of possible soft partitions
3207  *			       is to be returned
3208  *		sp_size - the size in blocks of the proposed soft partitions
3209  * OUTPUT:	int return value
3210  * RETURNS:	int - the number of soft partitions of the desired size
3211  *		      that can be created on the drive
3212  * PURPOSE:	returns the number of soft partitions of a given size
3213  *		that can be created on a drive, if the entire drive is
3214  *		soft partitioned
3215  */
3216 int
3217 meta_sp_get_number_of_possible_sps_on_drive(
3218 	mdsetname_t	*mdsetnamep,
3219 	mddrivename_t	*mddrivenamep,
3220 	blkcnt_t	sp_size
3221 )
3222 {
3223 	sp_ext_node_t	*extent_listp;
3224 	int		number_of_possible_sps;
3225 	boolean_t	succeeded;
3226 
3227 	extent_listp = NULL;
3228 	number_of_possible_sps = 0;
3229 	if (sp_size > 0) {
3230 		succeeded = meta_sp_get_extent_list_for_drive(mdsetnamep,
3231 					mddrivenamep, &extent_listp);
3232 	} else {
3233 		succeeded = B_FALSE;
3234 	}
3235 	while (succeeded == B_TRUE) {
3236 		/*
3237 		 * Keep allocating space from the extent list
3238 		 * for soft partitions of the desired size until
3239 		 * there's not enough free space left in the list
3240 		 * for another soft partition of that size.
3241 		 * Add one to the number of possible soft partitions
3242 		 * for each soft partition for which there is
3243 		 * enough free space left.
3244 		 *
3245 		 * Since it's a drive, not a metadevice, make no
3246 		 * assumptions about alignment.
3247 		 */
3248 		succeeded = meta_sp_enough_space(ONE_SOFT_PARTITION,
3249 		    sp_size, &extent_listp, SP_UNALIGNED);
3250 		if (succeeded == B_TRUE) {
3251 			number_of_possible_sps++;
3252 		}
3253 	}
3254 	if (extent_listp != NULL) {
3255 		meta_sp_list_free(&extent_listp);
3256 	}
3257 	return (number_of_possible_sps);
3258 }
3259 
3260 /*
3261  * FUNCTION:	meta_sp_get_possible_sp_size()
3262  * INPUT:	mdsetnamep - a reference to the mdsetname_t structure
3263  *			     for the set containing the device for
3264  *			     which the possible soft partition size
3265  *			     is to be returned
3266  *		mdnamep - a reference to the mdname_t of the device
3267  *			  for which the possible soft partition size
3268  *			  is to be returned
3269  *		number_of_sps - the desired number of soft partitions
3270  * OUTPUT:	blkcnt_t return value
3271  * RETURNS:	blkcnt_t - the possible soft partition size in blocks
3272  * PURPOSE:	returns the maximum possible size of each of a given number of
3273  *		soft partitions of equal size that can be created on a device
3274  */
3275 blkcnt_t
3276 meta_sp_get_possible_sp_size(
3277 	mdsetname_t	*mdsetnamep,
3278 	mdname_t	*mdnamep,
3279 	int		number_of_sps
3280 )
3281 {
3282 	blkcnt_t	free_blocks;
3283 	blkcnt_t	sp_size;
3284 	boolean_t	succeeded;
3285 
3286 	sp_size = 0;
3287 	if (number_of_sps > 0) {
3288 		free_blocks = meta_sp_get_free_space(mdsetnamep, mdnamep);
3289 		sp_size = free_blocks / number_of_sps;
3290 		succeeded = meta_sp_can_create_sps(mdsetnamep, mdnamep,
3291 						number_of_sps, sp_size);
3292 		while ((succeeded == B_FALSE) && (sp_size > 0)) {
3293 			/*
3294 			 * To compensate for space that may have been
3295 			 * occupied by watermarks, reduce sp_size by a
3296 			 * number of blocks equal to the number of soft
3297 			 * partitions desired, and test again to see
3298 			 * whether the desired number of soft partitions
3299 			 * can be created.
3300 			 */
3301 			sp_size = sp_size - ((blkcnt_t)number_of_sps);
3302 			succeeded = meta_sp_can_create_sps(mdsetnamep, mdnamep,
3303 							number_of_sps, sp_size);
3304 		}
3305 		if (sp_size < 0) {
3306 			sp_size = 0;
3307 		}
3308 	}
3309 	return (sp_size);
3310 }
3311 
3312 /*
3313  * FUNCTION:	meta_sp_get_possible_sp_size_on_drive()
3314  * INPUT:	mdsetnamep - a reference to the mdsetname_t structure
3315  *			     for the set containing the drive for
3316  *			     which the possible soft partition size
3317  *			     is to be returned
3318  *		mddrivenamep - a reference to the mddrivename_t of the drive
3319  *			       for which the possible soft partition size
3320  *			       is to be returned
3321  *		number_of_sps - the desired number of soft partitions
3322  * OUTPUT:	blkcnt_t return value
3323  * RETURNS:	blkcnt_t - the possible soft partition size in blocks
3324  * PURPOSE:	returns the maximum possible size of each of a given number of
3325  *		soft partitions of equal size that can be created on a drive
3326  *              if the entire drive is soft partitioned
3327  */
3328 blkcnt_t
3329 meta_sp_get_possible_sp_size_on_drive(
3330 	mdsetname_t	*mdsetnamep,
3331 	mddrivename_t	*mddrivenamep,
3332 	int		number_of_sps
3333 )
3334 {
3335 	blkcnt_t	free_blocks;
3336 	blkcnt_t	sp_size;
3337 	boolean_t	succeeded;
3338 
3339 	sp_size = 0;
3340 	if (number_of_sps > 0) {
3341 		free_blocks = meta_sp_get_free_space_on_drive(mdsetnamep,
3342 								mddrivenamep);
3343 		sp_size = free_blocks / number_of_sps;
3344 		succeeded = meta_sp_can_create_sps_on_drive(mdsetnamep,
3345 						mddrivenamep,
3346 						number_of_sps, sp_size);
3347 		while ((succeeded == B_FALSE) && (sp_size > 0)) {
3348 			/*
3349 			 * To compensate for space that may have been
3350 			 * occupied by watermarks, reduce sp_size by a
3351 			 * number of blocks equal to the number of soft
3352 			 * partitions desired, and test again to see
3353 			 * whether the desired number of soft partitions
3354 			 * can be created.
3355 			 */
3356 			sp_size = sp_size - ((blkcnt_t)number_of_sps);
3357 			succeeded = meta_sp_can_create_sps_on_drive(mdsetnamep,
3358 							mddrivenamep,
3359 							number_of_sps, sp_size);
3360 		}
3361 		if (sp_size < 0) {
3362 			sp_size = 0;
3363 		}
3364 	}
3365 	return (sp_size);
3366 }
3367 
3368 /*
3369  * **************************************************************************
3370  *                  Unit Structure Manipulation Functions                   *
3371  * **************************************************************************
3372  */
3373 
3374 /*
3375  * FUNCTION:	meta_sp_fillextarray()
3376  * INPUT:	mp	- the unit structure to fill
3377  *		extlist	- the list of extents to fill with
3378  * OUTPUT:	none
3379  * RETURNS:	void
3380  * PURPOSE:	fills in the unit structure extent list with the extents
3381  *		specified by extlist.  Only extents in extlist with the
3382  *		EXTFLG_UPDATE flag are changed in the unit structure,
3383  *		and the index into the unit structure is the sequence
3384  *		number in the extent list.  After all of the nodes have
3385  *		been updated the virtual offsets in the unit structure
3386  *		are updated to reflect the new lengths.
3387  */
3388 static void
3389 meta_sp_fillextarray(
3390 	mp_unit_t	*mp,
3391 	sp_ext_node_t	*extlist
3392 )
3393 {
3394 	int	i;
3395 	sp_ext_node_t	*ext;
3396 	sp_ext_offset_t	curvoff = 0LL;
3397 
3398 	assert(mp != NULL);
3399 
3400 	/* go through the allocation list and fill in our unit structure */
3401 	for (ext = extlist; ext != NULL; ext = ext->ext_next) {
3402 		if ((ext->ext_type == EXTTYP_ALLOC) &&
3403 		    (ext->ext_flags & EXTFLG_UPDATE) != 0) {
3404 			mp->un_ext[ext->ext_seq].un_poff =
3405 			    ext->ext_offset + MD_SP_WMSIZE;
3406 			mp->un_ext[ext->ext_seq].un_len =
3407 			    ext->ext_length - MD_SP_WMSIZE;
3408 		}
3409 	}
3410 
3411 	for (i = 0; i < mp->un_numexts; i++) {
3412 		assert(mp->un_ext[i].un_poff != 0);
3413 		assert(mp->un_ext[i].un_len  != 0);
3414 		mp->un_ext[i].un_voff = curvoff;
3415 		curvoff += mp->un_ext[i].un_len;
3416 	}
3417 }
3418 
3419 /*
3420  * FUNCTION:	meta_sp_createunit()
3421  * INPUT:	np	- the name of the device to create a unit structure for
3422  *		compnp	- the name of the device the soft partition is on
3423  *		extlist	- the extent list to populate the new unit with
3424  *		numexts	- the number of extents in the extent list
3425  *		len	- the total size of the soft partition (sectors)
3426  *		status	- the initial status of the unit structure
3427  * OUTPUT:	ep	- return error pointer
3428  * RETURNS:	mp_unit_t * - the new unit structure.
3429  * PURPOSE:	allocates and fills in a new soft partition unit
3430  *		structure to be passed to the soft partitioning driver
3431  *		for creation.
3432  */
3433 static mp_unit_t *
3434 meta_sp_createunit(
3435 	mdname_t	*np,
3436 	mdname_t	*compnp,
3437 	sp_ext_node_t	*extlist,
3438 	int		numexts,
3439 	sp_ext_length_t	len,
3440 	sp_status_t	status,
3441 	md_error_t	*ep
3442 )
3443 {
3444 	mp_unit_t	*mp;
3445 	uint_t		ms_size;
3446 
3447 	ms_size = (sizeof (*mp) - sizeof (mp->un_ext[0])) +
3448 	    (numexts * sizeof (mp->un_ext[0]));
3449 
3450 	mp = Zalloc(ms_size);
3451 
3452 	/* fill in fields in common unit structure */
3453 	mp->c.un_type = MD_METASP;
3454 	mp->c.un_size = ms_size;
3455 	MD_SID(mp) = meta_getminor(np->dev);
3456 	mp->c.un_total_blocks = len;
3457 	mp->c.un_actual_tb = len;
3458 
3459 	/* set up geometry */
3460 	(void) meta_sp_setgeom(np, compnp, mp, ep);
3461 
3462 	/* if we're building on metadevice we can't parent */
3463 	if (metaismeta(compnp))
3464 		MD_CAPAB(mp) = MD_CANT_PARENT;
3465 	else
3466 		MD_CAPAB(mp) = MD_CAN_PARENT;
3467 
3468 	/* fill soft partition-specific fields */
3469 	mp->un_dev = compnp->dev;
3470 	mp->un_key = compnp->key;
3471 
3472 	/* mdname_t start_blk field is not 64-bit! */
3473 	mp->un_start_blk = (sp_ext_offset_t)compnp->start_blk;
3474 	mp->un_status = status;
3475 	mp->un_numexts = numexts;
3476 	mp->un_length = len;
3477 
3478 	/* fill in the extent array */
3479 	meta_sp_fillextarray(mp, extlist);
3480 
3481 	return (mp);
3482 }
3483 
3484 /*
3485  * FUNCTION:	meta_sp_updateunit()
3486  * INPUT:	np       - name structure for the metadevice being updated
3487  *		old_un	 - the original unit structure that is being updated
3488  *		extlist	 - the extent list to populate the new unit with
3489  *		grow_len - the amount by which the partition is being grown
3490  *		numexts	 - the number of extents in the extent list
3491  *		ep       - return error pointer
3492  * OUTPUT:	none
3493  * RETURNS:	mp_unit_t * - the updated unit structure
3494  * PURPOSE:	allocates and fills in a new soft partition unit structure to
3495  *		be passed to the soft partitioning driver for creation.  The
3496  *		old unit structure is first copied in, and then the updated
3497  *		extents are changed in the new unit structure.  This is
3498  *		typically used when the size of an existing unit is changed.
3499  */
3500 static mp_unit_t *
3501 meta_sp_updateunit(
3502 	mdname_t	*np,
3503 	mp_unit_t	*old_un,
3504 	sp_ext_node_t	*extlist,
3505 	sp_ext_length_t	grow_len,
3506 	int		numexts,
3507 	md_error_t	*ep
3508 )
3509 {
3510 	mp_unit_t	*new_un;
3511 	sp_ext_length_t	new_len;
3512 	uint_t		new_size;
3513 
3514 	assert(old_un != NULL);
3515 	assert(extlist != NULL);
3516 
3517 	/* allocate new unit structure and copy in old unit */
3518 	new_size = (sizeof (*old_un) - sizeof (old_un->un_ext[0])) +
3519 	    ((old_un->un_numexts + numexts) * sizeof (old_un->un_ext[0]));
3520 	new_len = old_un->un_length + grow_len;
3521 	new_un = Zalloc(new_size);
3522 	bcopy(old_un, new_un, old_un->c.un_size);
3523 
3524 	/* update size and geometry information */
3525 	new_un->c.un_size = new_size;
3526 	new_un->un_length = new_len;
3527 	new_un->c.un_total_blocks = new_len;
3528 	new_un->c.un_actual_tb = new_len;
3529 	if (meta_adjust_geom((md_unit_t *)new_un, np,
3530 	    old_un->c.un_wr_reinstruct, old_un->c.un_rd_reinstruct,
3531 	    0, ep) != 0) {
3532 		Free(new_un);
3533 		return (NULL);
3534 	}
3535 
3536 	/* update extent information */
3537 	new_un->un_numexts += numexts;
3538 
3539 	meta_sp_fillextarray(new_un, extlist);
3540 
3541 	return (new_un);
3542 }
3543 
3544 /*
3545  * FUNCTION:	meta_get_sp()
3546  * INPUT:	sp	- the set name for the device to get
3547  *		np	- the name of the device to get
3548  * OUTPUT:	ep	- return error pointer
3549  * RETURNS:	md_sp_t * - the XDR unit structure for the soft partition
3550  * PURPOSE:	interface to the rest of libmeta for fetching a unit structure
3551  *		for the named device.  Just a wrapper for meta_get_sp_common().
3552  */
3553 md_sp_t *
3554 meta_get_sp(
3555 	mdsetname_t	*sp,
3556 	mdname_t	*np,
3557 	md_error_t	*ep
3558 )
3559 {
3560 	return (meta_get_sp_common(sp, np, 0, ep));
3561 }
3562 
3563 /*
3564  * FUNCTION:	meta_get_sp_common()
3565  * INPUT:	sp	- the set name for the device to get
3566  *		np	- the name of the device to get
3567  *		fast	- whether to use the cache or not (NOT IMPLEMENTED!)
3568  * OUTPUT:	ep	- return error pointer
3569  * RETURNS:	md_sp_t * - the XDR unit structure for the soft partition,
3570  *			    NULL if np is not a soft partition
3571  * PURPOSE:	common routine for fetching a soft partition unit structure
3572  */
3573 md_sp_t *
3574 meta_get_sp_common(
3575 	mdsetname_t	*sp,
3576 	mdname_t	*np,
3577 	int		fast,
3578 	md_error_t	*ep
3579 )
3580 {
3581 	mddrivename_t	*dnp = np->drivenamep;
3582 	char		*miscname;
3583 	mp_unit_t	*mp;
3584 	md_sp_t		*msp;
3585 	int		i;
3586 
3587 	/* must have set */
3588 	assert(sp != NULL);
3589 
3590 	/* short circuit */
3591 	if (dnp->unitp != NULL) {
3592 		if (dnp->unitp->type != MD_METASP)
3593 			return (NULL);
3594 		return ((md_sp_t *)dnp->unitp);
3595 	}
3596 	/* get miscname and unit */
3597 	if ((miscname = metagetmiscname(np, ep)) == NULL)
3598 		return (NULL);
3599 
3600 	if (strcmp(miscname, MD_SP) != 0) {
3601 		(void) mdmderror(ep, MDE_NOT_SP, 0, np->cname);
3602 		return (NULL);
3603 	}
3604 
3605 	if ((mp = (mp_unit_t *)meta_get_mdunit(sp, np, ep)) == NULL)
3606 		return (NULL);
3607 
3608 	assert(mp->c.un_type == MD_METASP);
3609 
3610 	/* allocate soft partition */
3611 	msp = Zalloc(sizeof (*msp));
3612 
3613 	/* get the common information */
3614 	msp->common.namep = np;
3615 	msp->common.type = mp->c.un_type;
3616 	msp->common.state = mp->c.un_status;
3617 	msp->common.capabilities = mp->c.un_capabilities;
3618 	msp->common.parent = mp->c.un_parent;
3619 	msp->common.size = mp->c.un_total_blocks;
3620 	msp->common.user_flags = mp->c.un_user_flags;
3621 	msp->common.revision = mp->c.un_revision;
3622 
3623 	/* get soft partition information */
3624 	if ((msp->compnamep = metakeyname(&sp, mp->un_key, fast, ep)) == NULL)
3625 		goto out;
3626 
3627 	/*
3628 	 * Fill in the key and the start block.  Note that the start
3629 	 * block in the unit structure is 64 bits but the name pointer
3630 	 * only supports 32 bits.
3631 	 */
3632 	msp->compnamep->key = mp->un_key;
3633 	msp->compnamep->start_blk = mp->un_start_blk;
3634 
3635 	/* fill in status field */
3636 	msp->status = mp->un_status;
3637 
3638 	/* allocate the extents */
3639 	msp->ext.ext_val = Zalloc(mp->un_numexts * sizeof (*msp->ext.ext_val));
3640 	msp->ext.ext_len = mp->un_numexts;
3641 
3642 	/* do the extents for this soft partition */
3643 	for (i = 0; i < mp->un_numexts; i++) {
3644 		struct mp_ext	*mde = &mp->un_ext[i];
3645 		md_sp_ext_t	*extp = &msp->ext.ext_val[i];
3646 
3647 		extp->voff = mde->un_voff;
3648 		extp->poff = mde->un_poff;
3649 		extp->len = mde->un_len;
3650 	}
3651 
3652 	/* cleanup, return success */
3653 	Free(mp);
3654 	dnp->unitp = (md_common_t *)msp;
3655 	return (msp);
3656 
3657 out:
3658 	/* clean up and return error */
3659 	Free(mp);
3660 	Free(msp);
3661 	return (NULL);
3662 }
3663 
3664 
3665 /*
3666  * FUNCTION:	meta_init_sp()
3667  * INPUT:	spp	- the set name for the new device
3668  *		argc	- the remaining argument count for the metainit cmdline
3669  *		argv	- the remainder of the unparsed command line
3670  *		options	- global options parsed by metainit
3671  * OUTPUT:	ep	- return error pointer
3672  * RETURNS:	int	- -1 failure, 0 success
3673  * PURPOSE:	provides the command line parsing and name management overhead
3674  *		for creating a new soft partition.  Ultimately this calls
3675  *		meta_create_sp() which does the real work of allocating space
3676  *		for the new soft partition.
3677  */
3678 int
3679 meta_init_sp(
3680 	mdsetname_t	**spp,
3681 	int		argc,
3682 	char		*argv[],
3683 	mdcmdopts_t	options,
3684 	md_error_t	*ep
3685 )
3686 {
3687 	char		*compname = NULL;
3688 	mdname_t	*spcompnp = NULL;	/* name of component volume */
3689 	char		*devname = argv[0];	/* unit name */
3690 	mdname_t	*np = NULL;		/* name of soft partition */
3691 	md_sp_t		*msp = NULL;
3692 	int		c;
3693 	int		old_optind;
3694 	sp_ext_length_t	len = 0LL;
3695 	int		rval = -1;
3696 	uint_t		seq;
3697 	int		oflag;
3698 	int		failed;
3699 	mddrivename_t	*dnp = NULL;
3700 	sp_ext_length_t	alignment = 0LL;
3701 	sp_ext_node_t	*extlist = NULL;
3702 
3703 	assert(argc > 0);
3704 
3705 	/* expect sp name, -p, optional -e, compname, and size parameters */
3706 	/* grab soft partition name */
3707 	if ((np = metaname(spp, devname, ep)) == NULL)
3708 		goto out;
3709 
3710 	/* see if it exists already */
3711 	if (metagetmiscname(np, ep) != NULL) {
3712 		(void) mdmderror(ep, MDE_UNIT_ALREADY_SETUP,
3713 		    meta_getminor(np->dev), devname);
3714 		goto out;
3715 	} else if (! mdismderror(ep, MDE_UNIT_NOT_SETUP)) {
3716 		goto out;
3717 	} else {
3718 		mdclrerror(ep);
3719 	}
3720 	--argc, ++argv;
3721 
3722 	if (argc == 0)
3723 		goto syntax;
3724 
3725 	/* grab -p */
3726 	if (strcmp(argv[0], "-p") != 0)
3727 		goto syntax;
3728 	--argc, ++argv;
3729 
3730 	if (argc == 0)
3731 		goto syntax;
3732 
3733 	/* see if -e is there */
3734 	if (strcmp(argv[0], "-e") == 0) {
3735 		/* use the whole disk */
3736 		options |= MDCMD_USE_WHOLE_DISK;
3737 		--argc, ++argv;
3738 	}
3739 
3740 	if (argc == 0)
3741 		goto syntax;
3742 
3743 	/* get component name */
3744 	compname = Strdup(argv[0]);
3745 
3746 	if (options & MDCMD_USE_WHOLE_DISK) {
3747 		if ((dnp = metadrivename(spp, compname, ep)) == NULL) {
3748 			goto out;
3749 		}
3750 		if ((spcompnp = metaslicename(dnp, 0, ep)) == NULL) {
3751 			goto out;
3752 		}
3753 	} else if ((spcompnp = metaname(spp, compname, ep)) == NULL) {
3754 		goto out;
3755 	}
3756 	assert(*spp != NULL);
3757 
3758 	if (!(options & MDCMD_NOLOCK)) {
3759 		/* grab set lock */
3760 		if (meta_lock(*spp, TRUE, ep))
3761 			goto out;
3762 
3763 		if (meta_check_ownership(*spp, ep) != 0)
3764 			goto out;
3765 	}
3766 
3767 	/* allocate the soft partition */
3768 	msp = Zalloc(sizeof (*msp));
3769 
3770 	/* setup common */
3771 	msp->common.namep = np;
3772 	msp->common.type = MD_METASP;
3773 
3774 	compname = spcompnp->cname;
3775 
3776 	assert(spcompnp->rname != NULL);
3777 	--argc, ++argv;
3778 
3779 	if (argc == 0) {
3780 		goto syntax;
3781 	}
3782 
3783 	if (*argv[0] == '-') {
3784 		/*
3785 		 * parse any other command line options, this includes
3786 		 * the recovery options -o and -b. The special thing
3787 		 * with these options is that the len needs to be
3788 		 * kept track of otherwise when the geometry of the
3789 		 * "device" is built it will create an invalid geometry
3790 		 */
3791 		old_optind = optind = 0;
3792 		opterr = 0;
3793 		oflag = 0;
3794 		seq = 0;
3795 		failed = 0;
3796 		while ((c = getopt(argc, argv, "A:o:b:")) != -1) {
3797 			sp_ext_offset_t	offset;
3798 			sp_ext_length_t	length;
3799 			longlong_t	tmp_size;
3800 
3801 			switch (c) {
3802 			case 'A':	/* data alignment */
3803 				if (meta_sp_parsesizestring(optarg,
3804 					&alignment) == -1) {
3805 					failed = 1;
3806 				}
3807 				break;
3808 			case 'o':	/* offset in the partition */
3809 				if (oflag == 1) {
3810 					failed = 1;
3811 				} else {
3812 					tmp_size = atoll(optarg);
3813 					if (tmp_size <= 0) {
3814 						failed = 1;
3815 					} else {
3816 						oflag = 1;
3817 						options |= MDCMD_DIRECT;
3818 
3819 						offset = tmp_size;
3820 					}
3821 				}
3822 
3823 				break;
3824 			case 'b':	/* number of blocks */
3825 				if (oflag == 0) {
3826 					failed = 1;
3827 				} else {
3828 					tmp_size = atoll(optarg);
3829 					if (tmp_size <= 0) {
3830 						failed = 1;
3831 					} else {
3832 						oflag = 0;
3833 
3834 						length = tmp_size;
3835 
3836 						/* we have a pair of values */
3837 						meta_sp_list_insert(*spp, np,
3838 							&extlist, offset,
3839 							length, EXTTYP_ALLOC,
3840 							seq++, EXTFLG_UPDATE,
3841 							meta_sp_cmp_by_offset);
3842 						len += length;
3843 					}
3844 				}
3845 
3846 				break;
3847 			default:
3848 				argc -= old_optind;
3849 				argv += old_optind;
3850 				goto options;
3851 			}
3852 
3853 			if (failed) {
3854 				argc -= old_optind;
3855 				argv += old_optind;
3856 				goto syntax;
3857 			}
3858 
3859 			old_optind = optind;
3860 		}
3861 		argc -= optind;
3862 		argv += optind;
3863 
3864 		/*
3865 		 * Must have matching pairs of -o and -b flags
3866 		 */
3867 		if (oflag != 0)
3868 			goto syntax;
3869 
3870 		/*
3871 		 * Can't specify both layout (indicated indirectly by
3872 		 * len being set by the -o/-b cases above) AND
3873 		 * alignment
3874 		 */
3875 		if ((len > 0LL) && (alignment > 0LL))
3876 			goto syntax;
3877 
3878 		/*
3879 		 * sanity check the allocation list
3880 		 */
3881 		if ((extlist != NULL) && meta_sp_list_overlaps(extlist))
3882 			goto syntax;
3883 	}
3884 
3885 	if (len == 0LL) {
3886 		if (argc == 0)
3887 			goto syntax;
3888 		if (meta_sp_parsesize(argv[0], &len) == -1)
3889 			goto syntax;
3890 		--argc, ++argv;
3891 	}
3892 
3893 	msp->ext.ext_val = Zalloc(sizeof (*msp->ext.ext_val));
3894 	msp->ext.ext_val->len = len;
3895 	msp->compnamep = spcompnp;
3896 
3897 	/* we should be at the end */
3898 	if (argc != 0)
3899 		goto syntax;
3900 
3901 	/* create soft partition */
3902 	if (meta_create_sp(*spp, msp, extlist, options, alignment, ep) != 0)
3903 		goto out;
3904 	rval = 0;
3905 
3906 	/* let em know */
3907 	if (options & MDCMD_PRINT) {
3908 		(void) printf(dgettext(TEXT_DOMAIN,
3909 		    "%s: Soft Partition is setup\n"),
3910 		    devname);
3911 		(void) fflush(stdout);
3912 	}
3913 	goto out;
3914 
3915 syntax:
3916 	/* syntax error */
3917 	rval = meta_cook_syntax(ep, MDE_SYNTAX, compname, argc, argv);
3918 	goto out;
3919 
3920 options:
3921 	/* options error */
3922 	rval = meta_cook_syntax(ep, MDE_OPTION, compname, argc, argv);
3923 	goto out;
3924 
3925 out:
3926 	if (msp != NULL) {
3927 		if (msp->ext.ext_val != NULL) {
3928 			Free(msp->ext.ext_val);
3929 		}
3930 		Free(msp);
3931 	}
3932 
3933 	return (rval);
3934 }
3935 
3936 /*
3937  * FUNCTION:	meta_free_sp()
3938  * INPUT:	msp	- the soft partition unit to free
3939  * OUTPUT:	none
3940  * RETURNS:	void
3941  * PURPOSE:	provides an interface from the rest of libmeta for freeing a
3942  *		soft partition unit
3943  */
3944 void
3945 meta_free_sp(md_sp_t *msp)
3946 {
3947 	Free(msp);
3948 }
3949 
3950 /*
3951  * FUNCTION:	meta_sp_issp()
3952  * INPUT:	sp	- the set name to check
3953  *		np	- the name to check
3954  * OUTPUT:	ep	- return error pointer
3955  * RETURNS:	int	- 0 means sp,np is a soft partition
3956  *			  1 means sp,np is not a soft partition
3957  * PURPOSE:	determines whether the given device is a soft partition
3958  *		device.  This is called by other metadevice check routines.
3959  */
3960 int
3961 meta_sp_issp(
3962 	mdsetname_t	*sp,
3963 	mdname_t	*np,
3964 	md_error_t	*ep
3965 )
3966 {
3967 	if (meta_get_sp_common(sp, np, 0, ep) == NULL)
3968 		return (1);
3969 
3970 	return (0);
3971 }
3972 
3973 /*
3974  * FUNCTION:	meta_check_sp()
3975  * INPUT:	sp	- the set name to check
3976  *		msp	- the unit structure to check
3977  *		options	- creation options
3978  * OUTPUT:	repart_options - options to be passed to
3979  *				meta_repartition_drive()
3980  *		ep	- return error pointer
3981  * RETURNS:	int	-  0 ok to create on this component
3982  *			  -1 error or not ok to create on this component
3983  * PURPOSE:	Checks to determine whether the rules for creation of
3984  *		soft partitions allow creation of a soft partition on
3985  *		the device described by the mdname_t structure referred
3986  *		to by msp->compnamep.
3987  *
3988  *		NOTE: Does NOT check to determine whether the extents
3989  *		      described in the md_sp_t structure referred to by
3990  *		      msp will fit on the device described by the mdname_t
3991  *		      structure located at msp->compnamep.
3992  */
3993 static int
3994 meta_check_sp(
3995 	mdsetname_t	*sp,
3996 	md_sp_t		*msp,
3997 	mdcmdopts_t	options,
3998 	int		*repart_options,
3999 	md_error_t	*ep
4000 )
4001 {
4002 	md_common_t	*mdp;
4003 	mdname_t	*compnp = msp->compnamep;
4004 	uint_t		slice;
4005 	mddrivename_t	*dnp;
4006 	mdname_t	*slicenp;
4007 	mdvtoc_t	*vtocp;
4008 
4009 	/* make sure it is in the set */
4010 	if (meta_check_inset(sp, compnp, ep) != 0)
4011 		return (-1);
4012 
4013 	if ((options & MDCMD_USE_WHOLE_DISK) != 0) {
4014 		uint_t	rep_slice;
4015 
4016 		/*
4017 		 * check to make sure we can partition this drive.
4018 		 * we cannot continue if any of the following are
4019 		 * true:
4020 		 * The drive is a metadevice.
4021 		 * The drive contains a mounted slice.
4022 		 * The drive contains a slice being swapped to.
4023 		 * The drive contains slices which are part of other
4024 		 * metadevices.
4025 		 * The drive contains a metadb.
4026 		 */
4027 		if (metaismeta(compnp))
4028 			return (mddeverror(ep, MDE_IS_META, compnp->dev,
4029 			    compnp->cname));
4030 
4031 		assert(compnp->drivenamep != NULL);
4032 
4033 		/*
4034 		 * ensure that we have slice 0 since the disk will be
4035 		 * repartitioned in the USE_WHOLE_DISK case.  this check
4036 		 * is redundant unless the user incorrectly specifies a
4037 		 * a fully qualified drive AND slice name (i.e.,
4038 		 * /dev/dsk/cXtXdXsX), which will be incorrectly
4039 		 * recognized as a drive name by the metaname code.
4040 		 */
4041 
4042 		if ((vtocp = metagetvtoc(compnp, FALSE, &slice, ep)) == NULL)
4043 			return (-1);
4044 		if (slice != MD_SLICE0)
4045 			return (mderror(ep, MDE_NOT_DRIVENAME, compnp->cname));
4046 
4047 		dnp = compnp->drivenamep;
4048 		if (meta_replicaslice(dnp, &rep_slice, ep) != 0)
4049 			return (-1);
4050 
4051 		for (slice = 0; slice < vtocp->nparts; slice++) {
4052 
4053 			/* only check if the slice really exists */
4054 			if (vtocp->parts[slice].size == 0)
4055 				continue;
4056 
4057 			slicenp = metaslicename(dnp, slice, ep);
4058 			if (slicenp == NULL)
4059 				return (-1);
4060 
4061 			/* check to ensure that it is not already in use */
4062 			if (meta_check_inuse(sp,
4063 			    slicenp, MDCHK_INUSE, ep) != 0) {
4064 				return (-1);
4065 			}
4066 
4067 			/*
4068 			 * Up to this point, tests are applied to all
4069 			 * slices uniformly.
4070 			 */
4071 
4072 			if (slice == rep_slice) {
4073 				/*
4074 				 * Tests inside the body of this
4075 				 * conditional are applied only to
4076 				 * slice seven.
4077 				 */
4078 				if (meta_check_inmeta(sp, slicenp,
4079 				    options | MDCHK_ALLOW_MDDB |
4080 				    MDCHK_ALLOW_REPSLICE, 0, -1, ep) != 0)
4081 					return (-1);
4082 
4083 				/*
4084 				 * For slice seven, a metadb is NOT an
4085 				 * automatic failure. It merely means
4086 				 * that we're not allowed to muck
4087 				 * about with the partitioning of that
4088 				 * slice.  We indicate this by masking
4089 				 * in the MD_REPART_LEAVE_REP flag.
4090 				 */
4091 				if (metahasmddb(sp, slicenp, ep)) {
4092 					assert(repart_options !=
4093 					    NULL);
4094 					*repart_options |=
4095 					    MD_REPART_LEAVE_REP;
4096 				}
4097 
4098 				/*
4099 				 * Skip the remaining tests for slice
4100 				 * seven
4101 				 */
4102 				continue;
4103 			}
4104 
4105 			/*
4106 			 * Tests below this point will be applied to
4107 			 * all slices EXCEPT for the replica slice.
4108 			 */
4109 
4110 
4111 			/* check if component is in a metadevice */
4112 			if (meta_check_inmeta(sp, slicenp, options, 0,
4113 			    -1, ep) != 0)
4114 				return (-1);
4115 
4116 			/* check to see if component has a metadb */
4117 			if (metahasmddb(sp, slicenp, ep))
4118 				return (mddeverror(ep, MDE_HAS_MDDB,
4119 				    slicenp->dev, slicenp->cname));
4120 		}
4121 		/*
4122 		 * This should be all of the testing necessary when
4123 		 * the MDCMD_USE_WHOLE_DISK flag is set; the rest of
4124 		 * meta_check_sp() is oriented towards component
4125 		 * arguments instead of disks.
4126 		 */
4127 		goto meta_check_sp_ok;
4128 
4129 	}
4130 
4131 	/* check to ensure that it is not already in use */
4132 	if (meta_check_inuse(sp, compnp, MDCHK_INUSE, ep) != 0) {
4133 		return (-1);
4134 	}
4135 
4136 	if (!metaismeta(compnp)) {	/* handle non-metadevices */
4137 
4138 		/*
4139 		 * The component can have one or more soft partitions on it
4140 		 * already, but can't be part of any other type of metadevice,
4141 		 * so if it is used for a metadevice, but the metadevice
4142 		 * isn't a soft partition, return failure.
4143 		 */
4144 
4145 		if (meta_check_inmeta(sp, compnp, options, 0, -1, ep) != 0 &&
4146 		    meta_check_insp(sp, compnp, 0, -1, ep) == 0) {
4147 			return (-1);
4148 		}
4149 	} else {			/* handle metadevices */
4150 		/* get underlying unit & check capabilities */
4151 		if ((mdp = meta_get_unit(sp, compnp, ep)) == NULL)
4152 			return (-1);
4153 
4154 		if ((! (mdp->capabilities & MD_CAN_PARENT)) ||
4155 		    (! (mdp->capabilities & MD_CAN_SP)))
4156 			return (mdmderror(ep, MDE_INVAL_UNIT,
4157 			    meta_getminor(compnp->dev), compnp->cname));
4158 	}
4159 
4160 meta_check_sp_ok:
4161 	mdclrerror(ep);
4162 	return (0);
4163 }
4164 
4165 /*
4166  * FUNCTION:	meta_create_sp()
4167  * INPUT:	sp	- the set name to create in
4168  *		msp	- the unit structure to create
4169  *		oblist	- an optional list of requested extents (-o/-b options)
4170  *		options	- creation options
4171  *		alignment - data alignment
4172  * OUTPUT:	ep	- return error pointer
4173  * RETURNS:	int	-  0 success, -1 error
4174  * PURPOSE:	does most of the work for creating a soft partition.  If
4175  *		metainit -p -e was used, first partition the drive.  Then
4176  *		create an extent list based on the existing soft partitions
4177  *		and assume all space not used by them is free.  Storage for
4178  *		the new soft partition is allocated from the free extents
4179  *		based on the length specified on the command line or the
4180  *		oblist passed in.  The unit structure is then committed and
4181  *		the watermarks are updated.  Finally, the status is changed to
4182  *		Okay and the process is complete.
4183  */
4184 static int
4185 meta_create_sp(
4186 	mdsetname_t	*sp,
4187 	md_sp_t		*msp,
4188 	sp_ext_node_t	*oblist,
4189 	mdcmdopts_t	options,
4190 	sp_ext_length_t	alignment,
4191 	md_error_t	*ep
4192 )
4193 {
4194 	mdname_t	*np = msp->common.namep;
4195 	mdname_t	*compnp = msp->compnamep;
4196 	mp_unit_t	*mp = NULL;
4197 	mdnamelist_t	*keynlp = NULL, *spnlp = NULL;
4198 	md_set_params_t	set_params;
4199 	int		rval = -1;
4200 	diskaddr_t	comp_size;
4201 	diskaddr_t	sp_start;
4202 	sp_ext_node_t	*extlist = NULL;
4203 	int		numexts = 0;	/* number of extents */
4204 	int		count = 0;
4205 	int		committed = 0;
4206 	int		repart_options = MD_REPART_FORCE;
4207 	int		create_flag = MD_CRO_32BIT;
4208 
4209 	md_set_desc	*sd;
4210 	mm_unit_t	*mm;
4211 	md_set_mmown_params_t	*ownpar = NULL;
4212 	int		comp_is_mirror = 0;
4213 
4214 	/* validate soft partition */
4215 	if (meta_check_sp(sp, msp, options, &repart_options, ep) != 0)
4216 		return (-1);
4217 
4218 	if ((options & MDCMD_USE_WHOLE_DISK) != 0) {
4219 		if ((options & MDCMD_DOIT) != 0) {
4220 			if (meta_repartition_drive(sp,
4221 			    compnp->drivenamep,
4222 			    repart_options,
4223 			    NULL, /* Don't return the VTOC */
4224 			    ep) != 0)
4225 
4226 				return (-1);
4227 		} else {
4228 			/*
4229 			 * If -n and -e are both specified, it doesn't make
4230 			 * sense to continue without actually partitioning
4231 			 * the drive.
4232 			 */
4233 			return (0);
4234 		}
4235 	}
4236 
4237 	/* populate the start_blk field of the component name */
4238 	if ((sp_start = meta_sp_get_start(sp, compnp, ep)) ==
4239 	    MD_DISKADDR_ERROR) {
4240 		rval = -1;
4241 		goto out;
4242 	}
4243 
4244 	if (options & MDCMD_DOIT) {
4245 		/* store name in namespace */
4246 		if (add_key_name(sp, compnp, &keynlp, ep) != 0) {
4247 			rval = -1;
4248 			goto out;
4249 		}
4250 	}
4251 
4252 	/*
4253 	 * Get a list of the soft partitions that currently reside on
4254 	 * the component.  We should ALWAYS force reload the cache,
4255 	 * because if this is a single creation, there will not BE a
4256 	 * cached list, and if we're using the md.tab, we must rebuild
4257 	 * the list because it won't contain the previous (if any)
4258 	 * soft partition.
4259 	 */
4260 	count = meta_sp_get_by_component(sp, compnp, &spnlp, 1, ep);
4261 	if (count < 0) {
4262 		/* error occured */
4263 		rval = -1;
4264 		goto out;
4265 	}
4266 
4267 	/*
4268 	 * get the size of the underlying device.  if the size is smaller
4269 	 * than or equal to the watermark size, we know there isn't
4270 	 * enough space.
4271 	 */
4272 	if ((comp_size = metagetsize(compnp, ep)) == MD_DISKADDR_ERROR) {
4273 		rval = -1;
4274 		goto out;
4275 	} else if (comp_size <= MD_SP_WMSIZE) {
4276 		(void) mdmderror(ep, MDE_SP_NOSPACE, 0, compnp->cname);
4277 		rval = -1;
4278 		goto out;
4279 	}
4280 	/*
4281 	 * seed extlist with reserved space at the beginning of the volume and
4282 	 * enough space for the end watermark.  The end watermark always gets
4283 	 * updated, but if the underlying device changes size it may not be
4284 	 * pointed to until the extent before it is updated.  Since the
4285 	 * end of the reserved space is where the first watermark starts,
4286 	 * the reserved extent should never be marked for updating.
4287 	 */
4288 
4289 	meta_sp_list_insert(NULL, NULL, &extlist,
4290 	    0ULL, sp_start, EXTTYP_RESERVED, 0, 0, meta_sp_cmp_by_offset);
4291 	meta_sp_list_insert(NULL, NULL, &extlist,
4292 	    (sp_ext_offset_t)(comp_size - MD_SP_WMSIZE), MD_SP_WMSIZE,
4293 	    EXTTYP_END, 0, EXTFLG_UPDATE, meta_sp_cmp_by_offset);
4294 
4295 	if (meta_sp_extlist_from_namelist(sp, spnlp, &extlist, ep) == -1) {
4296 		rval = -1;
4297 		goto out;
4298 	}
4299 
4300 	metafreenamelist(spnlp);
4301 
4302 	if (getenv(META_SP_DEBUG)) {
4303 		meta_sp_debug("meta_create_sp: list of used extents:\n");
4304 		meta_sp_list_dump(extlist);
4305 	}
4306 
4307 	meta_sp_list_freefill(&extlist, metagetsize(compnp, ep));
4308 
4309 	/* get extent list from -o/-b options or from free space */
4310 	if (options & MDCMD_DIRECT) {
4311 		if (getenv(META_SP_DEBUG)) {
4312 			meta_sp_debug("meta_create_sp: Dumping -o/-b list:\n");
4313 			meta_sp_list_dump(oblist);
4314 		}
4315 
4316 		numexts = meta_sp_alloc_by_list(sp, np, &extlist, oblist);
4317 		if (numexts == -1) {
4318 			(void) mdmderror(ep, MDE_SP_OVERLAP, 0, np->cname);
4319 			rval = -1;
4320 			goto out;
4321 		}
4322 	} else {
4323 		numexts = meta_sp_alloc_by_len(sp, np, &extlist,
4324 		    &msp->ext.ext_val->len, 0LL, (alignment > 0) ? alignment :
4325 		    meta_sp_get_default_alignment(sp, compnp, ep));
4326 		if (numexts == -1) {
4327 			(void) mdmderror(ep, MDE_SP_NOSPACE, 0, np->cname);
4328 			rval = -1;
4329 			goto out;
4330 		}
4331 	}
4332 
4333 	assert(extlist != NULL);
4334 
4335 	/* create soft partition */
4336 	mp = meta_sp_createunit(msp->common.namep, msp->compnamep,
4337 	    extlist, numexts, msp->ext.ext_val->len, MD_SP_CREATEPEND, ep);
4338 
4339 	create_flag = meta_check_devicesize(mp->c.un_total_blocks);
4340 
4341 	/* if we're not doing anything (metainit -n), return success */
4342 	if (! (options & MDCMD_DOIT)) {
4343 		rval = 0;	/* success */
4344 		goto out;
4345 	}
4346 
4347 	(void) memset(&set_params, 0, sizeof (set_params));
4348 
4349 	if (create_flag == MD_CRO_64BIT) {
4350 		mp->c.un_revision = MD_64BIT_META_DEV;
4351 		set_params.options = MD_CRO_64BIT;
4352 	} else {
4353 		mp->c.un_revision = MD_32BIT_META_DEV;
4354 		set_params.options = MD_CRO_32BIT;
4355 	}
4356 
4357 	if (getenv(META_SP_DEBUG)) {
4358 		meta_sp_debug("meta_create_sp: printing unit structure\n");
4359 		meta_sp_printunit(mp);
4360 	}
4361 
4362 	/*
4363 	 * Check to see if we're trying to create a partition on a mirror. If so
4364 	 * we may have to enforce an ownership change before writing the
4365 	 * watermark out.
4366 	 */
4367 	if (metaismeta(compnp)) {
4368 		char *miscname;
4369 
4370 		miscname = metagetmiscname(compnp, ep);
4371 		if (miscname != NULL)
4372 			comp_is_mirror = (strcmp(miscname, MD_MIRROR) == 0);
4373 		else
4374 			comp_is_mirror = 0;
4375 	} else {
4376 		comp_is_mirror = 0;
4377 	}
4378 
4379 	/*
4380 	 * For a multi-node environment we have to ensure that the master
4381 	 * node owns an underlying mirror before we issue the MD_IOCSET ioctl.
4382 	 * If the master does not own the device we will deadlock as the
4383 	 * implicit write of the watermarks (in sp_ioctl.c) will cause an
4384 	 * ownership change that will block as the MD_IOCSET is still in
4385 	 * progress. To close this window we force an owner change to occur
4386 	 * before issuing the MD_IOCSET. We cannot simply open the device and
4387 	 * write to it as this will only work for the first soft-partition
4388 	 * creation.
4389 	 */
4390 
4391 	if (comp_is_mirror && !metaislocalset(sp)) {
4392 
4393 		if ((sd = metaget_setdesc(sp, ep)) == NULL) {
4394 			rval = -1;
4395 			goto out;
4396 		}
4397 		if (MD_MNSET_DESC(sd) && sd->sd_mn_am_i_master) {
4398 			mm = (mm_unit_t *)meta_get_unit(sp, compnp, ep);
4399 			if (mm == NULL) {
4400 				rval = -1;
4401 				goto out;
4402 			} else {
4403 				rval = meta_mn_change_owner(&ownpar, sp->setno,
4404 					meta_getminor(compnp->dev),
4405 					sd->sd_mn_mynode->nd_nodeid,
4406 					MD_MN_MM_PREVENT_CHANGE |
4407 					    MD_MN_MM_SPAWN_THREAD);
4408 				if (rval == -1)
4409 					goto out;
4410 			}
4411 		}
4412 	}
4413 
4414 	set_params.mnum = MD_SID(mp);
4415 	set_params.size = mp->c.un_size;
4416 	set_params.mdp = (uintptr_t)mp;
4417 	MD_SETDRIVERNAME(&set_params, MD_SP, MD_MIN2SET(set_params.mnum));
4418 
4419 	/* first phase of commit. */
4420 	if (metaioctl(MD_IOCSET, &set_params, &set_params.mde,
4421 	    np->cname) != 0) {
4422 		(void) mdstealerror(ep, &set_params.mde);
4423 		rval = -1;
4424 		goto out;
4425 	}
4426 
4427 	/* we've successfully committed the record */
4428 	committed = 1;
4429 
4430 	/* write watermarks */
4431 	if (meta_sp_update_wm(sp, msp, extlist, ep) < 0) {
4432 		rval = -1;
4433 		goto out;
4434 	}
4435 
4436 	/*
4437 	 * Allow mirror ownership to change. If we don't succeed in this
4438 	 * ioctl it isn't fatal, but the cluster will probably hang fairly
4439 	 * soon as the mirror owner won't change. However, we have
4440 	 * successfully written the watermarks out to the device so the
4441 	 * softpart creation has succeeded
4442 	 */
4443 	if (ownpar) {
4444 		(void) meta_mn_change_owner(&ownpar, sp->setno, ownpar->d.mnum,
4445 		    ownpar->d.owner,
4446 		    MD_MN_MM_ALLOW_CHANGE | MD_MN_MM_SPAWN_THREAD);
4447 	}
4448 
4449 	/* second phase of commit, set status to MD_SP_OK */
4450 	if (meta_sp_setstatus(sp, &(MD_SID(mp)), 1, MD_SP_OK, ep) < 0) {
4451 		rval = -1;
4452 		goto out;
4453 	}
4454 	rval = 0;
4455 out:
4456 	Free(mp);
4457 	if (ownpar)
4458 		Free(ownpar);
4459 
4460 	if (extlist != NULL)
4461 		meta_sp_list_free(&extlist);
4462 
4463 	if (rval != 0 && keynlp != NULL && committed != 1)
4464 		(void) del_key_names(sp, keynlp, NULL);
4465 
4466 	metafreenamelist(keynlp);
4467 
4468 	return (rval);
4469 }
4470 
4471 /*
4472  * **************************************************************************
4473  *                      Reset (metaclear) Functions                         *
4474  * **************************************************************************
4475  */
4476 
4477 /*
4478  * FUNCTION:	meta_sp_reset_common()
4479  * INPUT:	sp	- the set name of the device to reset
4480  *		np	- the name of the device to reset
4481  *		msp	- the unit structure to reset
4482  *		options	- metaclear options
4483  * OUTPUT:	ep	- return error pointer
4484  * RETURNS:	int	-  0 success, -1 error
4485  * PURPOSE:	"resets", or more accurately deletes, the soft partition
4486  *		specified.  First the state is set to "deleting" and then the
4487  *		watermarks are all cleared out.  Once the watermarks have been
4488  *		updated, the unit structure is deleted from the metadb.
4489  */
4490 static int
4491 meta_sp_reset_common(
4492 	mdsetname_t	*sp,
4493 	mdname_t	*np,
4494 	md_sp_t		*msp,
4495 	md_sp_reset_t	reset_params,
4496 	mdcmdopts_t	options,
4497 	md_error_t	*ep
4498 )
4499 {
4500 	char	*miscname;
4501 	int	rval = -1;
4502 	int	is_open = 0;
4503 
4504 	/* make sure that nobody owns us */
4505 	if (MD_HAS_PARENT(msp->common.parent))
4506 		return (mdmderror(ep, MDE_IN_USE, meta_getminor(np->dev),
4507 					np->cname));
4508 
4509 	/* make sure that the soft partition isn't open */
4510 	if ((is_open = meta_isopen(sp, np, ep, options)) < 0)
4511 		return (-1);
4512 	else if (is_open)
4513 		return (mdmderror(ep, MDE_IS_OPEN, meta_getminor(np->dev),
4514 					np->cname));
4515 
4516 	/* get miscname */
4517 	if ((miscname = metagetmiscname(np, ep)) == NULL)
4518 		return (-1);
4519 
4520 	/* fill in reset params */
4521 	MD_SETDRIVERNAME(&reset_params, miscname, sp->setno);
4522 	reset_params.mnum = meta_getminor(np->dev);
4523 	reset_params.force = (options & MDCMD_FORCE) ? 1 : 0;
4524 
4525 	/*
4526 	 * clear soft partition - phase one.
4527 	 * place the soft partition into the "delete pending" state.
4528 	 */
4529 	if (meta_sp_setstatus(sp, &reset_params.mnum, 1, MD_SP_DELPEND, ep) < 0)
4530 		return (-1);
4531 
4532 	/*
4533 	 * Now clear the watermarks.  If the force flag is specified,
4534 	 * ignore any errors writing the watermarks and delete the unit
4535 	 * structure anyway.  An error may leave the on-disk format in a
4536 	 * corrupt state.  If force is not specified and we fail here,
4537 	 * the soft partition will remain in the "delete pending" state.
4538 	 */
4539 	if ((meta_sp_clear_wm(sp, msp, ep) < 0) &&
4540 	    ((options & MDCMD_FORCE) == 0))
4541 		goto out;
4542 
4543 	/*
4544 	 * clear soft partition - phase two.
4545 	 * the driver removes the soft partition from the metadb and
4546 	 * zeros out incore version.
4547 	 */
4548 	if (metaioctl(MD_IOCRESET, &reset_params,
4549 	    &reset_params.mde, np->cname) != 0) {
4550 		(void) mdstealerror(ep, &reset_params.mde);
4551 		goto out;
4552 	}
4553 	rval = 0;	/* success */
4554 
4555 	if (options & MDCMD_PRINT) {
4556 		(void) printf(dgettext(TEXT_DOMAIN,
4557 		    "%s: Soft Partition is cleared\n"),
4558 		    np->cname);
4559 		(void) fflush(stdout);
4560 	}
4561 
4562 	/*
4563 	 * if told to recurse and on a metadevice, then attempt to
4564 	 * clear the subdevices.  Indicate failure if the clear fails.
4565 	 */
4566 	if ((options & MDCMD_RECURSE) &&
4567 	    (metaismeta(msp->compnamep)) &&
4568 	    (meta_reset_by_name(sp, msp->compnamep, options, ep) != 0))
4569 		rval = -1;
4570 
4571 out:
4572 	meta_invalidate_name(np);
4573 	return (rval);
4574 }
4575 
4576 /*
4577  * FUNCTION:	meta_sp_reset()
4578  * INPUT:	sp	- the set name of the device to reset
4579  *		np	- the name of the device to reset
4580  *		options	- metaclear options
4581  * OUTPUT:	ep	- return error pointer
4582  * RETURNS:	int	-  0 success, -1 error
4583  * PURPOSE:	provides the entry point to the rest of libmeta for deleting a
4584  *		soft partition.  If np is NULL, then soft partitions are
4585  *		all deleted at the current level and then recursively deleted.
4586  *		Otherwise, if a name is specified either directly or as a
4587  *		result of a recursive operation, it deletes only that name.
4588  *		Since something sitting under a soft partition may be parented
4589  *		to it, we have to reparent that other device to another soft
4590  *		partition on the same component if we're deleting the one it's
4591  *		parented to.
4592  */
4593 int
4594 meta_sp_reset(
4595 	mdsetname_t	*sp,
4596 	mdname_t	*np,
4597 	mdcmdopts_t	options,
4598 	md_error_t	*ep
4599 )
4600 {
4601 	md_sp_t		*msp;
4602 	int		rval = -1;
4603 	mdnamelist_t	*spnlp = NULL, *nlp = NULL;
4604 	md_sp_reset_t	reset_params;
4605 	int		num_sp;
4606 
4607 	assert(sp != NULL);
4608 
4609 	/* reset/delete all soft paritions */
4610 	if (np == NULL) {
4611 		/*
4612 		 * meta_reset_all sets MDCMD_RECURSE, but this behavior
4613 		 * is incorrect for soft partitions.  We want to clear
4614 		 * all soft partitions at a particular level in the
4615 		 * metadevice stack before moving to the next level.
4616 		 * Thus, we clear MDCMD_RECURSE from the options.
4617 		 */
4618 		options &= ~MDCMD_RECURSE;
4619 
4620 		/* for each soft partition */
4621 		rval = 0;
4622 		if (meta_get_sp_names(sp, &spnlp, 0, ep) < 0)
4623 			rval = -1;
4624 
4625 		for (nlp = spnlp; (nlp != NULL); nlp = nlp->next) {
4626 			np = nlp->namep;
4627 			if ((msp = meta_get_sp(sp, np, ep)) == NULL) {
4628 				rval = -1;
4629 				break;
4630 			}
4631 			/*
4632 			 * meta_reset_all calls us twice to get soft
4633 			 * partitions at the top and bottom of the stack.
4634 			 * thus, if we have a parent, we'll get deleted
4635 			 * on the next call.
4636 			 */
4637 			if (MD_HAS_PARENT(msp->common.parent))
4638 				continue;
4639 			/*
4640 			 * If this is a multi-node set, we send a series
4641 			 * of individual metaclear commands.
4642 			 */
4643 			if (meta_is_mn_set(sp, ep)) {
4644 				if (meta_mn_send_metaclear_command(sp,
4645 				    np->cname, options, 0, ep) != 0) {
4646 					rval = -1;
4647 					break;
4648 				}
4649 			} else {
4650 				if (meta_sp_reset(sp, np, options, ep) != 0) {
4651 					rval = -1;
4652 					break;
4653 				}
4654 			}
4655 		}
4656 		/* cleanup return status */
4657 		metafreenamelist(spnlp);
4658 		return (rval);
4659 	}
4660 
4661 	/* check the name */
4662 	if (metachkmeta(np, ep) != 0)
4663 		return (-1);
4664 
4665 	/* get the unit structure */
4666 	if ((msp = meta_get_sp(sp, np, ep)) == NULL)
4667 		return (-1);
4668 
4669 	/* clear out reset parameters */
4670 	(void) memset(&reset_params, 0, sizeof (reset_params));
4671 
4672 	/* if our child is a metadevice, we need to deparent/reparent it */
4673 	if (metaismeta(msp->compnamep)) {
4674 		/* get sp's on this component */
4675 		if ((num_sp = meta_sp_get_by_component(sp, msp->compnamep,
4676 		    &spnlp, 1, ep)) <= 0)
4677 			/* no sp's on this device.  error! */
4678 			return (-1);
4679 		else if (num_sp == 1)
4680 			/* last sp on this device, so we deparent */
4681 			reset_params.new_parent = MD_NO_PARENT;
4682 		else {
4683 			/* have to reparent this metadevice */
4684 			for (nlp = spnlp; nlp != NULL; nlp = nlp->next) {
4685 				if (meta_getminor(nlp->namep->dev) ==
4686 					meta_getminor(np->dev))
4687 					continue;
4688 				/*
4689 				 * this isn't the softpart we are deleting,
4690 				 * so use this device as the new parent.
4691 				 */
4692 				reset_params.new_parent =
4693 				    meta_getminor(nlp->namep->dev);
4694 				break;
4695 			}
4696 		}
4697 		metafreenamelist(spnlp);
4698 	}
4699 
4700 	if (meta_sp_reset_common(sp, np, msp, reset_params, options, ep) != 0)
4701 		return (-1);
4702 
4703 	return (0);
4704 }
4705 
4706 /*
4707  * FUNCTION:	meta_sp_reset_component()
4708  * INPUT:	sp	- the set name of the device to reset
4709  *		name	- the string name of the device to reset
4710  *		options	- metaclear options
4711  * OUTPUT:	ep	- return error pointer
4712  * RETURNS:	int	-  0 success, -1 error
4713  * PURPOSE:	provides the ability to delete all soft partitions on a
4714  *		specified device (metaclear -p).  It first gets all of the
4715  *		soft partitions on the component and then deletes each one
4716  *		individually.
4717  */
4718 int
4719 meta_sp_reset_component(
4720 	mdsetname_t	*sp,
4721 	char		*name,
4722 	mdcmdopts_t	options,
4723 	md_error_t	*ep
4724 )
4725 {
4726 	mdname_t	*compnp, *np;
4727 	mdnamelist_t	*spnlp = NULL;
4728 	mdnamelist_t	*nlp = NULL;
4729 	md_sp_t		*msp;
4730 	int		count;
4731 	md_sp_reset_t	reset_params;
4732 
4733 	if ((compnp = metaname(&sp, name, ep)) == NULL)
4734 		return (-1);
4735 
4736 	/* If we're starting out with no soft partitions, it's an error */
4737 	count = meta_sp_get_by_component(sp, compnp, &spnlp, 1, ep);
4738 	if (count == 0)
4739 		return (mdmderror(ep, MDE_SP_NOSP, 0, compnp->cname));
4740 	else if (count < 0)
4741 		return (-1);
4742 
4743 	/*
4744 	 * clear all soft partitions on this component.
4745 	 * NOTE: we reparent underlying metadevices as we go so that
4746 	 * things stay sane.  Also, if we encounter an error, we stop
4747 	 * and go no further in case recovery might be needed.
4748 	 */
4749 	for (nlp = spnlp; nlp != NULL; nlp = nlp->next) {
4750 		/* clear out reset parameters */
4751 		(void) memset(&reset_params, 0, sizeof (reset_params));
4752 
4753 		/* check the name */
4754 		np = nlp->namep;
4755 
4756 		if (metachkmeta(np, ep) != 0) {
4757 			metafreenamelist(spnlp);
4758 			return (-1);
4759 		}
4760 
4761 		/* get the unit structure */
4762 		if ((msp = meta_get_sp(sp, np, ep)) == NULL) {
4763 			metafreenamelist(spnlp);
4764 			return (-1);
4765 		}
4766 
4767 		/* have to deparent/reparent metadevices */
4768 		if (metaismeta(compnp)) {
4769 			if (nlp->next == NULL)
4770 				reset_params.new_parent = MD_NO_PARENT;
4771 			else
4772 				reset_params.new_parent =
4773 				    meta_getminor(spnlp->next->namep->dev);
4774 		}
4775 
4776 		/* clear soft partition */
4777 		if (meta_sp_reset_common(sp, np, msp, reset_params,
4778 		    options, ep) < 0) {
4779 			metafreenamelist(spnlp);
4780 			return (-1);
4781 		}
4782 	}
4783 	metafreenamelist(spnlp);
4784 	return (0);
4785 }
4786 
4787 /*
4788  * **************************************************************************
4789  *                      Grow (metattach) Functions                          *
4790  * **************************************************************************
4791  */
4792 
4793 /*
4794  * FUNCTION:	meta_sp_attach()
4795  * INPUT:	sp	- the set name of the device to attach to
4796  *		np	- the name of the device to attach to
4797  *		addsize	- the unparsed string holding the amount of space to add
4798  *		options	- metattach options
4799  *		alignment - data alignment
4800  * OUTPUT:	ep	- return error pointer
4801  * RETURNS:	int	-  0 success, -1 error
4802  * PURPOSE:	grows a soft partition by reading in the existing unit
4803  *		structure and setting its state to Growing, allocating more
4804  *		space (similar to meta_create_sp()), updating the watermarks,
4805  *		and then writing out the new unit structure in the Okay state.
4806  */
4807 int
4808 meta_sp_attach(
4809 	mdsetname_t	*sp,
4810 	mdname_t	*np,
4811 	char		*addsize,
4812 	mdcmdopts_t	options,
4813 	sp_ext_length_t	alignment,
4814 	md_error_t	*ep
4815 )
4816 {
4817 	md_grow_params_t	grow_params;
4818 	sp_ext_length_t		grow_len;	/* amount to grow */
4819 	mp_unit_t		*mp, *new_un;
4820 	mdname_t		*compnp = NULL;
4821 
4822 	sp_ext_node_t		*extlist = NULL;
4823 	int			numexts;
4824 	mdnamelist_t		*spnlp = NULL;
4825 	int			count;
4826 	md_sp_t			*msp;
4827 	daddr_t			start_block;
4828 
4829 	/* should have the same set */
4830 	assert(sp != NULL);
4831 	assert(sp->setno == MD_MIN2SET(meta_getminor(np->dev)));
4832 
4833 	/* check name */
4834 	if (metachkmeta(np, ep) != 0)
4835 		return (-1);
4836 
4837 	if (meta_sp_parsesize(addsize, &grow_len) == -1) {
4838 		return (mdmderror(ep, MDE_SP_BAD_LENGTH, 0, np->cname));
4839 	}
4840 
4841 	if ((mp = (mp_unit_t *)meta_get_mdunit(sp, np, ep)) == NULL)
4842 		return (-1);
4843 
4844 	/* make sure we don't have a parent */
4845 	if (MD_HAS_PARENT(mp->c.un_parent)) {
4846 		Free(mp);
4847 		return (mdmderror(ep, MDE_INVAL_UNIT, 0, np->cname));
4848 	}
4849 
4850 	if (getenv(META_SP_DEBUG)) {
4851 		meta_sp_debug("meta_sp_attach: Unit structure before new "
4852 		    "space:\n");
4853 		meta_sp_printunit(mp);
4854 	}
4855 
4856 	/*
4857 	 * NOTE: the fast option to metakeyname is 0 as opposed to 1
4858 	 * If this was not the case we would suffer the following
4859 	 * assertion failure:
4860 	 * Assertion failed: type1 != MDT_FAST_META && type1 != MDT_FAST_COMP
4861 	 * file meta_check.x, line 315
4862 	 * I guess this is because we have not "seen" this drive before
4863 	 * and hence hit the failure - this is of course the attach routine
4864 	 */
4865 	if ((compnp = metakeyname(&sp, mp->un_key, 0, ep)) == NULL) {
4866 		Free(mp);
4867 		return (-1);
4868 	}
4869 
4870 	/* metakeyname does not fill in the key. */
4871 	compnp->key = mp->un_key;
4872 
4873 	/* work out the space on the component that we are dealing with */
4874 	count = meta_sp_get_by_component(sp, compnp, &spnlp, 0, ep);
4875 
4876 	/*
4877 	 * see if the component has been soft partitioned yet, or if an
4878 	 * error occurred.
4879 	 */
4880 	if (count == 0) {
4881 		Free(mp);
4882 		return (mdmderror(ep, MDE_NOT_SP, 0, np->cname));
4883 	} else if (count < 0) {
4884 		Free(mp);
4885 		return (-1);
4886 	}
4887 
4888 	/*
4889 	 * seed extlist with reserved space at the beginning of the volume and
4890 	 * enough space for the end watermark.  The end watermark always gets
4891 	 * updated, but if the underlying device changes size it may not be
4892 	 * pointed to until the extent before it is updated.  Since the
4893 	 * end of the reserved space is where the first watermark starts,
4894 	 * the reserved extent should never be marked for updating.
4895 	 */
4896 	if ((start_block = meta_sp_get_start(sp, compnp, ep)) ==
4897 	    MD_DISKADDR_ERROR) {
4898 		Free(mp);
4899 		return (-1);
4900 	}
4901 
4902 	meta_sp_list_insert(NULL, NULL, &extlist, 0ULL, start_block,
4903 	    EXTTYP_RESERVED, 0, 0, meta_sp_cmp_by_offset);
4904 	meta_sp_list_insert(NULL, NULL, &extlist,
4905 	    metagetsize(compnp, ep) - MD_SP_WMSIZE, MD_SP_WMSIZE,
4906 	    EXTTYP_END, 0, EXTFLG_UPDATE, meta_sp_cmp_by_offset);
4907 
4908 	if (meta_sp_extlist_from_namelist(sp, spnlp, &extlist, ep) == -1) {
4909 		Free(mp);
4910 		return (-1);
4911 	}
4912 
4913 	metafreenamelist(spnlp);
4914 
4915 	if (getenv(META_SP_DEBUG)) {
4916 		meta_sp_debug("meta_sp_attach: list of used extents:\n");
4917 		meta_sp_list_dump(extlist);
4918 	}
4919 
4920 	meta_sp_list_freefill(&extlist, metagetsize(compnp, ep));
4921 
4922 	assert(mp->un_numexts >= 1);
4923 	numexts = meta_sp_alloc_by_len(sp, np, &extlist, &grow_len,
4924 	    mp->un_ext[mp->un_numexts - 1].un_poff,
4925 	    (alignment > 0) ? alignment :
4926 	    meta_sp_get_default_alignment(sp, compnp, ep));
4927 
4928 	if (numexts == -1) {
4929 		Free(mp);
4930 		return (mdmderror(ep, MDE_SP_NOSPACE, 0, np->cname));
4931 	}
4932 
4933 	/* allocate new unit structure and copy in old unit */
4934 	if ((new_un = meta_sp_updateunit(np, mp, extlist,
4935 	    grow_len, numexts, ep)) == NULL) {
4936 		Free(mp);
4937 		return (-1);
4938 	}
4939 	Free(mp);
4940 
4941 	/* If running in dryrun mode (-n option), we're done here */
4942 	if ((options & MDCMD_DOIT) == 0) {
4943 		if (options & MDCMD_PRINT) {
4944 			(void) printf(dgettext(TEXT_DOMAIN,
4945 			    "%s: Soft Partition would grow\n"),
4946 			    np->cname);
4947 			(void) fflush(stdout);
4948 		}
4949 		return (0);
4950 	}
4951 
4952 	if (getenv(META_SP_DEBUG)) {
4953 		meta_sp_debug("meta_sp_attach: updated unit structure:\n");
4954 		meta_sp_printunit(new_un);
4955 	}
4956 
4957 	assert(new_un != NULL);
4958 
4959 	(void) memset(&grow_params, 0, sizeof (grow_params));
4960 	if (new_un->c.un_total_blocks > MD_MAX_BLKS_FOR_SMALL_DEVS) {
4961 		grow_params.options = MD_CRO_64BIT;
4962 		new_un->c.un_revision = MD_64BIT_META_DEV;
4963 	} else {
4964 		grow_params.options = MD_CRO_32BIT;
4965 		new_un->c.un_revision = MD_32BIT_META_DEV;
4966 	}
4967 	grow_params.mnum = MD_SID(new_un);
4968 	grow_params.size = new_un->c.un_size;
4969 	grow_params.mdp = (uintptr_t)new_un;
4970 	MD_SETDRIVERNAME(&grow_params, MD_SP, MD_MIN2SET(grow_params.mnum));
4971 
4972 	if (metaioctl(MD_IOCGROW, &grow_params, &grow_params.mde,
4973 	    np->cname) != 0) {
4974 		(void) mdstealerror(ep, &grow_params.mde);
4975 		return (-1);
4976 	}
4977 
4978 	/* update all watermarks */
4979 
4980 	if ((msp = meta_get_sp(sp, np, ep)) == NULL)
4981 		return (-1);
4982 	if (meta_sp_update_wm(sp, msp, extlist, ep) < 0)
4983 		return (-1);
4984 
4985 
4986 	/* second phase of commit, set status to MD_SP_OK */
4987 	if (meta_sp_setstatus(sp, &(MD_SID(new_un)), 1, MD_SP_OK, ep) < 0)
4988 		return (-1);
4989 
4990 	meta_invalidate_name(np);
4991 
4992 	if (options & MDCMD_PRINT) {
4993 		(void) printf(dgettext(TEXT_DOMAIN,
4994 		    "%s: Soft Partition has been grown\n"),
4995 		    np->cname);
4996 		(void) fflush(stdout);
4997 	}
4998 
4999 	return (0);
5000 }
5001 
5002 /*
5003  * **************************************************************************
5004  *                    Recovery (metarecover) Functions                      *
5005  * **************************************************************************
5006  */
5007 
5008 /*
5009  * FUNCTION:	meta_recover_sp()
5010  * INPUT:	sp	- the name of the set we are recovering on
5011  *		compnp	- name pointer for device we are recovering on
5012  *		argc	- argument count
5013  *		argv	- left over arguments not parsed by metarecover command
5014  *		options	- metarecover options
5015  * OUTPUT:	ep	- return error pointer
5016  * RETURNS:	int	- 0 - success, -1 - error
5017  * PURPOSE:	parse soft partitioning-specific metarecover options and
5018  *		dispatch to the appropriate function to handle recovery.
5019  */
5020 int
5021 meta_recover_sp(
5022 	mdsetname_t	*sp,
5023 	mdname_t	*compnp,
5024 	int		argc,
5025 	char		*argv[],
5026 	mdcmdopts_t	options,
5027 	md_error_t	*ep
5028 )
5029 {
5030 	md_set_desc	*sd;
5031 
5032 	if (argc > 1) {
5033 		(void) meta_cook_syntax(ep, MDE_SYNTAX, compnp->cname,
5034 		    argc, argv);
5035 		return (-1);
5036 	}
5037 
5038 	/*
5039 	 * For a MN set, this operation must be performed on the master
5040 	 * as it is responsible for maintaining the watermarks
5041 	 */
5042 	if (!metaislocalset(sp)) {
5043 		if ((sd = metaget_setdesc(sp, ep)) == NULL)
5044 			return (-1);
5045 		if (MD_MNSET_DESC(sd) && !sd->sd_mn_am_i_master) {
5046 			(void) mddserror(ep, MDE_DS_MASTER_ONLY, sp->setno,
5047 			    sd->sd_mn_master_nodenm, NULL, NULL);
5048 			return (-1);
5049 		}
5050 	}
5051 	if (argc == 0) {
5052 		/*
5053 		 * if no additional arguments are passed, metarecover should
5054 		 * validate both on-disk and metadb structures as well as
5055 		 * checking that both are consistent with each other
5056 		 */
5057 		if (meta_sp_validate_wm(sp, compnp, options, ep) < 0)
5058 			return (-1);
5059 		if (meta_sp_validate_unit(sp, compnp, options, ep) < 0)
5060 			return (-1);
5061 		if (meta_sp_validate_wm_and_unit(sp, compnp, options, ep) < 0)
5062 			return (-1);
5063 	} else if (strcmp(argv[0], "-d") == 0) {
5064 		/*
5065 		 * Ensure that there is no existing valid record for this
5066 		 * soft-partition. If there is we have nothing to do.
5067 		 */
5068 		if (meta_sp_validate_unit(sp, compnp, options, ep) == 0)
5069 			return (-1);
5070 		/* validate and recover from on-disk structures */
5071 		if (meta_sp_validate_wm(sp, compnp, options, ep) < 0)
5072 			return (-1);
5073 		if (meta_sp_recover_from_wm(sp, compnp, options, ep) < 0)
5074 			return (-1);
5075 	} else if (strcmp(argv[0], "-m") == 0) {
5076 		/* validate and recover from metadb structures */
5077 		if (meta_sp_validate_unit(sp, compnp, options, ep) < 0)
5078 			return (-1);
5079 		if (meta_sp_recover_from_unit(sp, compnp, options, ep) < 0)
5080 			return (-1);
5081 	} else {
5082 		/* syntax error */
5083 		(void) meta_cook_syntax(ep, MDE_SYNTAX, compnp->cname,
5084 		    argc, argv);
5085 		return (-1);
5086 	}
5087 
5088 	return (0);
5089 }
5090 
5091 /*
5092  * FUNCTION:	meta_sp_display_exthdr()
5093  * INPUT:	none
5094  * OUTPUT:	none
5095  * RETURNS:	void
5096  * PURPOSE:	print header line for sp_ext_node_t information.  to be used
5097  *		in conjunction with meta_sp_display_ext().
5098  */
5099 static void
5100 meta_sp_display_exthdr(void)
5101 {
5102 	(void) printf("%20s %5s %7s %20s %20s\n",
5103 	    dgettext(TEXT_DOMAIN, "Name"),
5104 	    dgettext(TEXT_DOMAIN, "Seq#"),
5105 	    dgettext(TEXT_DOMAIN, "Type"),
5106 	    dgettext(TEXT_DOMAIN, "Offset"),
5107 	    dgettext(TEXT_DOMAIN, "Length"));
5108 }
5109 
5110 
5111 /*
5112  * FUNCTION:	meta_sp_display_ext()
5113  * INPUT:	ext	- extent to display
5114  * OUTPUT:	none
5115  * RETURNS:	void
5116  * PURPOSE:	print selected fields from sp_ext_node_t.
5117  */
5118 static void
5119 meta_sp_display_ext(sp_ext_node_t *ext)
5120 {
5121 	/* print extent information */
5122 	if (ext->ext_namep != NULL)
5123 		(void) printf("%20s ", ext->ext_namep->cname);
5124 	else
5125 		(void) printf("%20s ", "NONE");
5126 
5127 	(void) printf("%5u ", ext->ext_seq);
5128 
5129 	switch (ext->ext_type) {
5130 	case EXTTYP_ALLOC:
5131 		(void) printf("%7s ", "ALLOC");
5132 		break;
5133 	case EXTTYP_FREE:
5134 		(void) printf("%7s ", "FREE");
5135 		break;
5136 	case EXTTYP_RESERVED:
5137 		(void) printf("%7s ", "RESV");
5138 		break;
5139 	case EXTTYP_END:
5140 		(void) printf("%7s ", "END");
5141 		break;
5142 	default:
5143 		(void) printf("%7s ", "INVLD");
5144 		break;
5145 	}
5146 
5147 	(void) printf("%20llu %20llu\n", ext->ext_offset, ext->ext_length);
5148 }
5149 
5150 
5151 /*
5152  * FUNCTION:	meta_sp_checkseq()
5153  * INPUT:	extlist	- list of extents to be checked
5154  * OUTPUT:	none
5155  * RETURNS:	int	- 0 - success, -1 - error
5156  * PURPOSE:	check soft partition sequence numbers.  this function assumes
5157  *		that a list of extents representing 1 or more soft partitions
5158  *		is passed in sorted in sequence number order.  within a
5159  *		single soft partition, there may not be any missing or
5160  *		duplicate sequence numbers.
5161  */
5162 static int
5163 meta_sp_checkseq(sp_ext_node_t *extlist)
5164 {
5165 	sp_ext_node_t *ext;
5166 
5167 	assert(extlist != NULL);
5168 
5169 	for (ext = extlist;
5170 	    ext->ext_next != NULL && ext->ext_next->ext_type == EXTTYP_ALLOC;
5171 	    ext = ext->ext_next) {
5172 		if (ext->ext_next->ext_namep != NULL &&
5173 		    strcmp(ext->ext_next->ext_namep->cname,
5174 			ext->ext_namep->cname) != 0)
5175 				continue;
5176 
5177 		if (ext->ext_next->ext_seq != ext->ext_seq + 1) {
5178 			(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5179 			    "%s: sequence numbers are "
5180 			    "incorrect: %d should be %d\n"),
5181 			    ext->ext_next->ext_namep->cname,
5182 			    ext->ext_next->ext_seq, ext->ext_seq + 1);
5183 			return (-1);
5184 		}
5185 	}
5186 	return (0);
5187 }
5188 
5189 
5190 /*
5191  * FUNCTION:	meta_sp_resolve_name_conflict()
5192  * INPUT:	sp	- name of set we're are recovering in.
5193  *		old_np	- name pointer of soft partition we found on disk.
5194  * OUTPUT:	new_np	- name pointer for new soft partition name.
5195  *		ep	- error pointer returned.
5196  * RETURNS:	int	- 0 - name not replace, 1 - name replaced, -1 - error
5197  * PURPOSE:	Check to see if the name of one of the soft partitions we found
5198  *		on disk already exists in the metadb.  If so, prompt for a new
5199  *		name.  In addition, we keep a static array of names that
5200  *		will be recovered from this device since these names don't
5201  *		exist in the configuration at this point but cannot be
5202  *		recovered more than once.
5203  */
5204 static int
5205 meta_sp_resolve_name_conflict(
5206 	mdsetname_t	*sp,
5207 	mdname_t	*old_np,
5208 	mdname_t	**new_np,
5209 	md_error_t	*ep
5210 )
5211 {
5212 	char		yesno[255];
5213 	char		*yes;
5214 	char		newname[MD_SP_MAX_DEVNAME_PLUS_1];
5215 	int		nunits;
5216 	static int	*used_names = NULL;
5217 
5218 	assert(old_np != NULL);
5219 
5220 	if (used_names == NULL) {
5221 		if ((nunits = meta_get_nunits(ep)) < 0)
5222 			return (-1);
5223 		used_names = Zalloc(nunits * sizeof (int));
5224 	}
5225 
5226 	/* see if it exists already */
5227 	if (used_names[MD_MIN2UNIT(meta_getminor(old_np->dev))] == 0 &&
5228 	    metagetmiscname(old_np, ep) == NULL) {
5229 		if (! mdismderror(ep, MDE_UNIT_NOT_SETUP))
5230 			return (-1);
5231 		else {
5232 			used_names[MD_MIN2UNIT(meta_getminor(old_np->dev))] = 1;
5233 			mdclrerror(ep);
5234 			return (0);
5235 		}
5236 	}
5237 
5238 	/* name exists, ask the user for a new one */
5239 	(void) printf(dgettext(TEXT_DOMAIN,
5240 	    "WARNING: A soft partition named %s was found in the extent\n"
5241 	    "headers, but this name already exists in the metadb "
5242 	    "configuration.\n"
5243 	    "In order to continue recovery you must supply\n"
5244 	    "a new name for this soft partition.\n"), old_np->cname);
5245 	(void) printf(dgettext(TEXT_DOMAIN,
5246 	    "Would you like to continue and supply a new name? (yes/no) "));
5247 
5248 	(void) fflush(stdout);
5249 	if ((fgets(yesno, sizeof (yesno), stdin) == NULL) ||
5250 	    (strlen(yesno) == 1))
5251 		(void) snprintf(yesno, sizeof (yesno), "%s\n",
5252 		    dgettext(TEXT_DOMAIN, "no"));
5253 	yes = dgettext(TEXT_DOMAIN, "yes");
5254 	if (strncasecmp(yesno, yes, strlen(yesno) - 1) != 0) {
5255 		return (-1);
5256 	}
5257 
5258 	(void) fflush(stdin);
5259 
5260 	/* get the new name */
5261 	for (;;) {
5262 		(void) printf(dgettext(TEXT_DOMAIN, "Please enter a new name "
5263 		    "for this soft partition (dXXXX) "));
5264 		(void) fflush(stdout);
5265 		if (fgets(newname, MD_SP_MAX_DEVNAME_PLUS_1, stdin) == NULL)
5266 			(void) strcpy(newname, "");
5267 
5268 		/* remove newline character */
5269 		if (newname[strlen(newname) - 1] == '\n')
5270 			newname[strlen(newname) - 1] = '\0';
5271 
5272 		if (!(is_metaname(newname)) ||
5273 		    (meta_init_make_device(&sp, newname, ep) != 0)) {
5274 			(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5275 			    "Invalid metadevice name\n"));
5276 			(void) fflush(stderr);
5277 			continue;
5278 		}
5279 
5280 		if ((*new_np = metaname(&sp, newname, ep)) == NULL) {
5281 			(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5282 			    "Invalid metadevice name\n"));
5283 			(void) fflush(stderr);
5284 			continue;
5285 		}
5286 
5287 		assert(MD_MIN2UNIT(meta_getminor((*new_np)->dev)) < nunits);
5288 		/* make sure the name isn't already being used */
5289 		if (used_names[MD_MIN2UNIT(meta_getminor((*new_np)->dev))] ||
5290 		    metagetmiscname(*new_np, ep) != NULL) {
5291 			(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5292 			    "That name already exists\n"));
5293 			continue;
5294 		} else if (! mdismderror(ep, MDE_UNIT_NOT_SETUP))
5295 			return (-1);
5296 
5297 		break;
5298 	}
5299 
5300 	/* got a new name, place in used array and return */
5301 	used_names[MD_MIN2UNIT(meta_getminor((*new_np)->dev))] = 1;
5302 	mdclrerror(ep);
5303 	return (1);
5304 }
5305 
5306 /*
5307  * FUNCTION:	meta_sp_validate_wm()
5308  * INPUT:	sp	- set name we are recovering in
5309  *		compnp	- name pointer for device we are recovering from
5310  *		options	- metarecover options
5311  * OUTPUT:	ep	- error pointer returned
5312  * RETURNS:	int	- 0 - success, -1 - error
5313  * PURPOSE:	validate and display watermark configuration.  walk the
5314  *		on-disk watermark structures and validate the information
5315  *		found within.  since a watermark configuration is
5316  *		"self-defining", the act of traversing the watermarks
5317  *		is part of the validation process.
5318  */
5319 static int
5320 meta_sp_validate_wm(
5321 	mdsetname_t	*sp,
5322 	mdname_t	*compnp,
5323 	mdcmdopts_t	options,
5324 	md_error_t	*ep
5325 )
5326 {
5327 	sp_ext_node_t	*extlist = NULL;
5328 	sp_ext_node_t	*ext;
5329 	int		num_sps = 0;
5330 	int		rval;
5331 
5332 	if ((options & MDCMD_VERBOSE) != 0)
5333 		(void) printf(dgettext(TEXT_DOMAIN,
5334 		    "Verifying on-disk structures on %s.\n"),
5335 		    compnp->cname);
5336 
5337 	/*
5338 	 * for each watermark, build an ext_node, place on list.
5339 	 */
5340 	rval = meta_sp_extlist_from_wm(sp, compnp, &extlist,
5341 	    meta_sp_cmp_by_nameseq, ep);
5342 
5343 	if ((options & MDCMD_VERBOSE) != 0) {
5344 		/* print out what we found */
5345 		if (extlist == NULL)
5346 			(void) printf(dgettext(TEXT_DOMAIN,
5347 			    "No extent headers found on %s.\n"),
5348 			    compnp->cname);
5349 		else {
5350 			(void) printf(dgettext(TEXT_DOMAIN,
5351 			    "The following extent headers were found on %s.\n"),
5352 			    compnp->cname);
5353 			meta_sp_display_exthdr();
5354 		}
5355 		for (ext = extlist; ext != NULL; ext = ext->ext_next)
5356 			meta_sp_display_ext(ext);
5357 	}
5358 
5359 	if (rval < 0) {
5360 		(void) printf(dgettext(TEXT_DOMAIN,
5361 		    "%s: On-disk structures invalid or "
5362 		    "no soft partitions found.\n"),
5363 		    compnp->cname);
5364 		return (-1);
5365 	}
5366 
5367 	assert(extlist != NULL);
5368 
5369 	/* count number of soft partitions */
5370 	for (ext = extlist;
5371 	    ext != NULL && ext->ext_type == EXTTYP_ALLOC;
5372 	    ext = ext->ext_next) {
5373 		if (ext->ext_next != NULL &&
5374 		    ext->ext_next->ext_namep != NULL &&
5375 		    strcmp(ext->ext_next->ext_namep->cname,
5376 			ext->ext_namep->cname) == 0)
5377 				continue;
5378 		num_sps++;
5379 	}
5380 
5381 	if ((options & MDCMD_VERBOSE) != 0)
5382 		(void) printf(dgettext(TEXT_DOMAIN,
5383 		    "Found %d soft partition(s) on %s.\n"), num_sps,
5384 		    compnp->cname);
5385 
5386 	if (num_sps == 0) {
5387 		(void) printf(dgettext(TEXT_DOMAIN,
5388 		    "%s: No soft partitions.\n"), compnp->cname);
5389 		return (mdmderror(ep, MDE_RECOVER_FAILED, 0, compnp->cname));
5390 	}
5391 
5392 	/* check sequence numbers */
5393 	if ((options & MDCMD_VERBOSE) != 0)
5394 		(void) printf(dgettext(TEXT_DOMAIN,
5395 		    "Checking sequence numbers.\n"));
5396 
5397 	if (meta_sp_checkseq(extlist) != 0)
5398 		return (mdmderror(ep, MDE_RECOVER_FAILED, 0, compnp->cname));
5399 
5400 	return (0);
5401 }
5402 
5403 /*
5404  * FUNCTION:	meta_sp_validate_unit()
5405  * INPUT:	sp	- name of set we are recovering in
5406  *		compnp	- name of component we are recovering from
5407  *		options	- metarecover options
5408  * OUTPUT:	ep	- error pointer returned
5409  * RETURNS:	int	- 0 - success, -1 - error
5410  * PURPOSE:	validate and display metadb configuration.  begin by getting
5411  *		all soft partitions built on the specified component.  get
5412  *		the unit structure for each one and validate the fields within.
5413  */
5414 static int
5415 meta_sp_validate_unit(
5416 	mdsetname_t	*sp,
5417 	mdname_t	*compnp,
5418 	mdcmdopts_t	options,
5419 	md_error_t	*ep
5420 )
5421 {
5422 	md_sp_t		*msp;
5423 	mdnamelist_t	*spnlp = NULL;
5424 	mdnamelist_t	*namep = NULL;
5425 	int		count;
5426 	uint_t		extn;
5427 	sp_ext_length_t	size;
5428 
5429 	if ((options & MDCMD_VERBOSE) != 0)
5430 		(void) printf(dgettext(TEXT_DOMAIN,
5431 		    "%s: Validating soft partition metadb entries.\n"),
5432 		    compnp->cname);
5433 
5434 	if ((size = metagetsize(compnp, ep)) == MD_DISKADDR_ERROR)
5435 		return (-1);
5436 
5437 	/* get all soft partitions on component */
5438 	count = meta_sp_get_by_component(sp, compnp, &spnlp, 0, ep);
5439 
5440 	if (count == 0) {
5441 		(void) printf(dgettext(TEXT_DOMAIN,
5442 		    "%s: No soft partitions.\n"), compnp->cname);
5443 		return (mdmderror(ep, MDE_RECOVER_FAILED, 0, compnp->cname));
5444 	} else if (count < 0) {
5445 		return (-1);
5446 	}
5447 
5448 	/* Now go through the soft partitions and check each one */
5449 	for (namep = spnlp; namep != NULL; namep = namep->next) {
5450 		mdname_t	*curnp = namep->namep;
5451 		sp_ext_offset_t	curvoff;
5452 
5453 		/* get the unit structure */
5454 		if ((msp = meta_get_sp_common(sp, curnp, 0, ep)) == NULL)
5455 			return (-1);
5456 
5457 		/* verify generic unit structure parameters */
5458 		if ((options & MDCMD_VERBOSE) != 0)
5459 			(void) printf(dgettext(TEXT_DOMAIN,
5460 			    "\nVerifying device %s.\n"),
5461 			    curnp->cname);
5462 
5463 		/*
5464 		 * MD_SP_LAST is an invalid state and is always the
5465 		 * highest numbered.
5466 		 */
5467 		if (msp->status >= MD_SP_LAST) {
5468 			(void) printf(dgettext(TEXT_DOMAIN,
5469 			    "%s: status value %u is out of range.\n"),
5470 			    curnp->cname, msp->status);
5471 			return (mdmderror(ep, MDE_RECOVER_FAILED,
5472 			    0, curnp->cname));
5473 		} else if ((options & MDCMD_VERBOSE) != 0) {
5474 			uint_t	tstate = 0;
5475 
5476 			if (metaismeta(msp->compnamep)) {
5477 				if (meta_get_tstate(msp->common.namep->dev,
5478 				    &tstate, ep) != 0)
5479 					return (-1);
5480 			}
5481 			(void) printf(dgettext(TEXT_DOMAIN,
5482 			    "%s: Status \"%s\" is valid.\n"),
5483 			    curnp->cname, meta_sp_status_to_name(msp->status,
5484 			    tstate & MD_DEV_ERRORED));
5485 		}
5486 
5487 		/* Now verify each extent */
5488 		if ((options & MDCMD_VERBOSE) != 0)
5489 			(void) printf("%14s %21s %21s %21s\n",
5490 			    dgettext(TEXT_DOMAIN, "Extent Number"),
5491 			    dgettext(TEXT_DOMAIN, "Virtual Offset"),
5492 			    dgettext(TEXT_DOMAIN, "Physical Offset"),
5493 			    dgettext(TEXT_DOMAIN, "Length"));
5494 
5495 		curvoff = 0ULL;
5496 		for (extn = 0; extn < msp->ext.ext_len; extn++) {
5497 			md_sp_ext_t	*extp = &msp->ext.ext_val[extn];
5498 
5499 			if ((options & MDCMD_VERBOSE) != 0)
5500 				(void) printf("%14u %21llu %21llu %21llu\n",
5501 				    extn, extp->voff, extp->poff, extp->len);
5502 
5503 			if (extp->voff != curvoff) {
5504 				(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5505 				    "%s: virtual offset for extent %u "
5506 				    "is inconsistent, expected %llu, "
5507 				    "got %llu.\n"), curnp->cname, extn,
5508 				    curvoff, extp->voff);
5509 				return (mdmderror(ep, MDE_RECOVER_FAILED,
5510 				    0, compnp->cname));
5511 			}
5512 
5513 			/* make sure extent does not drop off the end */
5514 			if ((extp->poff + extp->len) == size) {
5515 				(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5516 				    "%s: extent %u at offset %llu, "
5517 				    "length %llu exceeds the size of the "
5518 				    "device, %llu.\n"), curnp->cname,
5519 				    extn, extp->poff, extp->len, size);
5520 				return (mdmderror(ep, MDE_RECOVER_FAILED,
5521 				    0, compnp->cname));
5522 			}
5523 
5524 			curvoff += extp->len;
5525 		}
5526 	}
5527 	if (options & MDCMD_PRINT) {
5528 		(void) printf(dgettext(TEXT_DOMAIN,
5529 		    "%s: Soft Partition metadb configuration is valid\n"),
5530 		    compnp->cname);
5531 	}
5532 	return (0);
5533 }
5534 
5535 /*
5536  * FUNCTION:	meta_sp_validate_wm_and_unit()
5537  * INPUT:	sp	- name of set we are recovering in
5538  *		compnp	- name of device we are recovering from
5539  *		options	- metarecover options
5540  * OUTPUT:	ep	- error pointer returned
5541  * RETURNS:	int	- 0 - success, -1 error
5542  * PURPOSE:	cross-validate and display watermarks and metadb records.
5543  *		get both the unit structures for the soft partitions built
5544  *		on the specified component and the watermarks found on that
5545  *		component and check to make sure they are consistent with
5546  *		each other.
5547  */
5548 static int
5549 meta_sp_validate_wm_and_unit(
5550 	mdsetname_t	*sp,
5551 	mdname_t	*np,
5552 	mdcmdopts_t	options,
5553 	md_error_t	*ep
5554 )
5555 {
5556 	sp_ext_node_t	*wmlist = NULL;
5557 	sp_ext_node_t	*unitlist = NULL;
5558 	sp_ext_node_t	*unitext;
5559 	sp_ext_node_t	*wmext;
5560 	sp_ext_offset_t	tmpunitoff;
5561 	mdnamelist_t	*spnlp = NULL;
5562 	int		count;
5563 	int		rval = 0;
5564 	int		verbose = (options & MDCMD_VERBOSE);
5565 
5566 	/* get unit structure list */
5567 	count = meta_sp_get_by_component(sp, np, &spnlp, 0, ep);
5568 	if (count <= 0)
5569 		return (-1);
5570 
5571 	meta_sp_list_insert(NULL, NULL, &unitlist,
5572 	    metagetsize(np, ep) - MD_SP_WMSIZE, MD_SP_WMSIZE,
5573 	    EXTTYP_END, 0, EXTFLG_UPDATE, meta_sp_cmp_by_offset);
5574 
5575 	if (meta_sp_extlist_from_namelist(sp, spnlp, &unitlist, ep) == -1) {
5576 		metafreenamelist(spnlp);
5577 		return (-1);
5578 	}
5579 
5580 	metafreenamelist(spnlp);
5581 
5582 	meta_sp_list_freefill(&unitlist, metagetsize(np, ep));
5583 
5584 	if (meta_sp_extlist_from_wm(sp, np, &wmlist,
5585 	    meta_sp_cmp_by_offset, ep) < 0) {
5586 		meta_sp_list_free(&unitlist);
5587 		return (-1);
5588 	}
5589 
5590 	if (getenv(META_SP_DEBUG)) {
5591 		meta_sp_debug("meta_sp_validate_wm_and_unit: unit list:\n");
5592 		meta_sp_list_dump(unitlist);
5593 		meta_sp_debug("meta_sp_validate_wm_and_unit: wm list:\n");
5594 		meta_sp_list_dump(wmlist);
5595 	}
5596 
5597 	/*
5598 	 * step through both lists and compare allocated nodes.  Free
5599 	 * nodes and end watermarks may differ between the two but
5600 	 * that's generally ok, and if they're wrong will typically
5601 	 * cause misplaced allocated extents.
5602 	 */
5603 	if (verbose)
5604 		(void) printf(dgettext(TEXT_DOMAIN, "\n%s: Verifying metadb "
5605 		    "allocations match extent headers.\n"), np->cname);
5606 
5607 	unitext = unitlist;
5608 	wmext = wmlist;
5609 	while ((wmext != NULL) && (unitext != NULL)) {
5610 		/* find next allocated extents in each list */
5611 		while (wmext != NULL && wmext->ext_type != EXTTYP_ALLOC)
5612 			wmext = wmext->ext_next;
5613 
5614 		while (unitext != NULL && unitext->ext_type != EXTTYP_ALLOC)
5615 			unitext = unitext->ext_next;
5616 
5617 		if (wmext == NULL || unitext == NULL)
5618 			break;
5619 
5620 		if (verbose) {
5621 			(void) printf(dgettext(TEXT_DOMAIN,
5622 			    "Metadb extent:\n"));
5623 			meta_sp_display_exthdr();
5624 			meta_sp_display_ext(unitext);
5625 			(void) printf(dgettext(TEXT_DOMAIN,
5626 			    "Extent header extent:\n"));
5627 			meta_sp_display_exthdr();
5628 			meta_sp_display_ext(wmext);
5629 			(void) printf("\n");
5630 		}
5631 
5632 		if (meta_sp_validate_exts(np, wmext, unitext, ep) < 0)
5633 			rval = -1;
5634 
5635 		/*
5636 		 * if the offsets aren't equal, only increment the
5637 		 * lowest one in hopes of getting the lists back in sync.
5638 		 */
5639 		tmpunitoff = unitext->ext_offset;
5640 		if (unitext->ext_offset <= wmext->ext_offset)
5641 			unitext = unitext->ext_next;
5642 		if (wmext->ext_offset <= tmpunitoff)
5643 			wmext = wmext->ext_next;
5644 	}
5645 
5646 	/*
5647 	 * if both lists aren't at the end then there are extra
5648 	 * allocated nodes in one of them.
5649 	 */
5650 	if (wmext != NULL) {
5651 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5652 		    "%s: extent headers contain allocations not in "
5653 		    "the metadb\n\n"), np->cname);
5654 		rval = -1;
5655 	}
5656 
5657 	if (unitext != NULL) {
5658 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5659 		    "%s: metadb contains allocations not in the extent "
5660 		    "headers\n\n"), np->cname);
5661 		rval = -1;
5662 	}
5663 
5664 	if (options & MDCMD_PRINT) {
5665 		if (rval == 0) {
5666 			(void) printf(dgettext(TEXT_DOMAIN,
5667 			    "%s: Soft Partition metadb matches extent "
5668 			    "header configuration\n"), np->cname);
5669 		} else {
5670 			(void) printf(dgettext(TEXT_DOMAIN,
5671 			    "%s: Soft Partition metadb does not match extent "
5672 			    "header configuration\n"), np->cname);
5673 		}
5674 	}
5675 
5676 	return (rval);
5677 }
5678 
5679 /*
5680  * FUNCTION:	meta_sp_validate_exts()
5681  * INPUT:	compnp	- name pointer for device we are recovering from
5682  *		wmext	- extent node representing watermark
5683  *		unitext	- extent node from unit structure
5684  * OUTPUT:	ep	- return error pointer
5685  * RETURNS:	int	- 0 - succes, mdmderror return code - error
5686  * PURPOSE:	Takes two extent nodes and checks them against each other.
5687  *		offset, length, sequence number, set, and name are compared.
5688  */
5689 static int
5690 meta_sp_validate_exts(
5691 	mdname_t	*compnp,
5692 	sp_ext_node_t	*wmext,
5693 	sp_ext_node_t	*unitext,
5694 	md_error_t	*ep
5695 )
5696 {
5697 	if (wmext->ext_offset != unitext->ext_offset) {
5698 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5699 		    "%s: unit structure and extent header offsets differ.\n"),
5700 		    compnp->cname);
5701 		return (mdmderror(ep, MDE_RECOVER_FAILED, 0, compnp->cname));
5702 	}
5703 
5704 	if (wmext->ext_length != unitext->ext_length) {
5705 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5706 		    "%s: unit structure and extent header lengths differ.\n"),
5707 		    compnp->cname);
5708 		return (mdmderror(ep, MDE_RECOVER_FAILED, 0, compnp->cname));
5709 	}
5710 
5711 	if (wmext->ext_seq != unitext->ext_seq) {
5712 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5713 		    "%s: unit structure and extent header sequence numbers "
5714 		    "differ.\n"), compnp->cname);
5715 		return (mdmderror(ep, MDE_RECOVER_FAILED, 0, compnp->cname));
5716 	}
5717 
5718 	if (wmext->ext_type != unitext->ext_type) {
5719 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5720 		    "%s: unit structure and extent header types differ.\n"),
5721 		    compnp->cname);
5722 		return (mdmderror(ep, MDE_RECOVER_FAILED, 0, compnp->cname));
5723 	}
5724 
5725 	/*
5726 	 * If one has a set pointer and the other doesn't, error.
5727 	 * If both extents have setnames, then make sure they match
5728 	 * If both are NULL, it's ok, they match.
5729 	 */
5730 	if ((unitext->ext_setp == NULL) ^ (wmext->ext_setp == NULL)) {
5731 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5732 		    "%s: unit structure and extent header set values "
5733 		    "differ.\n"), compnp->cname);
5734 		return (mdmderror(ep, MDE_RECOVER_FAILED, 0, compnp->cname));
5735 	}
5736 
5737 	if (unitext->ext_setp != NULL) {
5738 		if (strcmp(unitext->ext_setp->setname,
5739 		    wmext->ext_setp->setname) != 0) {
5740 			(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5741 			    "%s: unit structure and extent header set names "
5742 			    "differ.\n"), compnp->cname);
5743 			return (mdmderror(ep, MDE_RECOVER_FAILED,
5744 			    0, compnp->cname));
5745 		}
5746 	}
5747 
5748 	/*
5749 	 * If one has a name pointer and the other doesn't, error.
5750 	 * If both extents have names, then make sure they match
5751 	 * If both are NULL, it's ok, they match.
5752 	 */
5753 	if ((unitext->ext_namep == NULL) ^ (wmext->ext_namep == NULL)) {
5754 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5755 		    "%s: unit structure and extent header name values "
5756 		    "differ.\n"), compnp->cname);
5757 		return (mdmderror(ep, MDE_RECOVER_FAILED, 0, compnp->cname));
5758 	}
5759 
5760 	if (unitext->ext_namep != NULL) {
5761 		if (strcmp(wmext->ext_namep->cname,
5762 		    unitext->ext_namep->cname) != 0) {
5763 			(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5764 			    "%s: unit structure and extent header names "
5765 			    "differ.\n"), compnp->cname);
5766 			return (mdmderror(ep, MDE_RECOVER_FAILED,
5767 			    0, compnp->cname));
5768 		}
5769 	}
5770 
5771 	return (0);
5772 }
5773 
5774 /*
5775  * FUNCTION:	update_sp_status()
5776  * INPUT:	sp	- name of set we are recovering in
5777  *		minors	- pointer to an array of soft partition minor numbers
5778  *		num_sps	- number of minor numbers in array
5779  *		status	- new status to be applied to all soft parts in array
5780  *		mn_set	- set if current set is a multi-node set
5781  * OUTPUT:	ep	- return error pointer
5782  * RETURNS:	int	- 0 - success, -1 - error
5783  * PURPOSE:	update  status of soft partitions to new status. minors is an
5784  *		array of minor numbers to apply the new status to.
5785  *		If mn_set is set, a message is sent to all nodes in the
5786  *		cluster to update the status locally.
5787  */
5788 static int
5789 update_sp_status(
5790 	mdsetname_t	*sp,
5791 	minor_t		*minors,
5792 	int		num_sps,
5793 	sp_status_t	status,
5794 	bool_t		mn_set,
5795 	md_error_t	*ep
5796 )
5797 {
5798 	int	i;
5799 	int	err = 0;
5800 
5801 	if (mn_set) {
5802 		md_mn_msg_sp_setstat_t	sp_setstat_params;
5803 		int			result;
5804 		md_mn_result_t		*resp = NULL;
5805 
5806 		for (i = 0; i < num_sps; i++) {
5807 			sp_setstat_params.sp_setstat_mnum = minors[i];
5808 			sp_setstat_params.sp_setstat_status = status;
5809 
5810 			result = mdmn_send_message(sp->setno,
5811 			    MD_MN_MSG_SP_SETSTAT, MD_MSGF_DEFAULT_FLAGS,
5812 			    (char *)&sp_setstat_params,
5813 			    sizeof (sp_setstat_params),
5814 			    &resp, ep);
5815 			if (resp != NULL) {
5816 				if (resp->mmr_exitval != 0)
5817 					err = -1;
5818 				free_result(resp);
5819 			}
5820 			if (result != 0) {
5821 				err = -1;
5822 			}
5823 		}
5824 	} else {
5825 		if (meta_sp_setstatus(sp, minors, num_sps, status, ep) < 0)
5826 			err = -1;
5827 	}
5828 	if (err < 0) {
5829 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5830 		    "Error updating status on recovered soft "
5831 		    "partitions.\n"));
5832 	}
5833 	return (err);
5834 }
5835 
5836 /*
5837  * FUNCTION:	meta_sp_recover_from_wm()
5838  * INPUT:	sp	- name of set we are recovering in
5839  *		compnp	- name pointer for component we are recovering from
5840  *		options	- metarecover options
5841  * OUTPUT:	ep	- return error pointer
5842  * RETURNS:	int	- 0 - success, -1 - error
5843  * PURPOSE:	update metadb records to match watermarks.  begin by getting
5844  *		an extlist representing all soft partitions on the component.
5845  *		then build a unit structure for each soft partition.
5846  *		notify user of changes, then commit each soft partition to
5847  *		the metadb one at a time in the "recovering" state.  update
5848  *		any watermarks that may need it	(to reflect possible name
5849  *		changes), and, finally, set the status of all recovered
5850  *		partitions to the "OK" state at once.
5851  */
5852 static int
5853 meta_sp_recover_from_wm(
5854 	mdsetname_t	*sp,
5855 	mdname_t	*compnp,
5856 	mdcmdopts_t	options,
5857 	md_error_t	*ep
5858 )
5859 {
5860 	sp_ext_node_t		*extlist = NULL;
5861 	sp_ext_node_t		*sp_list = NULL;
5862 	sp_ext_node_t		*update_list = NULL;
5863 	sp_ext_node_t		*ext;
5864 	sp_ext_node_t		*sp_ext;
5865 	mp_unit_t		*mp;
5866 	mp_unit_t		**un_array;
5867 	int			numexts = 0, num_sps = 0, i = 0;
5868 	int			err = 0;
5869 	int			not_recovered = 0;
5870 	int			committed = 0;
5871 	sp_ext_length_t		sp_length = 0LL;
5872 	mdnamelist_t		*keynlp = NULL;
5873 	mdname_t		*np;
5874 	mdname_t		*new_np;
5875 	int			new_name;
5876 	md_set_params_t		set_params;
5877 	minor_t			*minors = NULL;
5878 	char			yesno[255];
5879 	char			*yes;
5880 	bool_t			mn_set = 0;
5881 	md_set_desc		*sd;
5882 	mm_unit_t		*mm;
5883 	md_set_mmown_params_t	*ownpar = NULL;
5884 	int			comp_is_mirror = 0;
5885 
5886 	/*
5887 	 * if this component appears in another metadevice already, do
5888 	 * NOT recover from it.
5889 	 */
5890 	if (meta_check_inmeta(sp, compnp, options, 0, -1, ep) != 0)
5891 		return (-1);
5892 
5893 	/* set flag if dealing with a MN set */
5894 	if (!metaislocalset(sp)) {
5895 		if ((sd = metaget_setdesc(sp, ep)) == NULL) {
5896 			return (-1);
5897 		}
5898 		if (MD_MNSET_DESC(sd))
5899 			mn_set = 1;
5900 	}
5901 	/*
5902 	 * for each watermark, build an ext_node, place on list.
5903 	 */
5904 	if (meta_sp_extlist_from_wm(sp, compnp, &extlist,
5905 	    meta_sp_cmp_by_nameseq, ep) < 0)
5906 		return (mdmderror(ep, MDE_RECOVER_FAILED, 0, compnp->cname));
5907 
5908 	assert(extlist != NULL);
5909 
5910 	/* count number of soft partitions */
5911 	for (ext = extlist;
5912 	    ext != NULL && ext->ext_type == EXTTYP_ALLOC;
5913 	    ext = ext->ext_next) {
5914 		if (ext->ext_next != NULL &&
5915 		    ext->ext_next->ext_namep != NULL &&
5916 		    strcmp(ext->ext_next->ext_namep->cname,
5917 			ext->ext_namep->cname) == 0)
5918 				continue;
5919 		num_sps++;
5920 	}
5921 
5922 	/* allocate array of unit structure pointers */
5923 	un_array = Zalloc(num_sps * sizeof (mp_unit_t *));
5924 
5925 	/*
5926 	 * build unit structures from list of ext_nodes.
5927 	 */
5928 	for (ext = extlist;
5929 	    ext != NULL && ext->ext_type == EXTTYP_ALLOC;
5930 	    ext = ext->ext_next) {
5931 		meta_sp_list_insert(ext->ext_setp, ext->ext_namep,
5932 		    &sp_list, ext->ext_offset, ext->ext_length,
5933 		    ext->ext_type, ext->ext_seq, ext->ext_flags,
5934 		    meta_sp_cmp_by_nameseq);
5935 
5936 		numexts++;
5937 		sp_length += ext->ext_length - MD_SP_WMSIZE;
5938 
5939 		if (ext->ext_next != NULL &&
5940 		    ext->ext_next->ext_namep != NULL &&
5941 		    strcmp(ext->ext_next->ext_namep->cname,
5942 			ext->ext_namep->cname) == 0)
5943 				continue;
5944 
5945 		/*
5946 		 * if we made it here, we are at a soft partition
5947 		 * boundary in the list.
5948 		 */
5949 		if (getenv(META_SP_DEBUG)) {
5950 			meta_sp_debug("meta_recover_from_wm: dumping wm "
5951 			    "list:\n");
5952 			meta_sp_list_dump(sp_list);
5953 		}
5954 
5955 		assert(sp_list != NULL);
5956 		assert(sp_list->ext_namep != NULL);
5957 
5958 		if ((new_name = meta_sp_resolve_name_conflict(sp,
5959 		    sp_list->ext_namep, &new_np, ep)) < 0) {
5960 			err = 1;
5961 			goto out;
5962 		} else if (new_name) {
5963 			for (sp_ext = sp_list;
5964 			    sp_ext != NULL;
5965 			    sp_ext = sp_ext->ext_next) {
5966 				/*
5967 				 * insert into the update list for
5968 				 * watermark update.
5969 				 */
5970 				meta_sp_list_insert(sp_ext->ext_setp,
5971 				    new_np, &update_list, sp_ext->ext_offset,
5972 				    sp_ext->ext_length, sp_ext->ext_type,
5973 				    sp_ext->ext_seq, EXTFLG_UPDATE,
5974 				    meta_sp_cmp_by_offset);
5975 			}
5976 
5977 		}
5978 		if (options & MDCMD_DOIT) {
5979 			/* store name in namespace */
5980 			if (mn_set) {
5981 				/* send message to all nodes to return key */
5982 				md_mn_msg_addkeyname_t	*send_params;
5983 				int			result;
5984 				md_mn_result_t		*resp = NULL;
5985 				int			message_size;
5986 
5987 				message_size =  sizeof (*send_params) +
5988 				    strlen(compnp->cname) + 1;
5989 				send_params = Zalloc(message_size);
5990 				send_params->addkeyname_setno = sp->setno;
5991 				(void) strcpy(&send_params->addkeyname_name[0],
5992 				    compnp->cname);
5993 				result = mdmn_send_message(sp->setno,
5994 				    MD_MN_MSG_ADDKEYNAME, MD_MSGF_DEFAULT_FLAGS,
5995 				    (char *)send_params, message_size, &resp,
5996 				    ep);
5997 				Free(send_params);
5998 				if (resp != NULL) {
5999 					if (resp->mmr_exitval >= 0) {
6000 						compnp->key =
6001 						    (mdkey_t)resp->mmr_exitval;
6002 					} else {
6003 						err = 1;
6004 						free_result(resp);
6005 						goto out;
6006 					}
6007 					free_result(resp);
6008 				}
6009 				if (result != 0) {
6010 					err = 1;
6011 					goto out;
6012 				}
6013 				(void) metanamelist_append(&keynlp, compnp);
6014 			} else {
6015 				if (add_key_name(sp, compnp, &keynlp,
6016 				    ep) != 0) {
6017 					err = 1;
6018 					goto out;
6019 				}
6020 			}
6021 		}
6022 
6023 		/* create the unit structure */
6024 		if ((mp = meta_sp_createunit(
6025 		    (new_name) ? new_np : sp_list->ext_namep, compnp,
6026 		    sp_list, numexts, sp_length, MD_SP_RECOVER, ep)) == NULL) {
6027 			err = 1;
6028 			goto out;
6029 		}
6030 
6031 		if (getenv(META_SP_DEBUG)) {
6032 			meta_sp_debug("meta_sp_recover_from_wm: "
6033 			    "printing newly created unit structure");
6034 			meta_sp_printunit(mp);
6035 		}
6036 
6037 		/* place in unit structure array */
6038 		un_array[i++] = mp;
6039 
6040 		/* free sp_list */
6041 		meta_sp_list_free(&sp_list);
6042 		sp_list = NULL;
6043 		numexts = 0;
6044 		sp_length = 0LL;
6045 	}
6046 
6047 	/* display configuration updates */
6048 	(void) printf(dgettext(TEXT_DOMAIN,
6049 	    "The following soft partitions were found and will be added to\n"
6050 	    "your metadevice configuration.\n"));
6051 	(void) printf("%5s %15s %18s\n",
6052 	    dgettext(TEXT_DOMAIN, "Name"),
6053 	    dgettext(TEXT_DOMAIN, "Size"),
6054 	    dgettext(TEXT_DOMAIN, "No. of Extents"));
6055 	for (i = 0; i < num_sps; i++) {
6056 		(void) printf("%5s%lu %15llu %9d\n", "d",
6057 		    MD_MIN2UNIT(MD_SID(un_array[i])),
6058 		    un_array[i]->un_length, un_array[i]->un_numexts);
6059 	}
6060 
6061 	if (!(options & MDCMD_DOIT)) {
6062 		not_recovered = 1;
6063 		goto out;
6064 	}
6065 
6066 	/* ask user for confirmation */
6067 	(void) printf(dgettext(TEXT_DOMAIN,
6068 	    "WARNING: You are about to add one or more soft partition\n"
6069 	    "metadevices to your metadevice configuration.  If there\n"
6070 	    "appears to be an error in the soft partition(s) displayed\n"
6071 	    "above, do NOT proceed with this recovery operation.\n"));
6072 	(void) printf(dgettext(TEXT_DOMAIN,
6073 	    "Are you sure you want to do this (yes/no)? "));
6074 
6075 	(void) fflush(stdout);
6076 	if ((fgets(yesno, sizeof (yesno), stdin) == NULL) ||
6077 	    (strlen(yesno) == 1))
6078 		(void) snprintf(yesno, sizeof (yesno), "%s\n",
6079 		    dgettext(TEXT_DOMAIN, "no"));
6080 	yes = dgettext(TEXT_DOMAIN, "yes");
6081 	if (strncasecmp(yesno, yes, strlen(yesno) - 1) != 0) {
6082 		not_recovered = 1;
6083 		goto out;
6084 	}
6085 
6086 	/* commit records one at a time */
6087 	for (i = 0; i < num_sps; i++) {
6088 		(void) memset(&set_params, 0, sizeof (set_params));
6089 		set_params.mnum = MD_SID(un_array[i]);
6090 		set_params.size = (un_array[i])->c.un_size;
6091 		set_params.mdp = (uintptr_t)(un_array[i]);
6092 		set_params.options =
6093 				meta_check_devicesize(un_array[i]->un_length);
6094 		if (set_params.options == MD_CRO_64BIT) {
6095 			un_array[i]->c.un_revision = MD_64BIT_META_DEV;
6096 		} else {
6097 			un_array[i]->c.un_revision = MD_32BIT_META_DEV;
6098 		}
6099 		MD_SETDRIVERNAME(&set_params, MD_SP,
6100 		    MD_MIN2SET(set_params.mnum));
6101 
6102 		np = metamnumname(&sp, MD_SID(un_array[i]), 0, ep);
6103 
6104 		/*
6105 		 * If this is an MN set, send the MD_IOCSET ioctl to all nodes
6106 		 */
6107 		if (mn_set) {
6108 			md_mn_msg_iocset_t	send_params;
6109 			int			result;
6110 			md_mn_result_t		*resp = NULL;
6111 			int			mess_size;
6112 
6113 			/*
6114 			 * Calculate message size. md_mn_msg_iocset_t only
6115 			 * contains one extent, so increment the size to
6116 			 * include all extents
6117 			 */
6118 			mess_size = sizeof (send_params) -
6119 			    sizeof (mp_ext_t) +
6120 			    (un_array[i]->un_numexts * sizeof (mp_ext_t));
6121 
6122 			send_params.iocset_params = set_params;
6123 			(void) memcpy(&send_params.unit, un_array[i],
6124 			    sizeof (*un_array[i]) - sizeof (mp_ext_t) +
6125 			    (un_array[i]->un_numexts * sizeof (mp_ext_t)));
6126 			result = mdmn_send_message(sp->setno,
6127 			    MD_MN_MSG_IOCSET, MD_MSGF_DEFAULT_FLAGS,
6128 			    (char *)&send_params, mess_size, &resp,
6129 			    ep);
6130 			if (resp != NULL) {
6131 				if (resp->mmr_exitval != 0)
6132 					err = 1;
6133 				free_result(resp);
6134 			}
6135 			if (result != 0) {
6136 				err = 1;
6137 			}
6138 		} else {
6139 			if (metaioctl(MD_IOCSET, &set_params, &set_params.mde,
6140 			    np->cname) != 0) {
6141 				err = 1;
6142 			}
6143 		}
6144 
6145 		if (err == 1) {
6146 			(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
6147 			    "%s: Error committing record to metadb.\n"),
6148 			    np->cname);
6149 			goto out;
6150 		}
6151 
6152 		/* note that we've committed a record */
6153 		if (!committed)
6154 			committed = 1;
6155 
6156 		/* update any watermarks that need it */
6157 		if (update_list != NULL) {
6158 			md_sp_t *msp;
6159 
6160 			/*
6161 			 * Check to see if we're trying to create a partition
6162 			 * on a mirror. If so we may have to enforce an
6163 			 * ownership change before writing the watermark out.
6164 			 */
6165 			if (metaismeta(compnp)) {
6166 				char *miscname;
6167 
6168 				miscname = metagetmiscname(compnp, ep);
6169 				if (miscname != NULL)
6170 					comp_is_mirror = (strcmp(miscname,
6171 					    MD_MIRROR) == 0);
6172 				else
6173 					comp_is_mirror = 0;
6174 			}
6175 			/*
6176 			 * If this is a MN set and the component is a mirror,
6177 			 * change ownership to this node in order to write the
6178 			 * watermarks
6179 			 */
6180 			if (mn_set && comp_is_mirror) {
6181 				mm = (mm_unit_t *)meta_get_unit(sp, compnp, ep);
6182 				if (mm == NULL) {
6183 					err = 1;
6184 					goto out;
6185 				} else {
6186 					err = meta_mn_change_owner(&ownpar,
6187 						sp->setno,
6188 						meta_getminor(compnp->dev),
6189 						sd->sd_mn_mynode->nd_nodeid,
6190 						MD_MN_MM_PREVENT_CHANGE |
6191 						    MD_MN_MM_SPAWN_THREAD);
6192 					if (err != 0)
6193 						goto out;
6194 				}
6195 			}
6196 
6197 			if ((msp = meta_get_sp(sp, np, ep)) == NULL) {
6198 				err = 1;
6199 				(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
6200 				    "%s: Error updating extent headers.\n"),
6201 				    np->cname);
6202 				goto out;
6203 			}
6204 			if (meta_sp_update_wm(sp, msp, update_list, ep) < 0) {
6205 				err = 1;
6206 				(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
6207 				    "%s: Error updating extent headers "
6208 				    "on disk.\n"), np->cname);
6209 				goto out;
6210 			}
6211 		}
6212 		/*
6213 		 * If we have changed ownership earlier and prevented any
6214 		 * ownership changes, we can now allow ownership changes
6215 		 * again.
6216 		 */
6217 		if (ownpar) {
6218 			(void) meta_mn_change_owner(&ownpar, sp->setno,
6219 			    ownpar->d.mnum,
6220 			    ownpar->d.owner,
6221 			    MD_MN_MM_ALLOW_CHANGE | MD_MN_MM_SPAWN_THREAD);
6222 		}
6223 	}
6224 
6225 	/* update status of all soft partitions to OK */
6226 	minors = Zalloc(num_sps * sizeof (minor_t));
6227 	for (i = 0; i < num_sps; i++)
6228 		minors[i] = MD_SID(un_array[i]);
6229 
6230 	err = update_sp_status(sp, minors, num_sps, MD_SP_OK, mn_set, ep);
6231 	if (err != 0)
6232 		goto out;
6233 
6234 	if (options & MDCMD_PRINT)
6235 		(void) printf(dgettext(TEXT_DOMAIN, "%s: "
6236 		    "Soft Partitions recovered from device.\n"),
6237 		    compnp->cname);
6238 out:
6239 	/* free memory */
6240 	if (extlist != NULL)
6241 		meta_sp_list_free(&extlist);
6242 	if (sp_list != NULL)
6243 		meta_sp_list_free(&sp_list);
6244 	if (update_list != NULL)
6245 		meta_sp_list_free(&update_list);
6246 	if (un_array != NULL)	{
6247 		for (i = 0; i < num_sps; i++)
6248 			Free(un_array[i]);
6249 		Free(un_array);
6250 	}
6251 	if (minors != NULL)
6252 		Free(minors);
6253 	if (ownpar != NULL)
6254 		Free(ownpar);
6255 	(void) fflush(stdout);
6256 
6257 	if ((keynlp != NULL) && (committed != 1)) {
6258 		/*
6259 		 * if we haven't committed any softparts, either because of an
6260 		 * error or because the user decided not to proceed, delete
6261 		 * namelist key for the component
6262 		 */
6263 		if (mn_set) {
6264 			mdnamelist_t	*p;
6265 
6266 			for (p = keynlp; (p != NULL); p = p->next) {
6267 				mdname_t		*np = p->namep;
6268 				md_mn_msg_delkeyname_t	send_params;
6269 				md_mn_result_t		*resp = NULL;
6270 
6271 				send_params.delkeyname_dev = np->dev;
6272 				send_params.delkeyname_setno = sp->setno;
6273 				send_params.delkeyname_key = np->key;
6274 				(void) mdmn_send_message(sp->setno,
6275 				    MD_MN_MSG_DELKEYNAME, MD_MSGF_DEFAULT_FLAGS,
6276 				    (char *)&send_params, sizeof (send_params),
6277 				    &resp, ep);
6278 				if (resp != NULL) {
6279 					free_result(resp);
6280 				}
6281 			}
6282 		} else {
6283 			(void) del_key_names(sp, keynlp, NULL);
6284 		}
6285 	}
6286 
6287 	metafreenamelist(keynlp);
6288 
6289 	if (err)
6290 		return (mdmderror(ep, MDE_RECOVER_FAILED, 0, compnp->cname));
6291 
6292 	if (not_recovered)
6293 		if (options & MDCMD_PRINT)
6294 			(void) printf(dgettext(TEXT_DOMAIN, "%s: "
6295 			    "Soft Partitions NOT recovered from device.\n"),
6296 			    compnp->cname);
6297 	return (0);
6298 }
6299 
6300 /*
6301  * FUNCTION:	meta_sp_recover_from_unit()
6302  * INPUT:	sp	- name of set we are recovering in
6303  *		compnp	- name of component we are recovering from
6304  *		options	- metarecover options
6305  * OUTPUT:	ep	- return error pointer
6306  * RETURNS:	int	- 0 - success, -1 - error
6307  * PURPOSE:	update watermarks to match metadb records.  begin by getting
6308  *		a namelist representing all soft partitions on the specified
6309  *		component.  then, build an extlist representing the soft
6310  *		partitions, filling in the freespace extents.  notify user
6311  *		of changes, place all soft partitions into the "recovering"
6312  *		state and update the watermarks.  finally, return all soft
6313  *		partitions to the "OK" state.
6314  */
6315 static int
6316 meta_sp_recover_from_unit(
6317 	mdsetname_t	*sp,
6318 	mdname_t	*compnp,
6319 	mdcmdopts_t	options,
6320 	md_error_t	*ep
6321 )
6322 {
6323 	mdnamelist_t	*spnlp = NULL;
6324 	mdnamelist_t	*nlp = NULL;
6325 	sp_ext_node_t	*ext = NULL;
6326 	sp_ext_node_t	*extlist = NULL;
6327 	int		count;
6328 	char		yesno[255];
6329 	char		*yes;
6330 	int		rval = 0;
6331 	minor_t		*minors = NULL;
6332 	int		i;
6333 	md_sp_t		*msp;
6334 	md_set_desc	*sd;
6335 	bool_t		mn_set = 0;
6336 	daddr_t		start_block;
6337 
6338 	count = meta_sp_get_by_component(sp, compnp, &spnlp, 0, ep);
6339 	if (count <= 0)
6340 		return (-1);
6341 
6342 	/* set flag if dealing with a MN set */
6343 	if (!metaislocalset(sp)) {
6344 		if ((sd = metaget_setdesc(sp, ep)) == NULL) {
6345 			return (-1);
6346 		}
6347 		if (MD_MNSET_DESC(sd))
6348 			mn_set = 1;
6349 	}
6350 	/*
6351 	 * Save the XDR unit structure for one of the soft partitions;
6352 	 * we'll use this later to provide metadevice context to
6353 	 * update the watermarks so the device can be resolved by
6354 	 * devid instead of dev_t.
6355 	 */
6356 	if ((msp = meta_get_sp(sp, spnlp->namep, ep)) == NULL) {
6357 		metafreenamelist(spnlp);
6358 		return (-1);
6359 	}
6360 
6361 	if ((start_block = meta_sp_get_start(sp, compnp, ep)) ==
6362 	    MD_DISKADDR_ERROR) {
6363 		return (-1);
6364 	}
6365 
6366 	meta_sp_list_insert(NULL, NULL, &extlist, 0ULL, start_block,
6367 	    EXTTYP_RESERVED, 0, 0, meta_sp_cmp_by_offset);
6368 	meta_sp_list_insert(NULL, NULL, &extlist,
6369 	    metagetsize(compnp, ep) - MD_SP_WMSIZE, MD_SP_WMSIZE,
6370 	    EXTTYP_END, 0, EXTFLG_UPDATE, meta_sp_cmp_by_offset);
6371 
6372 	if (meta_sp_extlist_from_namelist(sp, spnlp, &extlist, ep) == -1) {
6373 		metafreenamelist(spnlp);
6374 		return (-1);
6375 	}
6376 
6377 	assert(extlist != NULL);
6378 	if ((options & MDCMD_VERBOSE) != 0) {
6379 		(void) printf(dgettext(TEXT_DOMAIN,
6380 		    "Updating extent headers on device %s from metadb.\n\n"),
6381 		    compnp->cname);
6382 		(void) printf(dgettext(TEXT_DOMAIN,
6383 		    "The following extent headers will be written:\n"));
6384 		meta_sp_display_exthdr();
6385 	}
6386 
6387 	meta_sp_list_freefill(&extlist, metagetsize(compnp, ep));
6388 
6389 	for (ext = extlist; ext != NULL; ext = ext->ext_next) {
6390 
6391 		/* mark every node for updating except the reserved space */
6392 		if (ext->ext_type != EXTTYP_RESERVED) {
6393 			ext->ext_flags |= EXTFLG_UPDATE;
6394 
6395 			/* print extent information */
6396 			if ((options & MDCMD_VERBOSE) != 0)
6397 				meta_sp_display_ext(ext);
6398 		}
6399 	}
6400 
6401 	/* request verification and then update all watermarks */
6402 	if ((options & MDCMD_DOIT) != 0) {
6403 
6404 		(void) printf(dgettext(TEXT_DOMAIN,
6405 		    "\nWARNING: You are about to overwrite portions of %s\n"
6406 		    "with soft partition metadata. The extent headers will be\n"
6407 		    "written to match the existing metadb configuration.  If\n"
6408 		    "the device was not previously setup with this\n"
6409 		    "configuration, data loss may result.\n\n"),
6410 		    compnp->cname);
6411 		(void) printf(dgettext(TEXT_DOMAIN,
6412 		    "Are you sure you want to do this (yes/no)? "));
6413 
6414 		(void) fflush(stdout);
6415 		if ((fgets(yesno, sizeof (yesno), stdin) == NULL) ||
6416 		    (strlen(yesno) == 1))
6417 			(void) snprintf(yesno, sizeof (yesno),
6418 			    "%s\n", dgettext(TEXT_DOMAIN, "no"));
6419 		yes = dgettext(TEXT_DOMAIN, "yes");
6420 		if (strncasecmp(yesno, yes, strlen(yesno) - 1) == 0) {
6421 			/* place soft partitions into recovering state */
6422 			minors = Zalloc(count * sizeof (minor_t));
6423 			for (nlp = spnlp, i = 0;
6424 			    nlp != NULL && i < count;
6425 			    nlp = nlp->next, i++) {
6426 				assert(nlp->namep != NULL);
6427 				minors[i] = meta_getminor(nlp->namep->dev);
6428 			}
6429 			if (update_sp_status(sp, minors, count,
6430 			    MD_SP_RECOVER, mn_set, ep) != 0) {
6431 				rval = -1;
6432 				goto out;
6433 			}
6434 
6435 			/* update the watermarks */
6436 			if (meta_sp_update_wm(sp, msp, extlist, ep) < 0) {
6437 				rval = -1;
6438 				goto out;
6439 			}
6440 
6441 			if (options & MDCMD_PRINT) {
6442 				(void) printf(dgettext(TEXT_DOMAIN, "%s: "
6443 				    "Soft Partitions recovered from metadb\n"),
6444 				    compnp->cname);
6445 			}
6446 
6447 			/* return soft partitions to the OK state */
6448 			if (update_sp_status(sp, minors, count,
6449 			    MD_SP_OK, mn_set, ep) != 0) {
6450 				rval = -1;
6451 				goto out;
6452 			}
6453 
6454 			rval = 0;
6455 			goto out;
6456 		}
6457 	}
6458 
6459 	if (options & MDCMD_PRINT) {
6460 		(void) printf(dgettext(TEXT_DOMAIN,
6461 		    "%s: Soft Partitions NOT recovered from metadb\n"),
6462 		    compnp->cname);
6463 	}
6464 
6465 out:
6466 	if (minors != NULL)
6467 		Free(minors);
6468 	metafreenamelist(spnlp);
6469 	meta_sp_list_free(&extlist);
6470 	(void) fflush(stdout);
6471 	return (rval);
6472 }
6473 
6474 
6475 /*
6476  * FUNCTION:	meta_sp_update_abr()
6477  * INPUT:	sp	- name of set we are recovering in
6478  * OUTPUT:	ep	- return error pointer
6479  * RETURNS:	int	- 0 - success, -1 - error
6480  * PURPOSE:	update the ABR state for all soft partitions in the set. This
6481  *		is called when joining a set. It sends a message to the master
6482  *		node for each soft partition to get the value of tstate and
6483  *		then sets ABR ,if required, by opening the sp, setting ABR
6484  *		and then closing the sp. This approach is taken rather that
6485  *		just issuing the MD_MN_SET_CAP ioctl, in order to deal with
6486  *		the case when we have another node simultaneously unsetting ABR.
6487  */
6488 int
6489 meta_sp_update_abr(
6490 	mdsetname_t	*sp,
6491 	md_error_t	*ep
6492 )
6493 {
6494 	mdnamelist_t	*devnlp = NULL;
6495 	mdnamelist_t	*p;
6496 	mdname_t	*devnp = NULL;
6497 	md_unit_t	*un;
6498 	char		fname[MAXPATHLEN];
6499 	int		mnum, fd;
6500 	volcap_t	vc;
6501 	uint_t		tstate;
6502 
6503 
6504 	if (meta_get_sp_names(sp, &devnlp, 0, ep) < 0) {
6505 		return (-1);
6506 	}
6507 
6508 	/* Exit if no soft partitions in this set */
6509 	if (devnlp == NULL)
6510 		return (0);
6511 
6512 	/* For each soft partition */
6513 	for (p = devnlp; (p != NULL); p = p->next) {
6514 		devnp = p->namep;
6515 
6516 		/* check if this is a top level metadevice */
6517 		if ((un = meta_get_mdunit(sp, devnp, ep)) == NULL)
6518 			goto out;
6519 		if (MD_HAS_PARENT(MD_PARENT(un))) {
6520 			Free(un);
6521 			continue;
6522 		}
6523 		Free(un);
6524 
6525 		/* Get tstate from Master */
6526 		if (meta_mn_send_get_tstate(devnp->dev, &tstate, ep) != 0) {
6527 			mdname_t	*np;
6528 			np = metamnumname(&sp, meta_getminor(devnp->dev), 0,
6529 			    ep);
6530 			if (np) {
6531 				md_perror(dgettext(TEXT_DOMAIN,
6532 				    "Unable to get tstate for %s"), np->cname);
6533 			}
6534 			continue;
6535 		}
6536 		/* If not set on the master, nothing to do */
6537 		if (!(tstate & MD_ABR_CAP))
6538 			continue;
6539 
6540 		mnum = meta_getminor(devnp->dev);
6541 		(void) snprintf(fname, MAXPATHLEN, "/dev/md/%s/rdsk/d%u",
6542 		    sp->setname, (unsigned)MD_MIN2UNIT(mnum));
6543 		if ((fd = open(fname, O_RDWR, 0)) < 0) {
6544 			md_perror(dgettext(TEXT_DOMAIN,
6545 			    "Could not open device %s"), fname);
6546 			continue;
6547 		}
6548 
6549 		/* Set ABR state */
6550 		vc.vc_info = 0;
6551 		vc.vc_set = 0;
6552 		if (ioctl(fd, DKIOCGETVOLCAP, &vc) < 0) {
6553 			(void) close(fd);
6554 			continue;
6555 		}
6556 
6557 		vc.vc_set = DKV_ABR_CAP;
6558 		if (ioctl(fd, DKIOCSETVOLCAP, &vc) < 0) {
6559 			(void) close(fd);
6560 			goto out;
6561 		}
6562 
6563 		(void) close(fd);
6564 	}
6565 	metafreenamelist(devnlp);
6566 	return (0);
6567 out:
6568 	metafreenamelist(devnlp);
6569 	return (-1);
6570 }
6571 
6572 /*
6573  * FUNCTION:	meta_mn_sp_update_abr()
6574  * INPUT:	arg	- Given set.
6575  * PURPOSE:	update the ABR state for all soft partitions in the set by
6576  *		forking a process to call meta_sp_update_abr()
6577  *		This function is only called via rpc.metad when adding a node
6578  *		to a set, ie this node is beong joined to the set by another
6579  *		node.
6580  */
6581 void *
6582 meta_mn_sp_update_abr(void *arg)
6583 {
6584 	set_t		setno = *((set_t *)arg);
6585 	mdsetname_t	*sp;
6586 	md_error_t	mde = mdnullerror;
6587 	int		fval;
6588 
6589 	/* should have a set */
6590 	assert(setno != NULL);
6591 
6592 	if ((sp = metasetnosetname(setno, &mde)) == NULL) {
6593 		mde_perror(&mde, "");
6594 		return (NULL);
6595 	}
6596 
6597 	if (!(meta_is_mn_set(sp, &mde))) {
6598 		mde_perror(&mde, "");
6599 		return (NULL);
6600 	}
6601 
6602 	/* fork a process */
6603 	if ((fval = md_daemonize(sp, &mde)) != 0) {
6604 		/*
6605 		 * md_daemonize will fork off a process.  The is the
6606 		 * parent or error.
6607 		 */
6608 		if (fval > 0) {
6609 			return (NULL);
6610 		}
6611 		mde_perror(&mde, "");
6612 		return (NULL);
6613 	}
6614 	/*
6615 	 * Child process should never return back to rpc.metad, but
6616 	 * should exit.
6617 	 * Flush all internally cached data inherited from parent process
6618 	 * since cached data will be cleared when parent process RPC request
6619 	 * has completed (which is possibly before this child process
6620 	 * can complete).
6621 	 * Child process can retrieve and cache its own copy of data from
6622 	 * rpc.metad that won't be changed by the parent process.
6623 	 *
6624 	 * Reset md_in_daemon since this child will be a client of rpc.metad
6625 	 * not part of the rpc.metad daemon itself.
6626 	 * md_in_daemon is used by rpc.metad so that libmeta can tell if
6627 	 * this thread is rpc.metad or any other thread.  (If this thread
6628 	 * was rpc.metad it could use some short circuit code to get data
6629 	 * directly from rpc.metad instead of doing an RPC call to rpc.metad).
6630 	 */
6631 	md_in_daemon = 0;
6632 	metaflushsetname(sp);
6633 	sr_cache_flush_setno(setno);
6634 	if ((sp = metasetnosetname(setno, &mde)) == NULL) {
6635 		mde_perror(&mde, "");
6636 		md_exit(sp, 1);
6637 	}
6638 
6639 
6640 	/*
6641 	 * Closing stdin/out/err here.
6642 	 */
6643 	(void) close(0);
6644 	(void) close(1);
6645 	(void) close(2);
6646 	assert(fval == 0);
6647 
6648 	(void) meta_sp_update_abr(sp, &mde);
6649 
6650 	md_exit(sp, 0);
6651 	/*NOTREACHED*/
6652 	return (NULL);
6653 }
6654