xref: /titanic_41/usr/src/lib/lvm/libmeta/common/meta_sp.c (revision 5e01956f3000408c2a2c5a08c8d0acf2c2a9d8ee)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2001, 2010, Oracle and/or its affiliates. All rights reserved.
24  */
25 
26 /*
27  * Just in case we're not in a build environment, make sure that
28  * TEXT_DOMAIN gets set to something.
29  */
30 #if !defined(TEXT_DOMAIN)
31 #define	TEXT_DOMAIN "SYS_TEST"
32 #endif
33 
34 /*
35  * soft partition operations
36  *
37  * Soft Partitions provide a virtual disk mechanism which is used to
38  * divide a large volume into many small pieces, each appearing as a
39  * separate device.  A soft partition consists of a series of extents,
40  * each having an offset and a length.  The extents are logically
41  * contiguous, so where the first extent leaves off the second extent
42  * picks up.  Which extent a given "virtual offset" belongs to is
43  * dependent on the size of all the previous extents in the soft
44  * partition.
45  *
46  * Soft partitions are represented in memory by an extent node
47  * (sp_ext_node_t) which contains all of the information necessary to
48  * create a unit structure and update the on-disk format, called
49  * "watermarks".  These extent nodes are typically kept in a doubly
50  * linked list and are manipulated by list manipulation routines.  A
51  * list of extents may represent all of the soft partitions on a volume,
52  * a single soft partition, or perhaps just a set of extents that need
53  * to be updated.  Extent lists may be sorted by extent or by name/seq#,
54  * depending on which compare function is used.  Most of the routines
55  * require the list be sorted by offset to work, and that's the typical
56  * configuration.
57  *
58  * In order to do an allocation, knowledge of all soft partitions on the
59  * volume is required.  Then free space is determined from the space
60  * that is not allocated, and new allocations can be made from the free
61  * space.  Once the new allocations are made, a unit structure is created
62  * and the watermarks are updated.  The status is then changed to "okay"
63  * on the unit structure to commit the transaction.  If updating the
64  * watermarks fails, the unit structure is in an intermediate state and
65  * the driver will not allow access to the device.
66  *
67  * A typical sequence of events is:
68  *     1. Fetch the list of names for all soft partitions on a volume
69  *         meta_sp_get_by_component()
70  *     2. Construct an extent list from the name list
71  *         meta_sp_extlist_from_namelist()
72  *     3. Fill the gaps in the extent list with free extents
73  *         meta_sp_list_freefill()
74  *     4. Allocate from the free extents
75  *         meta_sp_alloc_by_len()
76  *         meta_sp_alloc_by_list()
77  *     5. Create the unit structure from the extent list
78  *         meta_sp_createunit()
79  *         meta_sp_updateunit()
80  *     6. Write out the watermarks
81  *         meta_sp_update_wm()
82  *     7. Set the status to "Okay"
83  *         meta_sp_setstatus()
84  *
85  */
86 
87 #include <stdio.h>
88 #include <meta.h>
89 #include "meta_repartition.h"
90 #include <sys/lvm/md_sp.h>
91 #include <sys/lvm/md_crc.h>
92 #include <strings.h>
93 #include <sys/lvm/md_mirror.h>
94 #include <sys/bitmap.h>
95 
96 extern int	md_in_daemon;
97 
98 typedef struct sp_ext_node {
99 	struct sp_ext_node	*ext_next;	/* next element */
100 	struct sp_ext_node	*ext_prev;	/* previous element */
101 	sp_ext_type_t		ext_type;	/* type of extent */
102 	sp_ext_offset_t		ext_offset;	/* starting offset */
103 	sp_ext_length_t		ext_length;	/* length of this node */
104 	uint_t			ext_flags;	/* extent flags */
105 	uint32_t		ext_seq;	/* watermark seq no */
106 	mdname_t		*ext_namep;	/* name pointer */
107 	mdsetname_t		*ext_setp;	/* set pointer */
108 } sp_ext_node_t;
109 
110 /* extent flags */
111 #define	EXTFLG_UPDATE	(1)
112 
113 /* Extent node compare function for list sorting */
114 typedef int (*ext_cmpfunc_t)(sp_ext_node_t *, sp_ext_node_t *);
115 
116 
117 /* Function Prototypes */
118 
119 /* Debugging Functions */
120 static void meta_sp_debug(char *format, ...);
121 static void meta_sp_printunit(mp_unit_t *mp);
122 
123 /* Misc Support Functions */
124 int meta_sp_parsesize(char *s, sp_ext_length_t *szp);
125 static int meta_sp_parsesizestring(char *s, sp_ext_length_t *szp);
126 static int meta_sp_setgeom(mdname_t *np, mdname_t *compnp, mp_unit_t *mp,
127 	md_error_t *ep);
128 static int meta_sp_get_by_component(mdsetname_t *sp, mdname_t *compnp,
129     mdnamelist_t **nlpp, int force, md_error_t *ep);
130 static sp_ext_length_t meta_sp_get_default_alignment(mdsetname_t *sp,
131     mdname_t *compnp, md_error_t *ep);
132 
133 /* Extent List Manipulation Functions */
134 static int meta_sp_cmp_by_nameseq(sp_ext_node_t *e1, sp_ext_node_t *e2);
135 static int meta_sp_cmp_by_offset(sp_ext_node_t *e1, sp_ext_node_t *e2);
136 static void meta_sp_list_insert(mdsetname_t *sp, mdname_t *np,
137     sp_ext_node_t **head, sp_ext_offset_t offset, sp_ext_length_t length,
138     sp_ext_type_t type, uint_t seq, uint_t flags, ext_cmpfunc_t compare);
139 static void meta_sp_list_free(sp_ext_node_t **head);
140 static void meta_sp_list_remove(sp_ext_node_t **head, sp_ext_node_t *ext);
141 static sp_ext_length_t meta_sp_list_size(sp_ext_node_t *head,
142     sp_ext_type_t exttype, int exclude_wm);
143 static sp_ext_node_t *meta_sp_list_find(sp_ext_node_t *head,
144     sp_ext_offset_t offset);
145 static void meta_sp_list_freefill(sp_ext_node_t **extlist,
146     sp_ext_length_t size);
147 static void meta_sp_list_dump(sp_ext_node_t *head);
148 static int meta_sp_list_overlaps(sp_ext_node_t *head);
149 
150 /* Extent List Query Functions */
151 static boolean_t meta_sp_enough_space(int desired_number_of_sps,
152 	blkcnt_t desired_sp_size, sp_ext_node_t **extent_listpp,
153 	sp_ext_length_t alignment);
154 static boolean_t meta_sp_get_extent_list(mdsetname_t *mdsetnamep,
155 	mdname_t *device_mdnamep, sp_ext_node_t **extent_listpp,
156 	md_error_t *ep);
157 static boolean_t meta_sp_get_extent_list_for_drive(mdsetname_t *mdsetnamep,
158 	mddrivename_t *mddrivenamep, sp_ext_node_t **extent_listpp);
159 
160 
161 /* Extent Allocation Functions */
162 static void meta_sp_alloc_by_ext(mdsetname_t *sp, mdname_t *np,
163     sp_ext_node_t **extlist, sp_ext_node_t *free_ext,
164     sp_ext_offset_t alloc_offset, sp_ext_length_t alloc_length, uint_t seq);
165 static int meta_sp_alloc_by_len(mdsetname_t *sp, mdname_t *np,
166     sp_ext_node_t **extlist, sp_ext_length_t *lp,
167     sp_ext_offset_t last_off, sp_ext_length_t alignment);
168 static int meta_sp_alloc_by_list(mdsetname_t *sp, mdname_t *np,
169     sp_ext_node_t **extlist, sp_ext_node_t *oblist);
170 
171 /* Extent List Population Functions */
172 static int meta_sp_extlist_from_namelist(mdsetname_t *sp, mdnamelist_t *spnlp,
173     sp_ext_node_t **extlist, md_error_t *ep);
174 static int meta_sp_extlist_from_wm(mdsetname_t *sp, mdname_t *compnp,
175     sp_ext_node_t **extlist, ext_cmpfunc_t compare, md_error_t *ep);
176 
177 /* Print (metastat) Functions */
178 static int meta_sp_short_print(md_sp_t *msp, char *fname, FILE *fp,
179     mdprtopts_t options, md_error_t *ep);
180 static char *meta_sp_status_to_name(xsp_status_t xsp_status, uint_t tstate);
181 static int meta_sp_report(mdsetname_t *sp, md_sp_t *msp, mdnamelist_t **nlpp,
182     char *fname, FILE *fp, mdprtopts_t options, md_error_t *ep);
183 
184 /* Watermark Manipulation Functions */
185 static int meta_sp_update_wm(mdsetname_t *sp, md_sp_t *msp,
186     sp_ext_node_t *extlist, md_error_t *ep);
187 static int meta_sp_clear_wm(mdsetname_t *sp, md_sp_t *msp, md_error_t *ep);
188 static int meta_sp_read_wm(mdsetname_t *sp, mdname_t *compnp,
189     mp_watermark_t *wm, sp_ext_offset_t offset,  md_error_t *ep);
190 static diskaddr_t meta_sp_get_start(mdsetname_t *sp, mdname_t *compnp,
191     md_error_t *ep);
192 
193 /* Unit Structure Manipulation Functions */
194 static void meta_sp_fillextarray(mp_unit_t *mp, sp_ext_node_t *extlist);
195 static mp_unit_t *meta_sp_createunit(mdname_t *np, mdname_t *compnp,
196     sp_ext_node_t *extlist, int numexts, sp_ext_length_t len,
197     sp_status_t status, md_error_t *ep);
198 static mp_unit_t *meta_sp_updateunit(mdname_t *np,  mp_unit_t *old_un,
199     sp_ext_node_t *extlist, sp_ext_length_t grow_len, int numexts,
200     md_error_t *ep);
201 static int meta_create_sp(mdsetname_t *sp, md_sp_t *msp, sp_ext_node_t *oblist,
202     mdcmdopts_t options, sp_ext_length_t alignment, md_error_t *ep);
203 static int meta_check_sp(mdsetname_t *sp, md_sp_t *msp, mdcmdopts_t options,
204     int *repart_options, md_error_t *ep);
205 
206 /* Reset (metaclear) Functions */
207 static int meta_sp_reset_common(mdsetname_t *sp, mdname_t *np, md_sp_t *msp,
208     md_sp_reset_t reset_params, mdcmdopts_t options, md_error_t *ep);
209 
210 /* Recovery (metarecover) Functions */
211 static void meta_sp_display_exthdr(void);
212 static void meta_sp_display_ext(sp_ext_node_t *ext);
213 static int meta_sp_checkseq(sp_ext_node_t *extlist);
214 static int meta_sp_resolve_name_conflict(mdsetname_t *, mdname_t *,
215     mdname_t **, md_error_t *);
216 static int meta_sp_validate_wm(mdsetname_t *sp, mdname_t *np,
217     mdcmdopts_t options, md_error_t *ep);
218 static int meta_sp_validate_unit(mdsetname_t *sp, mdname_t *compnp,
219     mdcmdopts_t options, md_error_t *ep);
220 static int meta_sp_validate_wm_and_unit(mdsetname_t *sp, mdname_t *np,
221     mdcmdopts_t options, md_error_t *ep);
222 static int meta_sp_validate_exts(mdname_t *np, sp_ext_node_t *wmext,
223     sp_ext_node_t *unitext, md_error_t *ep);
224 static int meta_sp_recover_from_wm(mdsetname_t *sp, mdname_t *compnp,
225     mdcmdopts_t options, md_error_t *ep);
226 static int meta_sp_recover_from_unit(mdsetname_t *sp, mdname_t *np,
227     mdcmdopts_t options, md_error_t *ep);
228 
229 /*
230  * Private Constants
231  */
232 
233 static const int FORCE_RELOAD_CACHE = 1;
234 static const uint_t NO_FLAGS = 0;
235 static const sp_ext_offset_t NO_OFFSET = 0ULL;
236 static const uint_t NO_SEQUENCE_NUMBER = 0;
237 static const int ONE_SOFT_PARTITION = 1;
238 
239 static unsigned long *sp_parent_printed[MD_MAXSETS];
240 
241 #define	TEST_SOFT_PARTITION_NAMEP NULL
242 #define	TEST_SETNAMEP NULL
243 
244 #define	EXCLUDE_WM	(1)
245 #define	INCLUDE_WM	(0)
246 
247 #define	SP_UNALIGNED	(0LL)
248 
249 /*
250  * **************************************************************************
251  *                          Debugging Functions                             *
252  * **************************************************************************
253  */
254 
255 /*PRINTFLIKE1*/
256 static void
257 meta_sp_debug(char *format, ...)
258 {
259 	static int debug;
260 	static int debug_set = 0;
261 	va_list ap;
262 
263 	if (!debug_set) {
264 		debug = getenv(META_SP_DEBUG) ? 1 : 0;
265 		debug_set = 1;
266 	}
267 
268 	if (debug) {
269 		va_start(ap, format);
270 		(void) vfprintf(stderr, format, ap);
271 		va_end(ap);
272 	}
273 }
274 
275 static void
276 meta_sp_printunit(mp_unit_t *mp)
277 {
278 	int i;
279 
280 	if (mp == NULL)
281 		return;
282 
283 	/* print the common fields we know about */
284 	(void) fprintf(stderr, "\tmp->c.un_type: %d\n", mp->c.un_type);
285 	(void) fprintf(stderr, "\tmp->c.un_size: %u\n", mp->c.un_size);
286 	(void) fprintf(stderr, "\tmp->c.un_self_id: %lu\n", MD_SID(mp));
287 
288 	/* sp-specific fields */
289 	(void) fprintf(stderr, "\tmp->un_status: %u\n", mp->un_status);
290 	(void) fprintf(stderr, "\tmp->un_numexts: %u\n", mp->un_numexts);
291 	(void) fprintf(stderr, "\tmp->un_length: %llu\n", mp->un_length);
292 	(void) fprintf(stderr, "\tmp->un_dev(32): 0x%llx\n", mp->un_dev);
293 	(void) fprintf(stderr, "\tmp->un_dev(64): 0x%llx\n", mp->un_dev);
294 	(void) fprintf(stderr, "\tmp->un_key: %d\n", mp->un_key);
295 
296 	/* print extent information */
297 	(void) fprintf(stderr, "\tExt#\tvoff\t\tpoff\t\tLen\n");
298 	for (i = 0; i < mp->un_numexts; i++) {
299 		(void) fprintf(stderr, "\t%d\t%llu\t\t%llu\t\t%llu\n", i,
300 		    mp->un_ext[i].un_voff, mp->un_ext[i].un_poff,
301 		    mp->un_ext[i].un_len);
302 	}
303 }
304 
305 /*
306  * FUNCTION:    meta_sp_parsesize()
307  * INPUT:       s       - the string to parse
308  * OUTPUT:      *szp    - disk block count (0 for "all")
309  * RETURNS:     -1 for error, 0 for success
310  * PURPOSE:     parses the command line parameter that specifies the
311  *              requested size of a soft partition.  The input string
312  *              is either the literal "all" or a numeric value
313  *              followed by a single character, b for disk blocks, k
314  *              for kilobytes, m for megabytes, g for gigabytes, or t
315  *              for terabytes.  p for petabytes and e for exabytes
316  *              have been added as undocumented features for future
317  *              expansion.  For example, 100m is 100 megabytes, while
318  *              50g is 50 gigabytes.  All values are rounded up to the
319  *              nearest block size.
320  */
321 int
322 meta_sp_parsesize(char *s, sp_ext_length_t *szp)
323 {
324 	if (s == NULL || szp == NULL) {
325 		return (-1);
326 	}
327 
328 	/* Check for literal "all" */
329 	if (strcasecmp(s, "all") == 0) {
330 		*szp = 0;
331 		return (0);
332 	}
333 
334 	return (meta_sp_parsesizestring(s, szp));
335 }
336 
337 /*
338  * FUNCTION:	meta_sp_parsesizestring()
339  * INPUT:	s	- the string to parse
340  * OUTPUT:	*szp	- disk block count
341  * RETURNS:	-1 for error, 0 for success
342  * PURPOSE:	parses a string that specifies size. The input string is a
343  *		numeric value followed by a single character, b for disk blocks,
344  *		k for kilobytes, m for megabytes, g for gigabytes, or t for
345  *		terabytes.  p for petabytes and e for exabytes have been added
346  *		as undocumented features for future expansion.  For example,
347  *		100m is 100 megabytes, while 50g is 50 gigabytes.  All values
348  *		are rounded up to the nearest block size.
349  */
350 static int
351 meta_sp_parsesizestring(char *s, sp_ext_length_t *szp)
352 {
353 	sp_ext_length_t	len = 0;
354 	char		len_type[2];
355 
356 	if (s == NULL || szp == NULL) {
357 		return (-1);
358 	}
359 
360 	/*
361 	 * make sure block offset does not overflow 2^64 bytes.
362 	 */
363 	if ((sscanf(s, "%llu%1[BbKkMmGgTt]", &len, len_type) != 2) ||
364 	    (len == 0LL) ||
365 	    (len > (1LL << (64 - DEV_BSHIFT))))
366 		return (-1);
367 
368 	switch (len_type[0]) {
369 	case 'B':
370 	case 'b':
371 		len = lbtodb(roundup(len * DEV_BSIZE, DEV_BSIZE));
372 		break;
373 	case 'K':
374 	case 'k':
375 		len = lbtodb(roundup(len * 1024ULL, DEV_BSIZE));
376 		break;
377 	case 'M':
378 	case 'm':
379 		len = lbtodb(roundup(len * 1024ULL*1024ULL, DEV_BSIZE));
380 		break;
381 	case 'g':
382 	case 'G':
383 		len = lbtodb(roundup(len * 1024ULL*1024ULL*1024ULL, DEV_BSIZE));
384 		break;
385 	case 't':
386 	case 'T':
387 		len = lbtodb(roundup(len * 1024ULL*1024ULL*1024ULL*1024ULL,
388 		    DEV_BSIZE));
389 		break;
390 	case 'p':
391 	case 'P':
392 		len = lbtodb(roundup(
393 		    len * 1024ULL*1024ULL*1024ULL*1024ULL*1024ULL,
394 		    DEV_BSIZE));
395 		break;
396 	case 'e':
397 	case 'E':
398 		len = lbtodb(roundup(
399 		    len * 1024ULL*1024ULL*1024ULL*1024ULL*1024ULL*1024ULL,
400 		    DEV_BSIZE));
401 		break;
402 	default:
403 		/* error */
404 		return (-1);
405 	}
406 
407 	*szp = len;
408 	return (0);
409 }
410 
411 /*
412  * FUNCTION:	meta_sp_setgeom()
413  * INPUT:	np      - the underlying device to setup geometry for
414  *		compnp	- the underlying device to setup geometry for
415  *		mp	- the unit structure to set the geometry for
416  * OUTPUT:	ep	- return error pointer
417  * RETURNS:	int	- -1 if error, 0 otherwise
418  * PURPOSE:	establishes geometry information for a device
419  */
420 static int
421 meta_sp_setgeom(
422 	mdname_t	*np,
423 	mdname_t	*compnp,
424 	mp_unit_t	*mp,
425 	md_error_t	*ep
426 )
427 {
428 	mdgeom_t	*geomp;
429 	uint_t		round_cyl = 0;
430 
431 	if ((geomp = metagetgeom(compnp, ep)) == NULL)
432 		return (-1);
433 	if (meta_setup_geom((md_unit_t *)mp, np, geomp, geomp->write_reinstruct,
434 	    geomp->read_reinstruct, round_cyl, ep) != 0)
435 		return (-1);
436 
437 	return (0);
438 }
439 
440 /*
441  * FUNCTION:	meta_sp_setstatus()
442  * INPUT:	sp	- the set name for the devices to set the status on
443  *		minors	- an array of minor numbers of devices to set status on
444  *		num_units - number of entries in the array
445  *		status	- status value to set all units to
446  * OUTPUT:	ep	- return error pointer
447  * RETURNS:	int	- -1 if error, 0 success
448  * PURPOSE:	sets the status of one or more soft partitions to the
449  *		requested value
450  */
451 int
452 meta_sp_setstatus(
453 	mdsetname_t	*sp,
454 	minor_t		*minors,
455 	int		num_units,
456 	sp_status_t	status,
457 	md_error_t	*ep
458 )
459 {
460 	md_sp_statusset_t	status_params;
461 
462 	assert(minors != NULL);
463 
464 	/* update status of all soft partitions to the status passed in */
465 	(void) memset(&status_params, 0, sizeof (status_params));
466 	status_params.num_units = num_units;
467 	status_params.new_status = status;
468 	status_params.size = num_units * sizeof (minor_t);
469 	status_params.minors = (uintptr_t)minors;
470 	MD_SETDRIVERNAME(&status_params, MD_SP, sp->setno);
471 	if (metaioctl(MD_IOC_SPSTATUS, &status_params, &status_params.mde,
472 	    NULL) != 0) {
473 		(void) mdstealerror(ep, &status_params.mde);
474 		return (-1);
475 	}
476 	return (0);
477 }
478 
479 /*
480  * FUNCTION:	meta_get_sp_names()
481  * INPUT:	sp	- the set name to get soft partitions from
482  *		options	- options from the command line
483  * OUTPUT:	nlpp	- list of all soft partition names
484  *		ep	- return error pointer
485  * RETURNS:	int	- -1 if error, 0 success
486  * PURPOSE:	returns a list of all soft partitions in the metadb
487  *		for all devices in the specified set
488  */
489 int
490 meta_get_sp_names(
491 	mdsetname_t	*sp,
492 	mdnamelist_t	**nlpp,
493 	int		options,
494 	md_error_t	*ep
495 )
496 {
497 	return (meta_get_names(MD_SP, sp, nlpp, options, ep));
498 }
499 
500 /*
501  * FUNCTION:	meta_get_by_component()
502  * INPUT:	sp	- the set name to get soft partitions from
503  *		compnp	- the name of the device containing the soft
504  *			  partitions that will be returned
505  *		force	- 0 - reads cached namelist if available,
506  *			  1 - reloads cached namelist, frees old namelist
507  * OUTPUT:	nlpp	- list of all soft partition names
508  *		ep	- return error pointer
509  * RETURNS:	int	- -1 error, otherwise the number of soft partitions
510  *			  found on the component (0 = none found).
511  * PURPOSE:	returns a list of all soft partitions on a given device
512  *		from the metadb information
513  */
514 static int
515 meta_sp_get_by_component(
516 	mdsetname_t	*sp,
517 	mdname_t	*compnp,
518 	mdnamelist_t	**nlpp,
519 	int		force,
520 	md_error_t	*ep
521 )
522 {
523 	static mdnamelist_t	*cached_list = NULL;	/* cached namelist */
524 	static int		cached_count = 0;	/* cached count */
525 	mdnamelist_t		*spnlp = NULL;		/* all sp names */
526 	mdnamelist_t		*namep;			/* list iterator */
527 	mdnamelist_t		**tailpp = nlpp;	/* namelist tail */
528 	mdnamelist_t		**cachetailpp;		/* cache tail */
529 	md_sp_t			*msp;			/* unit structure */
530 	int			count = 0;		/* count of sp's */
531 	int			err;
532 	mdname_t		*curnp;
533 
534 	if ((cached_list != NULL) && (!force)) {
535 		/* return a copy of the cached list */
536 		for (namep = cached_list; namep != NULL; namep = namep->next)
537 			tailpp = meta_namelist_append_wrapper(tailpp,
538 			    namep->namep);
539 		return (cached_count);
540 	}
541 
542 	/* free the cache and reset values to zeros to prepare for a new list */
543 	metafreenamelist(cached_list);
544 	cached_count = 0;
545 	cached_list = NULL;
546 	cachetailpp = &cached_list;
547 	*nlpp = NULL;
548 
549 	/* get all the softpartitions first of all */
550 	if (meta_get_sp_names(sp, &spnlp, 0, ep) < 0)
551 		return (-1);
552 
553 	/*
554 	 * Now for each sp, see if it resides on the component we
555 	 * are interested in, if so then add it to our list
556 	 */
557 	for (namep = spnlp; namep != NULL; namep = namep->next) {
558 		curnp = namep->namep;
559 
560 		/* get the unit structure */
561 		if ((msp = meta_get_sp_common(sp, curnp, 0, ep)) == NULL)
562 			continue;
563 
564 		/*
565 		 * If the current soft partition is not on the same
566 		 * component, continue the search.  If it is on the same
567 		 * component, add it to our namelist.
568 		 */
569 		err = meta_check_samedrive(compnp, msp->compnamep, ep);
570 		if (err <= 0) {
571 			/* not on the same device, check the next one */
572 			continue;
573 		}
574 
575 		/* it's on the same drive */
576 
577 		/*
578 		 * Check for overlapping partitions if the component is not
579 		 * a metadevice.
580 		 */
581 		if (!metaismeta(msp->compnamep)) {
582 			/*
583 			 * if they're on the same drive, neither
584 			 * should be a metadevice if one isn't
585 			 */
586 			assert(!metaismeta(compnp));
587 
588 			if (meta_check_overlap(msp->compnamep->cname,
589 			    compnp, 0, -1, msp->compnamep, 0, -1, ep) == 0)
590 				continue;
591 
592 			/* in this case it's not an error for them to overlap */
593 			mdclrerror(ep);
594 		}
595 
596 		/* Component is on the same device, add to the used list */
597 		tailpp = meta_namelist_append_wrapper(tailpp, curnp);
598 		cachetailpp = meta_namelist_append_wrapper(cachetailpp,
599 		    curnp);
600 
601 		++count;
602 		++cached_count;
603 	}
604 
605 	assert(count == cached_count);
606 	return (count);
607 
608 out:
609 	metafreenamelist(*nlpp);
610 	*nlpp = NULL;
611 	return (-1);
612 }
613 
614 /*
615  * FUNCTION:    meta_sp_get_default_alignment()
616  * INPUT:       sp      - the pertinent set name
617  *              compnp  - the name of the underlying component
618  * OUTPUT:      ep      - return error pointer
619  * RETURNS:     sp_ext_length_t =0: no default alignment
620  *                              >0: default alignment
621  * PURPOSE:     returns the default alignment for soft partitions to
622  *              be built on top of the specified component or
623  *              metadevice
624  */
625 static sp_ext_length_t
626 meta_sp_get_default_alignment(
627 	mdsetname_t	*sp,
628 	mdname_t	*compnp,
629 	md_error_t	*ep
630 )
631 {
632 	sp_ext_length_t	a = SP_UNALIGNED;
633 	char		*mname;
634 
635 	assert(compnp != NULL);
636 
637 	/*
638 	 * We treat raw devices as opaque, and assume nothing about
639 	 * their alignment requirements.
640 	 */
641 	if (!metaismeta(compnp))
642 		return (SP_UNALIGNED);
643 
644 	/*
645 	 * We already know it's a metadevice from the previous test;
646 	 * metagetmiscname() will tell us which metadevice type we
647 	 * have
648 	 */
649 	mname = metagetmiscname(compnp, ep);
650 	if (mname == NULL)
651 		goto out;
652 
653 	/*
654 	 * For a mirror, we want to deal with the stripe that is the
655 	 * primary side.  If it happens to be asymmetrically
656 	 * configured, there is no simple way to fake a universal
657 	 * alignment.  There's a chance that the least common
658 	 * denominator of the set of interlaces from all stripes of
659 	 * all submirrors would do it, but nobody that really cared
660 	 * that much about this issue would create an asymmetric
661 	 * config to start with.
662 	 *
663 	 * If the component underlying the soft partition is a mirror,
664 	 * then at the exit of this loop, compnp will have been
665 	 * updated to describe the first active submirror.
666 	 */
667 	if (strcmp(mname, MD_MIRROR) == 0) {
668 		md_mirror_t	*mp;
669 		int		smi;
670 		md_submirror_t	*smp;
671 
672 		mp = meta_get_mirror(sp, compnp, ep);
673 		if (mp == NULL)
674 			goto out;
675 
676 		for (smi = 0; smi < NMIRROR; smi++) {
677 
678 			smp = &mp->submirrors[smi];
679 			if (smp->state == SMS_UNUSED)
680 				continue;
681 
682 			compnp = smp->submirnamep;
683 			assert(compnp != NULL);
684 
685 			mname = metagetmiscname(compnp, ep);
686 			if (mname == NULL)
687 				goto out;
688 
689 			break;
690 		}
691 
692 		if (smi == NMIRROR)
693 			goto out;
694 	}
695 
696 	/*
697 	 * Handle stripes and submirrors identically; just return the
698 	 * interlace of the first row.
699 	 */
700 	if (strcmp(mname, MD_STRIPE) == 0) {
701 		md_stripe_t	*stp;
702 
703 		stp = meta_get_stripe(sp, compnp, ep);
704 		if (stp == NULL)
705 			goto out;
706 
707 		a = stp->rows.rows_val[0].interlace;
708 		goto out;
709 	}
710 
711 	/*
712 	 * Raid is even more straightforward; the interlace applies to
713 	 * the entire device.
714 	 */
715 	if (strcmp(mname, MD_RAID) == 0) {
716 		md_raid_t	*rp;
717 
718 		rp = meta_get_raid(sp, compnp, ep);
719 		if (rp == NULL)
720 			goto out;
721 
722 		a = rp->interlace;
723 		goto out;
724 	}
725 
726 	/*
727 	 * If we have arrived here with the alignment still not set,
728 	 * then we expect the error to have been set by one of the
729 	 * routines we called.  If neither is the case, something has
730 	 * really gone wrong above.  (Probably the submirror walk
731 	 * failed to produce a valid submirror, but that would be
732 	 * really bad...)
733 	 */
734 out:
735 	meta_sp_debug("meta_sp_get_default_alignment: miscname %s, "
736 	    "alignment %lld\n", (mname == NULL) ? "NULL" : mname, a);
737 
738 	if (getenv(META_SP_DEBUG) && !mdisok(ep)) {
739 		mde_perror(ep, NULL);
740 	}
741 
742 	assert((a > 0) || (!mdisok(ep)));
743 
744 	return (a);
745 }
746 
747 
748 
749 /*
750  * FUNCTION:	meta_check_insp()
751  * INPUT:	sp	- the set name for the device to check
752  *		np	- the name of the device to check
753  *		slblk	- the starting offset of the device to check
754  *		nblks	- the number of blocks in the device to check
755  * OUTPUT:	ep	- return error pointer
756  * RETURNS:	int	-  0 - device contains soft partitions
757  *			  -1 - device does not contain soft partitions
758  * PURPOSE:	determines whether a device contains any soft partitions
759  */
760 /* ARGSUSED */
761 int
762 meta_check_insp(
763 	mdsetname_t	*sp,
764 	mdname_t	*np,
765 	diskaddr_t	slblk,
766 	diskaddr_t	nblks,
767 	md_error_t	*ep
768 )
769 {
770 	mdnamelist_t	*spnlp = NULL;	/* soft partition name list */
771 	int		count;
772 	int		rval;
773 
774 	/* check set pointer */
775 	assert(sp != NULL);
776 
777 	/*
778 	 * Get a list of the soft partitions that currently reside on
779 	 * the component.  We should ALWAYS force reload the cache,
780 	 * because if we're using the md.tab, we must rebuild
781 	 * the list because it won't contain the previous (if any)
782 	 * soft partition.
783 	 */
784 	/* find all soft partitions on the component */
785 	count = meta_sp_get_by_component(sp, np, &spnlp, 1, ep);
786 
787 	if (count == -1) {
788 		rval = -1;
789 	} else if (count > 0) {
790 		rval = mduseerror(ep, MDE_ALREADY, np->dev,
791 		    spnlp->namep->cname, np->cname);
792 	} else {
793 		rval = 0;
794 	}
795 
796 	metafreenamelist(spnlp);
797 	return (rval);
798 }
799 
800 /*
801  * **************************************************************************
802  *                    Extent List Manipulation Functions                    *
803  * **************************************************************************
804  */
805 
806 /*
807  * FUNCTION:	meta_sp_cmp_by_nameseq()
808  * INPUT:	e1	- first node to compare
809  *		e2	- second node to compare
810  * OUTPUT:	none
811  * RETURNS:	int	- =0 - nodes are equal
812  *			  <0 - e1 should go before e2
813  *			  >0 - e1 should go after e2
814  * PURPOSE:	used for sorted list inserts to build a list sorted by
815  *		name first and sequence number second.
816  */
817 static int
818 meta_sp_cmp_by_nameseq(sp_ext_node_t *e1, sp_ext_node_t *e2)
819 {
820 	int rval;
821 
822 	if (e1->ext_namep == NULL)
823 		return (1);
824 	if (e2->ext_namep == NULL)
825 		return (-1);
826 	if ((rval = strcmp(e1->ext_namep->cname, e2->ext_namep->cname)) != 0)
827 		return (rval);
828 
829 	/* the names are equal, compare sequence numbers */
830 	if (e1->ext_seq > e2->ext_seq)
831 		return (1);
832 	if (e1->ext_seq < e2->ext_seq)
833 		return (-1);
834 	/* sequence numbers are also equal */
835 	return (0);
836 }
837 
838 /*
839  * FUNCTION:	meta_sp_cmp_by_offset()
840  * INPUT:	e1	- first node to compare
841  *		e2	- second node to compare
842  * OUTPUT:	none
843  * RETURNS:	int	- =0 - nodes are equal
844  *			  <0 - e1 should go before e2
845  *			  >0 - e1 should go after e2
846  * PURPOSE:	used for sorted list inserts to build a list sorted by offset
847  */
848 static int
849 meta_sp_cmp_by_offset(sp_ext_node_t *e1, sp_ext_node_t *e2)
850 {
851 	if (e1->ext_offset > e2->ext_offset)
852 		return (1);
853 	if (e1->ext_offset < e2->ext_offset)
854 		return (-1);
855 	/* offsets are equal */
856 	return (0);
857 }
858 
859 /*
860  * FUNCTION:	meta_sp_list_insert()
861  * INPUT:	sp	- the set name for the device the node belongs to
862  *		np	- the name of the device the node belongs to
863  *		head	- the head of the list, must be NULL for empty list
864  *		offset	- the physical offset of this extent in sectors
865  *		length	- the length of this extent in sectors
866  *		type	- the type of the extent being inserted
867  *		seq	- the sequence number of the extent being inserted
868  *		flags	- extent flags (eg. whether it needs to be updated)
869  *		compare	- the compare function to use
870  * OUTPUT:	head	- points to the new head if a node was inserted
871  *			  at the beginning
872  * RETURNS:	void
873  * PURPOSE:	inserts an extent node into a sorted doubly linked list.
874  *		The sort order is determined by the compare function.
875  *		Memory is allocated for the node in this function and it
876  *		is up to the caller to free it, possibly using
877  *		meta_sp_list_free().  If a node is inserted at the
878  *		beginning of the list, the head pointer is updated to
879  *		point to the new first node.
880  */
881 static void
882 meta_sp_list_insert(
883 	mdsetname_t	*sp,
884 	mdname_t	*np,
885 	sp_ext_node_t	**head,
886 	sp_ext_offset_t	offset,
887 	sp_ext_length_t	length,
888 	sp_ext_type_t	type,
889 	uint_t		seq,
890 	uint_t		flags,
891 	ext_cmpfunc_t	compare
892 )
893 {
894 	sp_ext_node_t	*newext;
895 	sp_ext_node_t	*curext;
896 
897 	assert(head != NULL);
898 
899 	/* Don't bother adding zero length nodes */
900 	if (length == 0ULL)
901 		return;
902 
903 	/* allocate and fill in new ext_node */
904 	newext = Zalloc(sizeof (sp_ext_node_t));
905 
906 	newext->ext_offset = offset;
907 	newext->ext_length = length;
908 	newext->ext_flags = flags;
909 	newext->ext_type = type;
910 	newext->ext_seq = seq;
911 	newext->ext_setp = sp;
912 	newext->ext_namep = np;
913 
914 	/* first node in the list */
915 	if (*head == NULL) {
916 		newext->ext_next = newext->ext_prev = NULL;
917 		*head = newext;
918 	} else if ((*compare)(*head, newext) >= 0) {
919 		/* the first node has a bigger offset, so insert before it */
920 		assert((*head)->ext_prev == NULL);
921 
922 		newext->ext_prev = NULL;
923 		newext->ext_next = *head;
924 		(*head)->ext_prev = newext;
925 		*head = newext;
926 	} else {
927 		/*
928 		 * find the next node whose offset is greater than
929 		 * the one we want to insert, or the end of the list.
930 		 */
931 		for (curext = *head;
932 		    (curext->ext_next != NULL) &&
933 		    ((*compare)(curext->ext_next, newext) < 0);
934 		    (curext = curext->ext_next))
935 			;
936 
937 		/* link the new node in after the current node */
938 		newext->ext_next = curext->ext_next;
939 		newext->ext_prev = curext;
940 
941 		if (curext->ext_next != NULL)
942 			curext->ext_next->ext_prev = newext;
943 
944 		curext->ext_next = newext;
945 	}
946 }
947 
948 /*
949  * FUNCTION:	meta_sp_list_free()
950  * INPUT:	head	- the head of the list, must be NULL for empty list
951  * OUTPUT:	head	- points to NULL on return
952  * RETURNS:	void
953  * PURPOSE:	walks a double linked extent list and frees each node
954  */
955 static void
956 meta_sp_list_free(sp_ext_node_t **head)
957 {
958 	sp_ext_node_t	*ext;
959 	sp_ext_node_t	*next;
960 
961 	assert(head != NULL);
962 
963 	ext = *head;
964 	while (ext) {
965 		next = ext->ext_next;
966 		Free(ext);
967 		ext = next;
968 	}
969 	*head = NULL;
970 }
971 
972 /*
973  * FUNCTION:	meta_sp_list_remove()
974  * INPUT:	head	- the head of the list, must be NULL for empty list
975  *		ext	- the extent to remove, must be a member of the list
976  * OUTPUT:	head	- points to the new head of the list
977  * RETURNS:	void
978  * PURPOSE:	unlinks the node specified by ext from the list and
979  *		frees it, possibly moving the head pointer forward if
980  *		the head is the node being removed.
981  */
982 static void
983 meta_sp_list_remove(sp_ext_node_t **head, sp_ext_node_t *ext)
984 {
985 	assert(head != NULL);
986 	assert(*head != NULL);
987 
988 	if (*head == ext)
989 		*head = ext->ext_next;
990 
991 	if (ext->ext_prev != NULL)
992 		ext->ext_prev->ext_next = ext->ext_next;
993 	if (ext->ext_next != NULL)
994 		ext->ext_next->ext_prev = ext->ext_prev;
995 	Free(ext);
996 }
997 
998 /*
999  * FUNCTION:	meta_sp_list_size()
1000  * INPUT:	head	- the head of the list, must be NULL for empty list
1001  *		exttype	- the type of the extents to sum
1002  *		exclude_wm - subtract space for extent headers from total
1003  * OUTPUT:	none
1004  * RETURNS:	sp_ext_length_t	- the sum of all of the lengths
1005  * PURPOSE:	sums the lengths of all extents in the list matching the
1006  *		specified type.  This could be used for computing the
1007  *		amount of free or used space, for example.
1008  */
1009 static sp_ext_length_t
1010 meta_sp_list_size(sp_ext_node_t *head, sp_ext_type_t exttype, int exclude_wm)
1011 {
1012 	sp_ext_node_t	*ext;
1013 	sp_ext_length_t	size = 0LL;
1014 
1015 	for (ext = head; ext != NULL; ext = ext->ext_next)
1016 		if (ext->ext_type == exttype)
1017 			size += ext->ext_length -
1018 			    ((exclude_wm) ? MD_SP_WMSIZE : 0);
1019 
1020 	return (size);
1021 }
1022 
1023 /*
1024  * FUNCTION:	meta_sp_list_find()
1025  * INPUT:	head	- the head of the list, must be NULL for empty list
1026  *		offset	- the offset contained by the node to find
1027  * OUTPUT:	none
1028  * RETURNS:	sp_ext_node_t *	- the node containing the requested offset
1029  *				  or NULL if no such nodes were found.
1030  * PURPOSE:	finds a node in a list containing the requested offset
1031  *		(inclusive).  If multiple nodes contain this offset then
1032  *		only the first will be returned, though typically these
1033  *		lists are managed with non-overlapping nodes.
1034  *
1035  *		*The list MUST be sorted by offset for this function to work.*
1036  */
1037 static sp_ext_node_t *
1038 meta_sp_list_find(
1039 	sp_ext_node_t	*head,
1040 	sp_ext_offset_t	offset
1041 )
1042 {
1043 	sp_ext_node_t	*ext;
1044 
1045 	for (ext = head; ext != NULL; ext = ext->ext_next) {
1046 		/* check if the offset lies within this extent */
1047 		if ((offset >= ext->ext_offset) &&
1048 		    (offset < ext->ext_offset + ext->ext_length)) {
1049 			/*
1050 			 * the requested extent should always be a
1051 			 * subset of an extent in the list.
1052 			 */
1053 			return (ext);
1054 		}
1055 	}
1056 	return (NULL);
1057 }
1058 
1059 /*
1060  * FUNCTION:	meta_sp_list_freefill()
1061  * INPUT:	head	- the head of the list, must be NULL for empty list
1062  *		size	- the size of the volume this extent list is
1063  *			  representing
1064  * OUTPUT:	head	- the new head of the list
1065  * RETURNS:	void
1066  * PURPOSE:	finds gaps in the extent list and fills them with a free
1067  *		node.  If there is a gap at the beginning the head
1068  *		pointer will be changed to point to the new free node.
1069  *		If there is free space at the end, the last free extent
1070  *		will extend all the way out to the size specified.
1071  *
1072  *		*The list MUST be sorted by offset for this function to work.*
1073  */
1074 static void
1075 meta_sp_list_freefill(
1076 	sp_ext_node_t	**head,
1077 	sp_ext_length_t	size
1078 )
1079 {
1080 	sp_ext_node_t	*ext;
1081 	sp_ext_offset_t	curoff = 0LL;
1082 
1083 	for (ext = *head; ext != NULL; ext = ext->ext_next) {
1084 		if (curoff < ext->ext_offset)
1085 			meta_sp_list_insert(NULL, NULL, head,
1086 			    curoff, ext->ext_offset - curoff,
1087 			    EXTTYP_FREE, 0, 0, meta_sp_cmp_by_offset);
1088 		curoff = ext->ext_offset + ext->ext_length;
1089 	}
1090 
1091 	/* pad inverse list out to the end */
1092 	if (curoff < size)
1093 		meta_sp_list_insert(NULL, NULL, head, curoff, size - curoff,
1094 		    EXTTYP_FREE, 0, 0, meta_sp_cmp_by_offset);
1095 
1096 	if (getenv(META_SP_DEBUG)) {
1097 		meta_sp_debug("meta_sp_list_freefill: Extent list with "
1098 		    "holes freefilled:\n");
1099 		meta_sp_list_dump(*head);
1100 	}
1101 }
1102 
1103 /*
1104  * FUNCTION:	meta_sp_list_dump()
1105  * INPUT:	head	- the head of the list, must be NULL for empty list
1106  * OUTPUT:	none
1107  * RETURNS:	void
1108  * PURPOSE:	dumps the entire extent list to stdout for easy debugging
1109  */
1110 static void
1111 meta_sp_list_dump(sp_ext_node_t *head)
1112 {
1113 	sp_ext_node_t	*ext;
1114 
1115 	meta_sp_debug("meta_sp_list_dump: dumping extent list:\n");
1116 	meta_sp_debug("%5s %10s %5s %7s %10s %10s %5s %10s %10s\n", "Name",
1117 	    "Addr", "Seq#", "Type", "Offset", "Length", "Flags", "Prev",
1118 	    "Next");
1119 	for (ext = head; ext != NULL; ext = ext->ext_next) {
1120 		if (ext->ext_namep != NULL)
1121 			meta_sp_debug("%5s", ext->ext_namep->cname);
1122 		else
1123 			meta_sp_debug("%5s", "NONE");
1124 
1125 		meta_sp_debug("%10p %5u ", (void *) ext, ext->ext_seq);
1126 		switch (ext->ext_type) {
1127 		case EXTTYP_ALLOC:
1128 			meta_sp_debug("%7s ", "ALLOC");
1129 			break;
1130 		case EXTTYP_FREE:
1131 			meta_sp_debug("%7s ", "FREE");
1132 			break;
1133 		case EXTTYP_END:
1134 			meta_sp_debug("%7s ", "END");
1135 			break;
1136 		case EXTTYP_RESERVED:
1137 			meta_sp_debug("%7s ", "RESV");
1138 			break;
1139 		default:
1140 			meta_sp_debug("%7s ", "INVLD");
1141 			break;
1142 		}
1143 
1144 		meta_sp_debug("%10llu %10llu %5u %10p %10p\n",
1145 		    ext->ext_offset, ext->ext_length,
1146 		    ext->ext_flags, (void *) ext->ext_prev,
1147 		    (void *) ext->ext_next);
1148 	}
1149 	meta_sp_debug("\n");
1150 }
1151 
1152 /*
1153  * FUNCTION:	meta_sp_list_overlaps()
1154  * INPUT:	head	- the head of the list, must be NULL for empty list
1155  * OUTPUT:	none
1156  * RETURNS:	int	- 1 if extents overlap, 0 if ok
1157  * PURPOSE:	checks a list for overlaps.  The list MUST be sorted by
1158  *		offset for this function to work properly.
1159  */
1160 static int
1161 meta_sp_list_overlaps(sp_ext_node_t *head)
1162 {
1163 	sp_ext_node_t	*ext;
1164 
1165 	for (ext = head; ext->ext_next != NULL; ext = ext->ext_next) {
1166 		if (ext->ext_offset + ext->ext_length >
1167 		    ext->ext_next->ext_offset)
1168 			return (1);
1169 	}
1170 	return (0);
1171 }
1172 
1173 /*
1174  * **************************************************************************
1175  *                        Extent Allocation Functions                       *
1176  * **************************************************************************
1177  */
1178 
1179 /*
1180  * FUNCTION:	meta_sp_alloc_by_ext()
1181  * INPUT:	sp	- the set name for the device the node belongs to
1182  *		np	- the name of the device the node belongs to
1183  *		head	- the head of the list, must be NULL for empty list
1184  *		free_ext	- the free extent being allocated from
1185  *		alloc_offset	- the offset of the allocation
1186  *		alloc_len	- the length of the allocation
1187  *		seq		- the sequence number of the allocation
1188  * OUTPUT:	head	- the new head pointer
1189  * RETURNS:	void
1190  * PURPOSE:	allocates a portion of the free extent free_ext.  The
1191  *		allocated portion starts at alloc_offset and is
1192  *		alloc_length long.  Both (alloc_offset) and (alloc_offset +
1193  *		alloc_length) must be contained within the free extent.
1194  *
1195  *		The free extent is split into as many as 3 pieces - a
1196  *		free extent containing [ free_offset .. alloc_offset ), an
1197  *		allocated extent containing the range [ alloc_offset ..
1198  *		alloc_end ], and another free extent containing the
1199  *		range ( alloc_end .. free_end ].  If either of the two
1200  *		new free extents would be zero length, they are not created.
1201  *
1202  *		Finally, the original free extent is removed.  All newly
1203  *		created extents have the EXTFLG_UPDATE flag set.
1204  */
1205 static void
1206 meta_sp_alloc_by_ext(
1207 	mdsetname_t	*sp,
1208 	mdname_t	*np,
1209 	sp_ext_node_t	**head,
1210 	sp_ext_node_t	*free_ext,
1211 	sp_ext_offset_t	alloc_offset,
1212 	sp_ext_length_t	alloc_length,
1213 	uint_t		seq
1214 )
1215 {
1216 	sp_ext_offset_t	free_offset = free_ext->ext_offset;
1217 	sp_ext_length_t	free_length = free_ext->ext_length;
1218 
1219 	sp_ext_offset_t	alloc_end = alloc_offset + alloc_length;
1220 	sp_ext_offset_t	free_end  = free_offset  + free_length;
1221 
1222 	/* allocated extent must be a subset of the free extent */
1223 	assert(free_offset <= alloc_offset);
1224 	assert(free_end >= alloc_end);
1225 
1226 	meta_sp_list_remove(head, free_ext);
1227 
1228 	if (free_offset < alloc_offset) {
1229 		meta_sp_list_insert(NULL, NULL, head, free_offset,
1230 		    (alloc_offset - free_offset), EXTTYP_FREE, 0,
1231 		    EXTFLG_UPDATE, meta_sp_cmp_by_offset);
1232 	}
1233 
1234 	if (free_end > alloc_end) {
1235 		meta_sp_list_insert(NULL, NULL, head, alloc_end,
1236 		    (free_end - alloc_end), EXTTYP_FREE, 0, EXTFLG_UPDATE,
1237 		    meta_sp_cmp_by_offset);
1238 	}
1239 
1240 	meta_sp_list_insert(sp, np, head, alloc_offset, alloc_length,
1241 	    EXTTYP_ALLOC, seq, EXTFLG_UPDATE, meta_sp_cmp_by_offset);
1242 
1243 	if (getenv(META_SP_DEBUG)) {
1244 		meta_sp_debug("meta_sp_alloc_by_ext: extent list:\n");
1245 		meta_sp_list_dump(*head);
1246 	}
1247 }
1248 
1249 /*
1250  * FUNCTION:	meta_sp_alloc_by_len()
1251  * INPUT:	sp	- the set name for the device the node belongs to
1252  *		np	- the name of the device the node belongs to
1253  *		head	- the head of the list, must be NULL for empty list
1254  *		*lp	- the requested length to allocate
1255  *		last_off	- the last offset already allocated.
1256  *		alignment	- the desired extent alignmeent
1257  * OUTPUT:	head	- the new head pointer
1258  *		*lp	- the length allocated
1259  * RETURNS:	int	- -1 if error, the number of new extents on success
1260  * PURPOSE:	allocates extents from free space to satisfy the requested
1261  *		length.  If requested length is zero, allocates all
1262  *		remaining free space.  This function provides the meat
1263  *		of the extent allocation algorithm.  Allocation is a
1264  *		three tier process:
1265  *
1266  *		1. If last_off is nonzero and there is free space following
1267  *		   that node, then it is extended to allocate as much of that
1268  *		   free space as possible.  This is useful for metattach.
1269  *		2. If a free extent can be found to satisfy the remaining
1270  *		   requested space, then satisfy the rest of the request
1271  *		   from that extent.
1272  *		3. Start allocating space from any remaining free extents until
1273  *		   the remainder of the request is satisified.
1274  *
1275  *              If alignment is non-zero, then every extent modified
1276  *              or newly allocated will be aligned modulo alignment,
1277  *              with a length that is an integer multiple of
1278  *              alignment.
1279  *
1280  *		The EXTFLG_UPDATE flag is set for all nodes (free and
1281  *		allocated) that require updated watermarks.
1282  *
1283  *		This algorithm may have a negative impact on fragmentation
1284  *		in pathological cases and may be improved if it turns out
1285  *		to be a problem.  This may be exacerbated by particularly
1286  *		large alignments.
1287  *
1288  * NOTE:	It's confusing, so it demands an explanation:
1289  *		- len is used to represent requested data space; it
1290  *		  does not include room for a watermark.  On each full
1291  *		  or partial allocation, len will be decremented by
1292  *		  alloc_len (see next paragraph) until it reaches
1293  *		  zero.
1294  *		- alloc_len is used to represent data space allocated
1295  *		  from a particular extent; it does not include space
1296  *		  for a watermark.  In the rare event that a_length
1297  *		  (see next paragraph) is equal to MD_SP_WMSIZE,
1298  *		  alloc_len will be zero and the resulting MD_SP_WMSIZE
1299  *		  fragment of space will be utterly unusable.
1300  *		- a_length is used to represent all space to be
1301  *		  allocated from a particular extent; it DOES include
1302  *		  space for a watermark.
1303  */
1304 static int
1305 meta_sp_alloc_by_len(
1306 	mdsetname_t	*sp,
1307 	mdname_t	*np,
1308 	sp_ext_node_t	**head,
1309 	sp_ext_length_t	*lp,
1310 	sp_ext_offset_t	last_off,
1311 	sp_ext_offset_t	alignment
1312 )
1313 {
1314 	sp_ext_node_t	*free_ext;
1315 	sp_ext_node_t	*alloc_ext;
1316 	uint_t		last_seq = 0;
1317 	uint_t		numexts = 0;
1318 	sp_ext_length_t	freespace;
1319 	sp_ext_length_t	alloc_len;
1320 	sp_ext_length_t	len;
1321 
1322 	/* We're DOA if we can't read *lp */
1323 	assert(lp != NULL);
1324 	len = *lp;
1325 
1326 	/*
1327 	 * Process the nominal case first: we've been given an actual
1328 	 * size argument, rather than the literal "all"
1329 	 */
1330 
1331 	if (len != 0) {
1332 
1333 		/*
1334 		 * Short circuit the check for free space.  This may
1335 		 * tell us we have enough space when we really don't
1336 		 * because each extent loses space to a watermark, but
1337 		 * it will always tell us there isn't enough space
1338 		 * correctly.  Worst case we do some extra work.
1339 		 */
1340 		freespace = meta_sp_list_size(*head, EXTTYP_FREE,
1341 		    INCLUDE_WM);
1342 
1343 		if (freespace < len)
1344 			return (-1);
1345 
1346 		/*
1347 		 * First see if we can extend the last extent for an
1348 		 * attach.
1349 		 */
1350 		if (last_off != 0LL) {
1351 			int align = 0;
1352 
1353 			alloc_ext =
1354 			    meta_sp_list_find(*head, last_off);
1355 			assert(alloc_ext != NULL);
1356 
1357 			/*
1358 			 * The offset test reflects the
1359 			 * inclusion of the watermark in the extent
1360 			 */
1361 			align = (alignment > 0) &&
1362 			    (((alloc_ext->ext_offset + MD_SP_WMSIZE) %
1363 			    alignment) == 0);
1364 
1365 			/*
1366 			 * If we decided not to align here, we should
1367 			 * also reset "alignment" so we don't bother
1368 			 * later, either.
1369 			 */
1370 			if (!align) {
1371 				alignment = 0;
1372 			}
1373 
1374 			last_seq = alloc_ext->ext_seq;
1375 
1376 			free_ext = meta_sp_list_find(*head,
1377 			    alloc_ext->ext_offset +
1378 			    alloc_ext->ext_length);
1379 
1380 			/*
1381 			 * If a free extent follows our last allocated
1382 			 * extent, then remove the last allocated
1383 			 * extent and increase the size of the free
1384 			 * extent to overlap it, then allocate the
1385 			 * total space from the new free extent.
1386 			 */
1387 			if (free_ext != NULL &&
1388 			    free_ext->ext_type == EXTTYP_FREE) {
1389 				assert(free_ext->ext_offset ==
1390 				    alloc_ext->ext_offset +
1391 				    alloc_ext->ext_length);
1392 
1393 				alloc_len =
1394 				    MIN(len, free_ext->ext_length);
1395 
1396 				if (align && (alloc_len < len)) {
1397 					/* No watermark space needed */
1398 					alloc_len -= alloc_len % alignment;
1399 				}
1400 
1401 				if (alloc_len > 0) {
1402 					free_ext->ext_offset -=
1403 					    alloc_ext->ext_length;
1404 					free_ext->ext_length +=
1405 					    alloc_ext->ext_length;
1406 
1407 					meta_sp_alloc_by_ext(sp, np, head,
1408 					    free_ext, free_ext->ext_offset,
1409 					    alloc_ext->ext_length + alloc_len,
1410 					    last_seq);
1411 
1412 					/*
1413 					 * now remove the original allocated
1414 					 * node.  We may have overlapping
1415 					 * extents for a short time before
1416 					 * this node is removed.
1417 					 */
1418 					meta_sp_list_remove(head, alloc_ext);
1419 					len -= alloc_len;
1420 				}
1421 			}
1422 			last_seq++;
1423 		}
1424 
1425 		if (len == 0LL)
1426 			goto out;
1427 
1428 		/*
1429 		 * Next, see if we can find a single allocation for
1430 		 * the remainder.  This may make fragmentation worse
1431 		 * in some cases, but there's no good way to allocate
1432 		 * that doesn't have a highly fragmented corner case.
1433 		 */
1434 		for (free_ext = *head; free_ext != NULL;
1435 		    free_ext = free_ext->ext_next) {
1436 			sp_ext_offset_t	a_offset;
1437 			sp_ext_offset_t	a_length;
1438 
1439 			if (free_ext->ext_type != EXTTYP_FREE)
1440 				continue;
1441 
1442 			/*
1443 			 * The length test should include space for
1444 			 * the watermark
1445 			 */
1446 
1447 			a_offset = free_ext->ext_offset;
1448 			a_length = free_ext->ext_length;
1449 
1450 			if (alignment > 0) {
1451 
1452 				/*
1453 				 * Shortcut for extents that have been
1454 				 * previously added to pad out the
1455 				 * data space
1456 				 */
1457 				if (a_length < alignment) {
1458 					continue;
1459 				}
1460 
1461 				/*
1462 				 * Round up so the data space begins
1463 				 * on a properly aligned boundary.
1464 				 */
1465 				a_offset += alignment -
1466 				    (a_offset % alignment) - MD_SP_WMSIZE;
1467 
1468 				/*
1469 				 * This is only necessary in case the
1470 				 * watermark size is ever greater than
1471 				 * one.  It'll never happen, of
1472 				 * course; we'll get rid of watermarks
1473 				 * before we make 'em bigger.
1474 				 */
1475 				if (a_offset < free_ext->ext_offset) {
1476 					a_offset += alignment;
1477 				}
1478 
1479 				/*
1480 				 * Adjust the length to account for
1481 				 * the space lost above (if any)
1482 				 */
1483 				a_length -=
1484 				    (a_offset - free_ext->ext_offset);
1485 			}
1486 
1487 			if (a_length >= len + MD_SP_WMSIZE) {
1488 				meta_sp_alloc_by_ext(sp, np, head,
1489 				    free_ext, a_offset,
1490 				    len + MD_SP_WMSIZE, last_seq);
1491 
1492 				len = 0LL;
1493 				numexts++;
1494 				break;
1495 			}
1496 		}
1497 
1498 		if (len == 0LL)
1499 			goto out;
1500 
1501 
1502 		/*
1503 		 * If the request could not be satisfied by extending
1504 		 * the last extent or by a single extent, then put
1505 		 * multiple smaller extents together until the request
1506 		 * is satisfied.
1507 		 */
1508 		for (free_ext = *head; (free_ext != NULL) && (len > 0);
1509 		    free_ext = free_ext->ext_next) {
1510 			sp_ext_offset_t a_offset;
1511 			sp_ext_length_t a_length;
1512 
1513 			if (free_ext->ext_type != EXTTYP_FREE)
1514 				continue;
1515 
1516 			a_offset = free_ext->ext_offset;
1517 			a_length = free_ext->ext_length;
1518 
1519 			if (alignment > 0) {
1520 
1521 				/*
1522 				 * Shortcut for extents that have been
1523 				 * previously added to pad out the
1524 				 * data space
1525 				 */
1526 				if (a_length < alignment) {
1527 					continue;
1528 				}
1529 
1530 				/*
1531 				 * Round up so the data space begins
1532 				 * on a properly aligned boundary.
1533 				 */
1534 				a_offset += alignment -
1535 				    (a_offset % alignment) - MD_SP_WMSIZE;
1536 
1537 				/*
1538 				 * This is only necessary in case the
1539 				 * watermark size is ever greater than
1540 				 * one.  It'll never happen, of
1541 				 * course; we'll get rid of watermarks
1542 				 * before we make 'em bigger.
1543 				 */
1544 				if (a_offset < free_ext->ext_offset) {
1545 					a_offset += alignment;
1546 				}
1547 
1548 				/*
1549 				 * Adjust the length to account for
1550 				 * the space lost above (if any)
1551 				 */
1552 				a_length -=
1553 				    (a_offset - free_ext->ext_offset);
1554 
1555 				/*
1556 				 * Adjust the length to be properly
1557 				 * aligned if it is NOT to be the
1558 				 * last extent in the soft partition.
1559 				 */
1560 				if ((a_length - MD_SP_WMSIZE) < len)
1561 					a_length -=
1562 					    (a_length - MD_SP_WMSIZE)
1563 					    % alignment;
1564 			}
1565 
1566 			alloc_len = MIN(len, a_length - MD_SP_WMSIZE);
1567 			if (alloc_len == 0)
1568 				continue;
1569 
1570 			/*
1571 			 * meta_sp_alloc_by_ext() expects the
1572 			 * allocation length to include the watermark
1573 			 * size, which is why we don't simply pass in
1574 			 * alloc_len here.
1575 			 */
1576 			meta_sp_alloc_by_ext(sp, np, head, free_ext,
1577 			    a_offset, MIN(len + MD_SP_WMSIZE, a_length),
1578 			    last_seq);
1579 
1580 			len -= alloc_len;
1581 			numexts++;
1582 			last_seq++;
1583 		}
1584 
1585 
1586 		/*
1587 		 * If there was not enough space we can throw it all
1588 		 * away since no real work has been done yet.
1589 		 */
1590 		if (len != 0) {
1591 			meta_sp_list_free(head);
1592 			return (-1);
1593 		}
1594 	}
1595 
1596 	/*
1597 	 * Otherwise, the literal "all" was specified: allocate all
1598 	 * available free space.  Don't bother with alignment.
1599 	 */
1600 	else {
1601 		/* First, extend the last extent if this is a grow */
1602 		if (last_off != 0LL) {
1603 			alloc_ext =
1604 			    meta_sp_list_find(*head, last_off);
1605 			assert(alloc_ext != NULL);
1606 
1607 			last_seq = alloc_ext->ext_seq;
1608 
1609 			free_ext = meta_sp_list_find(*head,
1610 			    alloc_ext->ext_offset +
1611 			    alloc_ext->ext_length);
1612 
1613 			/*
1614 			 * If a free extent follows our last allocated
1615 			 * extent, then remove the last allocated
1616 			 * extent and increase the size of the free
1617 			 * extent to overlap it, then allocate the
1618 			 * total space from the new free extent.
1619 			 */
1620 			if (free_ext != NULL &&
1621 			    free_ext->ext_type == EXTTYP_FREE) {
1622 				assert(free_ext->ext_offset ==
1623 				    alloc_ext->ext_offset +
1624 				    alloc_ext->ext_length);
1625 
1626 				len = alloc_len =
1627 				    free_ext->ext_length;
1628 
1629 				free_ext->ext_offset -=
1630 				    alloc_ext->ext_length;
1631 				free_ext->ext_length +=
1632 				    alloc_ext->ext_length;
1633 
1634 				meta_sp_alloc_by_ext(sp, np, head,
1635 				    free_ext, free_ext->ext_offset,
1636 				    alloc_ext->ext_length + alloc_len,
1637 				    last_seq);
1638 
1639 				/*
1640 				 * now remove the original allocated
1641 				 * node.  We may have overlapping
1642 				 * extents for a short time before
1643 				 * this node is removed.
1644 				 */
1645 				meta_sp_list_remove(head, alloc_ext);
1646 			}
1647 
1648 			last_seq++;
1649 		}
1650 
1651 		/* Next, grab all remaining free space */
1652 		for (free_ext = *head; free_ext != NULL;
1653 		    free_ext = free_ext->ext_next) {
1654 
1655 			if (free_ext->ext_type == EXTTYP_FREE) {
1656 				alloc_len =
1657 				    free_ext->ext_length - MD_SP_WMSIZE;
1658 				if (alloc_len == 0)
1659 					continue;
1660 
1661 				/*
1662 				 * meta_sp_alloc_by_ext() expects the
1663 				 * allocation length to include the
1664 				 * watermark size, which is why we
1665 				 * don't simply pass in alloc_len
1666 				 * here.
1667 				 */
1668 				meta_sp_alloc_by_ext(sp, np, head,
1669 				    free_ext, free_ext->ext_offset,
1670 				    free_ext->ext_length,
1671 				    last_seq);
1672 
1673 				len += alloc_len;
1674 				numexts++;
1675 				last_seq++;
1676 			}
1677 		}
1678 	}
1679 
1680 out:
1681 	if (getenv(META_SP_DEBUG)) {
1682 		meta_sp_debug("meta_sp_alloc_by_len: Extent list after "
1683 		    "allocation:\n");
1684 		meta_sp_list_dump(*head);
1685 	}
1686 
1687 	if (*lp == 0) {
1688 		*lp = len;
1689 
1690 		/*
1691 		 * Make sure the callers hit a no space error if we
1692 		 * didn't actually find anything.
1693 		 */
1694 		if (len == 0) {
1695 			return (-1);
1696 		}
1697 	}
1698 
1699 	return (numexts);
1700 }
1701 
1702 /*
1703  * FUNCTION:	meta_sp_alloc_by_list()
1704  * INPUT:	sp	- the set name for the device the node belongs to
1705  *		np	- the name of the device the node belongs to
1706  *		head	- the head of the list, must be NULL for empty list
1707  *		oblist	- an extent list containing requested nodes to allocate
1708  * OUTPUT:	head	- the new head pointer
1709  * RETURNS:	int	- -1 if error, the number of new extents on success
1710  * PURPOSE:	allocates extents from free space to satisfy the requested
1711  *		extent list.  This is primarily used for the -o/-b options
1712  *		where the user may specifically request extents to allocate.
1713  *		Each extent in the oblist must be a subset (inclusive) of a
1714  *		free extent and may not overlap each other.  This
1715  *		function sets the EXTFLG_UPDATE flag for each node that
1716  *		requires a watermark update after allocating.
1717  */
1718 static int
1719 meta_sp_alloc_by_list(
1720 	mdsetname_t	*sp,
1721 	mdname_t	*np,
1722 	sp_ext_node_t	**head,
1723 	sp_ext_node_t	*oblist
1724 )
1725 {
1726 	sp_ext_node_t	*ext;
1727 	sp_ext_node_t	*free_ext;
1728 	uint_t		numexts = 0;
1729 
1730 	for (ext = oblist; ext != NULL; ext = ext->ext_next) {
1731 
1732 		free_ext = meta_sp_list_find(*head,
1733 		    ext->ext_offset - MD_SP_WMSIZE);
1734 
1735 		/* Make sure the allocation is within the free extent */
1736 		if ((free_ext == NULL) ||
1737 		    (ext->ext_offset + ext->ext_length >
1738 		    free_ext->ext_offset + free_ext->ext_length) ||
1739 		    (free_ext->ext_type != EXTTYP_FREE))
1740 			return (-1);
1741 
1742 		meta_sp_alloc_by_ext(sp, np, head, free_ext,
1743 		    ext->ext_offset - MD_SP_WMSIZE,
1744 		    ext->ext_length + MD_SP_WMSIZE, ext->ext_seq);
1745 
1746 		numexts++;
1747 	}
1748 
1749 	assert(meta_sp_list_overlaps(*head) == 0);
1750 
1751 	if (getenv(META_SP_DEBUG)) {
1752 		meta_sp_debug("meta_sp_alloc_by_list: Extent list after "
1753 		    "allocation:\n");
1754 		meta_sp_list_dump(*head);
1755 	}
1756 
1757 	return (numexts);
1758 }
1759 
1760 /*
1761  * **************************************************************************
1762  *                     Extent List Population Functions                     *
1763  * **************************************************************************
1764  */
1765 
1766 /*
1767  * FUNCTION:	meta_sp_extlist_from_namelist()
1768  * INPUT:	sp	- the set name for the device the node belongs to
1769  *		spnplp	- the namelist of soft partitions to build a list from
1770  * OUTPUT:	extlist	- the extent list built from the SPs in the namelist
1771  *		ep	- return error pointer
1772  * RETURNS:	int	- -1 if error, 0 on success
1773  * PURPOSE:	builds an extent list representing the soft partitions
1774  *		specified in the namelist.  Each extent in each soft
1775  *		partition is added to the list with the type EXTTYP_ALLOC.
1776  *		The EXTFLG_UPDATE flag is not set on any nodes.  Each
1777  *		extent in the list includes the space occupied by the
1778  *		watermark, which is not included in the unit structures.
1779  */
1780 static int
1781 meta_sp_extlist_from_namelist(
1782 	mdsetname_t	*sp,
1783 	mdnamelist_t	*spnlp,
1784 	sp_ext_node_t	**extlist,
1785 	md_error_t	*ep
1786 )
1787 {
1788 	int		extn;
1789 	md_sp_t		*msp;		/* unit structure of the sp's */
1790 	mdnamelist_t	*namep;
1791 
1792 	assert(sp != NULL);
1793 
1794 	/*
1795 	 * Now go through the soft partitions and add a node to the used
1796 	 * list for each allocated extent.
1797 	 */
1798 	for (namep = spnlp; namep != NULL; namep = namep->next) {
1799 		mdname_t	*curnp = namep->namep;
1800 
1801 		/* get the unit structure */
1802 		if ((msp = meta_get_sp_common(sp, curnp, 0, ep)) == NULL)
1803 			return (-1);
1804 
1805 		for (extn = 0; (extn < msp->ext.ext_len); extn++) {
1806 			md_sp_ext_t	*extp = &msp->ext.ext_val[extn];
1807 
1808 			/*
1809 			 * subtract from offset and add to the length
1810 			 * to account for the watermark, which is not
1811 			 * contained in the extents in the unit structure.
1812 			 */
1813 			meta_sp_list_insert(sp, curnp, extlist,
1814 			    extp->poff - MD_SP_WMSIZE, extp->len + MD_SP_WMSIZE,
1815 			    EXTTYP_ALLOC, extn, 0, meta_sp_cmp_by_offset);
1816 		}
1817 	}
1818 	return (0);
1819 }
1820 
1821 /*
1822  * FUNCTION:	meta_sp_extlist_from_wm()
1823  * INPUT:	sp	- the set name for the device the node belongs to
1824  *		compnp	- the name of the device to scan watermarks on
1825  * OUTPUT:	extlist	- the extent list built from the SPs in the namelist
1826  *		ep	- return error pointer
1827  * RETURNS:	int	- -1 if error, 0 on success
1828  * PURPOSE:	builds an extent list representing the soft partitions
1829  *		specified in the namelist.  Each extent in each soft
1830  *		partition is added to the list with the type EXTTYP_ALLOC.
1831  *		The EXTFLG_UPDATE flag is not set on any nodes.  Each
1832  *		extent in the list includes the space occupied by the
1833  *		watermark, which is not included in the unit structures.
1834  */
1835 static int
1836 meta_sp_extlist_from_wm(
1837 	mdsetname_t	*sp,
1838 	mdname_t	*compnp,
1839 	sp_ext_node_t	**extlist,
1840 	ext_cmpfunc_t	compare,
1841 	md_error_t	*ep
1842 )
1843 {
1844 	mp_watermark_t	wm;
1845 	mdname_t	*np = NULL;
1846 	mdsetname_t	*spsetp = NULL;
1847 	sp_ext_offset_t	cur_off;
1848 	md_set_desc	*sd;
1849 	int		init = 0;
1850 	mdkey_t		key;
1851 	minor_t		mnum;
1852 
1853 	if (!metaislocalset(sp)) {
1854 		if ((sd = metaget_setdesc(sp, ep)) == NULL)
1855 			return (-1);
1856 	}
1857 
1858 	if ((cur_off = meta_sp_get_start(sp, compnp, ep)) == MD_DISKADDR_ERROR)
1859 		return (-1);
1860 
1861 	for (;;) {
1862 		if (meta_sp_read_wm(sp, compnp, &wm, cur_off, ep) != 0) {
1863 			return (-1);
1864 		}
1865 
1866 		/* get the set and name pointers */
1867 		if (strcmp(wm.wm_setname, MD_SP_LOCALSETNAME) != 0) {
1868 			if ((spsetp = metasetname(wm.wm_setname, ep)) == NULL) {
1869 				return (-1);
1870 			}
1871 		}
1872 
1873 		/*
1874 		 * For the MN set, meta_init_make_device needs to
1875 		 * be run on all the nodes so the entries for the
1876 		 * softpart device name and its comp can be created
1877 		 * in the same order in the replica namespace.  If
1878 		 * we have it run on mdmn_do_iocset then the mddbs
1879 		 * will be out of sync between master node and slave
1880 		 * nodes.
1881 		 */
1882 		if (strcmp(wm.wm_mdname, MD_SP_FREEWMNAME) != 0) {
1883 
1884 			if (!metaislocalset(sp) && MD_MNSET_DESC(sd)) {
1885 				md_mn_msg_addmdname_t	*send_params;
1886 				int			result;
1887 				md_mn_result_t		*resp = NULL;
1888 				int			message_size;
1889 
1890 				message_size =  sizeof (*send_params) +
1891 				    strlen(wm.wm_mdname) + 1;
1892 				send_params = Zalloc(message_size);
1893 				send_params->addmdname_setno = sp->setno;
1894 				(void) strcpy(&send_params->addmdname_name[0],
1895 				    wm.wm_mdname);
1896 				result = mdmn_send_message(sp->setno,
1897 				    MD_MN_MSG_ADDMDNAME,
1898 				    MD_MSGF_PANIC_WHEN_INCONSISTENT, 0,
1899 				    (char *)send_params, message_size, &resp,
1900 				    ep);
1901 				Free(send_params);
1902 				if (resp != NULL) {
1903 					if (resp->mmr_exitval != 0) {
1904 						free_result(resp);
1905 						return (-1);
1906 					}
1907 					free_result(resp);
1908 				}
1909 				if (result != 0)
1910 					return (-1);
1911 			} else {
1912 
1913 				if (!is_existing_meta_hsp(sp, wm.wm_mdname)) {
1914 					if ((key = meta_init_make_device(&sp,
1915 					    wm.wm_mdname, ep)) <= 0) {
1916 						return (-1);
1917 					}
1918 					init = 1;
1919 				}
1920 			}
1921 
1922 			np = metaname(&spsetp, wm.wm_mdname, META_DEVICE, ep);
1923 			if (np == NULL) {
1924 				if (init) {
1925 					if (meta_getnmentbykey(sp->setno,
1926 					    MD_SIDEWILD, key, NULL, &mnum,
1927 					    NULL, ep) != NULL) {
1928 						(void) metaioctl(MD_IOCREM_DEV,
1929 						    &mnum, ep, NULL);
1930 					}
1931 					(void) del_self_name(sp, key, ep);
1932 				}
1933 				return (-1);
1934 			}
1935 		}
1936 
1937 		/* insert watermark into extent list */
1938 		meta_sp_list_insert(spsetp, np, extlist, cur_off,
1939 		    wm.wm_length + MD_SP_WMSIZE, wm.wm_type, wm.wm_seq,
1940 		    EXTFLG_UPDATE, compare);
1941 
1942 		/* if we see the end watermark, we're done */
1943 		if (wm.wm_type == EXTTYP_END)
1944 			break;
1945 
1946 		cur_off += wm.wm_length + 1;
1947 
1948 		/* clear out set and name pointers for next iteration */
1949 		np = NULL;
1950 		spsetp = NULL;
1951 	}
1952 
1953 	return (0);
1954 }
1955 
1956 /*
1957  * **************************************************************************
1958  *                        Print (metastat) Functions                        *
1959  * **************************************************************************
1960  */
1961 
1962 /*
1963  * FUNCTION:	meta_sp_short_print()
1964  * INPUT:	msp	- the unit structure to display
1965  *		fp	- the file pointer to send output to
1966  *		options	- print options from the command line processor
1967  * OUTPUT:	ep	- return error pointer
1968  * RETURNS:	int	- -1 if error, 0 on success
1969  * PURPOSE:	display a short report of the soft partition in md.tab
1970  *		form, primarily used for metastat -p.
1971  */
1972 static int
1973 meta_sp_short_print(
1974 	md_sp_t		*msp,
1975 	char		*fname,
1976 	FILE		*fp,
1977 	mdprtopts_t	options,
1978 	md_error_t	*ep
1979 )
1980 {
1981 	int	extn;
1982 
1983 	if (options & PRINT_LARGEDEVICES) {
1984 		if ((msp->common.revision & MD_64BIT_META_DEV) == 0)
1985 			return (0);
1986 	}
1987 
1988 	if (options & PRINT_FN) {
1989 		if ((msp->common.revision & MD_FN_META_DEV) == 0)
1990 			return (0);
1991 	}
1992 
1993 	/* print name and -p */
1994 	if (fprintf(fp, "%s -p", msp->common.namep->cname) == EOF)
1995 		return (mdsyserror(ep, errno, fname));
1996 
1997 	/* print the component */
1998 	/*
1999 	 * Always print the full path name
2000 	 */
2001 	if (fprintf(fp, " %s", msp->compnamep->rname) == EOF)
2002 		return (mdsyserror(ep, errno, fname));
2003 
2004 	/* print out each extent */
2005 	for (extn = 0; (extn < msp->ext.ext_len); extn++) {
2006 		md_sp_ext_t	*extp = &msp->ext.ext_val[extn];
2007 		if (fprintf(fp, " -o %llu -b %llu ", extp->poff,
2008 		    extp->len) == EOF)
2009 			return (mdsyserror(ep, errno, fname));
2010 	}
2011 
2012 	if (fprintf(fp, "\n") == EOF)
2013 		return (mdsyserror(ep, errno, fname));
2014 
2015 	/* success */
2016 	return (0);
2017 }
2018 
2019 /*
2020  * FUNCTION:	meta_sp_status_to_name()
2021  * INPUT:	xsp_status	- the status value to convert to a string
2022  *		tstate		- transient errored device state. If set the
2023  *				  device is Unavailable
2024  * OUTPUT:	none
2025  * RETURNS:	char *	- a pointer to the string representing the status value
2026  * PURPOSE:	return an internationalized string representing the
2027  *		status value for a soft partition.  The strings are
2028  *		strdup'd and must be freed by the caller.
2029  */
2030 static char *
2031 meta_sp_status_to_name(
2032 	xsp_status_t	xsp_status,
2033 	uint_t		tstate
2034 )
2035 {
2036 	char *rval = NULL;
2037 
2038 	/*
2039 	 * Check to see if we have MD_INACCESSIBLE set. This is the only valid
2040 	 * value for an 'Unavailable' return. tstate can be set because of
2041 	 * other multi-node reasons (e.g. ABR being set)
2042 	 */
2043 	if (tstate & MD_INACCESSIBLE) {
2044 		return (Strdup(dgettext(TEXT_DOMAIN, "Unavailable")));
2045 	}
2046 
2047 	switch (xsp_status) {
2048 	case MD_SP_CREATEPEND:
2049 		rval = Strdup(dgettext(TEXT_DOMAIN, "Creating"));
2050 		break;
2051 	case MD_SP_GROWPEND:
2052 		rval = Strdup(dgettext(TEXT_DOMAIN, "Growing"));
2053 		break;
2054 	case MD_SP_DELPEND:
2055 		rval = Strdup(dgettext(TEXT_DOMAIN, "Deleting"));
2056 		break;
2057 	case MD_SP_OK:
2058 		rval = Strdup(dgettext(TEXT_DOMAIN, "Okay"));
2059 		break;
2060 	case MD_SP_ERR:
2061 		rval = Strdup(dgettext(TEXT_DOMAIN, "Errored"));
2062 		break;
2063 	case MD_SP_RECOVER:
2064 		rval = Strdup(dgettext(TEXT_DOMAIN, "Recovering"));
2065 		break;
2066 	}
2067 
2068 	if (rval == NULL)
2069 		rval = Strdup(dgettext(TEXT_DOMAIN, "Invalid"));
2070 
2071 	return (rval);
2072 }
2073 
2074 /*
2075  * FUNCTION:	meta_sp_report()
2076  * INPUT:	sp	- the set name for the unit being displayed
2077  *		msp	- the unit structure to display
2078  *		nlpp	- pass back the large devs
2079  *		fp	- the file pointer to send output to
2080  *		options	- print options from the command line processor
2081  * OUTPUT:	ep	- return error pointer
2082  * RETURNS:	int	- -1 if error, 0 on success
2083  * PURPOSE:	print a full report of the device specified
2084  */
2085 static int
2086 meta_sp_report(
2087 	mdsetname_t	*sp,
2088 	md_sp_t		*msp,
2089 	mdnamelist_t	**nlpp,
2090 	char		*fname,
2091 	FILE		*fp,
2092 	mdprtopts_t	options,
2093 	md_error_t	*ep
2094 )
2095 {
2096 	uint_t		extn;
2097 	char		*status;
2098 	char		*devid = "";
2099 	mdname_t	*didnp = NULL;
2100 	ddi_devid_t	dtp;
2101 	int		len;
2102 	uint_t		tstate = 0;
2103 
2104 	if (options & PRINT_LARGEDEVICES) {
2105 		if ((msp->common.revision & MD_64BIT_META_DEV) == 0) {
2106 			return (0);
2107 		} else {
2108 			if (meta_getdevs(sp, msp->common.namep, nlpp, ep) != 0)
2109 				return (-1);
2110 		}
2111 	}
2112 
2113 	if (options & PRINT_FN) {
2114 		if ((msp->common.revision & MD_FN_META_DEV) == 0) {
2115 			return (0);
2116 		} else {
2117 			if (meta_getdevs(sp, msp->common.namep, nlpp, ep) != 0)
2118 				return (-1);
2119 		}
2120 	}
2121 
2122 	if (options & PRINT_HEADER) {
2123 		if (fprintf(fp, dgettext(TEXT_DOMAIN, "%s: Soft Partition\n"),
2124 		    msp->common.namep->cname) == EOF)
2125 			return (mdsyserror(ep, errno, fname));
2126 	}
2127 
2128 	if (fprintf(fp, dgettext(TEXT_DOMAIN, "    Device: %s\n"),
2129 	    msp->compnamep->cname) == EOF)
2130 		return (mdsyserror(ep, errno, fname));
2131 
2132 	/* Determine if device is available before displaying status */
2133 	if (metaismeta(msp->common.namep)) {
2134 		if (meta_get_tstate(msp->common.namep->dev, &tstate, ep) != 0)
2135 			return (-1);
2136 	}
2137 	status = meta_sp_status_to_name(msp->status, tstate & MD_DEV_ERRORED);
2138 
2139 	/* print out "State" to be consistent with other metadevices */
2140 	if (tstate & MD_ABR_CAP) {
2141 		if (fprintf(fp, dgettext(TEXT_DOMAIN,
2142 		    "    State: %s - Application Based Recovery (ABR)\n"),
2143 		    status) == EOF) {
2144 			Free(status);
2145 			return (mdsyserror(ep, errno, fname));
2146 		}
2147 	} else {
2148 		if (fprintf(fp, dgettext(TEXT_DOMAIN,
2149 		    "    State: %s\n"), status) == EOF) {
2150 			Free(status);
2151 			return (mdsyserror(ep, errno, fname));
2152 		}
2153 	}
2154 	free(status);
2155 
2156 	if (fprintf(fp, dgettext(TEXT_DOMAIN, "    Size: %llu blocks (%s)\n"),
2157 	    msp->common.size,
2158 	    meta_number_to_string(msp->common.size, DEV_BSIZE)) == EOF)
2159 		return (mdsyserror(ep, errno, fname));
2160 
2161 	/* print component details */
2162 	if (! metaismeta(msp->compnamep)) {
2163 		diskaddr_t	start_blk;
2164 		int		has_mddb;
2165 		char		*has_mddb_str;
2166 
2167 		/* print header */
2168 		/*
2169 		 * Building a format string on the fly that will
2170 		 * be used in (f)printf. This allows the length
2171 		 * of the ctd to vary from small to large without
2172 		 * looking horrible.
2173 		 */
2174 		len = strlen(msp->compnamep->cname);
2175 		len = max(len, strlen(dgettext(TEXT_DOMAIN, "Device")));
2176 		len += 2;
2177 		if (fprintf(fp,
2178 		    "\t%-*.*s %-12.12s %-5.5s %s\n",
2179 		    len, len,
2180 		    dgettext(TEXT_DOMAIN, "Device"),
2181 		    dgettext(TEXT_DOMAIN, "Start Block"),
2182 		    dgettext(TEXT_DOMAIN, "Dbase"),
2183 		    dgettext(TEXT_DOMAIN, "Reloc")) == EOF) {
2184 			return (mdsyserror(ep, errno, fname));
2185 		}
2186 
2187 
2188 		/* get info */
2189 		if ((start_blk = meta_sp_get_start(sp, msp->compnamep, ep)) ==
2190 		    MD_DISKADDR_ERROR)
2191 			return (-1);
2192 
2193 		if ((has_mddb = metahasmddb(sp, msp->compnamep, ep)) < 0)
2194 			return (-1);
2195 
2196 		if (has_mddb)
2197 			has_mddb_str = dgettext(TEXT_DOMAIN, "Yes");
2198 		else
2199 			has_mddb_str = dgettext(TEXT_DOMAIN, "No");
2200 
2201 		/* populate the key in the name_p structure */
2202 		didnp = metadevname(&sp, msp->compnamep->dev, ep);
2203 		if (didnp == NULL) {
2204 			return (-1);
2205 		}
2206 
2207 		/* determine if devid does NOT exist */
2208 		if (options & PRINT_DEVID) {
2209 			if ((dtp = meta_getdidbykey(sp->setno,
2210 			    getmyside(sp, ep), didnp->key, ep)) == NULL)
2211 				devid = dgettext(TEXT_DOMAIN, "No ");
2212 			else {
2213 				devid = dgettext(TEXT_DOMAIN, "Yes");
2214 				free(dtp);
2215 			}
2216 		}
2217 
2218 		/* print info */
2219 		/*
2220 		 * This allows the length
2221 		 * of the ctd to vary from small to large without
2222 		 * looking horrible.
2223 		 */
2224 		if (fprintf(fp, "\t%-*s %8lld     %-5.5s %s\n",
2225 		    len, msp->compnamep->cname,
2226 		    start_blk, has_mddb_str, devid) == EOF) {
2227 			return (mdsyserror(ep, errno, fname));
2228 		}
2229 		(void) fprintf(fp, "\n");
2230 	}
2231 
2232 
2233 	/* print the headers */
2234 	if (fprintf(fp, "\t%6.6s %24.24s %24.24s\n",
2235 	    dgettext(TEXT_DOMAIN, "Extent"),
2236 	    dgettext(TEXT_DOMAIN, "Start Block"),
2237 	    dgettext(TEXT_DOMAIN, "Block count")) == EOF)
2238 		return (mdsyserror(ep, errno, fname));
2239 
2240 	/* print out each extent */
2241 	for (extn = 0; (extn < msp->ext.ext_len); extn++) {
2242 		md_sp_ext_t	*extp = &msp->ext.ext_val[extn];
2243 
2244 		/* If PRINT_TIMES option is ever supported, add output here */
2245 		if (fprintf(fp, "\t%6u %24llu %24llu\n",
2246 		    extn, extp->poff, extp->len) == EOF)
2247 			return (mdsyserror(ep, errno, fname));
2248 	}
2249 
2250 	/* separate records with a newline */
2251 	(void) fprintf(fp, "\n");
2252 	return (0);
2253 }
2254 
2255 /*
2256  * FUNCTION:	meta_sp_print()
2257  * INPUT:	sp	- the set name for the unit being displayed
2258  *		np	- the name of the device to print
2259  *		fname	- ??? not used
2260  *		fp	- the file pointer to send output to
2261  *		options	- print options from the command line processor
2262  * OUTPUT:	ep	- return error pointer
2263  * RETURNS:	int	- -1 if error, 0 on success
2264  * PURPOSE:	print a full report of the device specified by metastat.
2265  *		This is the main entry point for printing.
2266  */
2267 int
2268 meta_sp_print(
2269 	mdsetname_t	*sp,
2270 	mdname_t	*np,
2271 	mdnamelist_t	**nlpp,
2272 	char		*fname,
2273 	FILE		*fp,
2274 	mdprtopts_t	options,
2275 	md_error_t	*ep
2276 )
2277 {
2278 	md_sp_t		*msp;
2279 	md_unit_t	*mdp;
2280 	int		rval = 0;
2281 	set_t		setno;
2282 	minor_t		unit;
2283 
2284 	/* should always have the same set */
2285 	assert(sp != NULL);
2286 
2287 	/* print all the soft partitions */
2288 	if (np == NULL) {
2289 		mdnamelist_t	*nlp = NULL;
2290 		mdnamelist_t	*p;
2291 		int		cnt;
2292 
2293 		if ((cnt = meta_get_sp_names(sp, &nlp, options, ep)) < 0)
2294 			return (-1);
2295 		else if (cnt == 0)
2296 			return (0);
2297 
2298 		/* recusively print them out */
2299 		for (p = nlp; (p != NULL); p = p->next) {
2300 			mdname_t	*curnp = p->namep;
2301 
2302 			/*
2303 			 * one problem with the rval of -1 here is that
2304 			 * the error gets "lost" when the next device is
2305 			 * printed, but we want to print them all anyway.
2306 			 */
2307 			rval = meta_sp_print(sp, curnp, nlpp, fname, fp,
2308 			    options, ep);
2309 		}
2310 
2311 		/* clean up, return success */
2312 		metafreenamelist(nlp);
2313 		return (rval);
2314 	}
2315 
2316 	/* get the unit structure */
2317 	if ((msp = meta_get_sp_common(sp, np,
2318 	    ((options & PRINT_FAST) ? 1 : 0), ep)) == NULL)
2319 		return (-1);
2320 
2321 	/* check for parented */
2322 	if ((! (options & PRINT_SUBDEVS)) &&
2323 	    (MD_HAS_PARENT(msp->common.parent))) {
2324 		return (0);
2325 	}
2326 
2327 	/* print appropriate detail */
2328 	if (options & PRINT_SHORT) {
2329 		if (meta_sp_short_print(msp, fname, fp, options, ep) != 0)
2330 			return (-1);
2331 	} else {
2332 		if (meta_sp_report(sp, msp, nlpp, fname, fp, options, ep) != 0)
2333 			return (-1);
2334 	}
2335 
2336 	/*
2337 	 * Print underlying metadevices if they are parented to us and
2338 	 * if the info for the underlying metadevice has not been printed.
2339 	 */
2340 	if (metaismeta(msp->compnamep)) {
2341 		/* get the unit structure for the subdevice */
2342 		if ((mdp = meta_get_mdunit(sp, msp->compnamep, ep)) == NULL)
2343 			return (-1);
2344 
2345 		setno = MD_MIN2SET(MD_SID(mdp));
2346 		unit = MD_MIN2UNIT(MD_SID(mdp));
2347 
2348 		/* If info not already printed, recurse */
2349 		if (sp_parent_printed[setno] == NULL ||
2350 		    !BT_TEST(sp_parent_printed[setno], unit)) {
2351 			if (meta_print_name(sp, msp->compnamep, nlpp, fname, fp,
2352 			    (options | PRINT_HEADER | PRINT_SUBDEVS),
2353 			    NULL, ep) != 0) {
2354 				return (-1);
2355 			}
2356 			if (sp_parent_printed[setno] == NULL)
2357 				sp_parent_printed[setno] =
2358 				    Zalloc(BT_SIZEOFMAP(MD_MAXUNITS));
2359 			BT_SET(sp_parent_printed[setno], unit);
2360 		}
2361 	}
2362 	return (0);
2363 }
2364 
2365 /*
2366  * **************************************************************************
2367  *                     Watermark Manipulation Functions                     *
2368  * **************************************************************************
2369  */
2370 
2371 /*
2372  * FUNCTION:	meta_sp_get_start()
2373  * INPUT:	sp	- the operating set
2374  *		np 	- device upon which the sp is being built
2375  * OUTPUT:	ep	- return error pointer
2376  * RETURNS:	daddr_t	- -1 if error, otherwise the start block
2377  * PURPOSE:	Encapsulate the determination of the start block of the
2378  *		device upon which the sp is built or being built.
2379  */
2380 static diskaddr_t
2381 meta_sp_get_start(
2382 	mdsetname_t	*sp,
2383 	mdname_t	*np,
2384 	md_error_t	*ep
2385 )
2386 {
2387 	daddr_t		start_block;
2388 
2389 	if ((start_block = metagetstart(sp, np, ep)) != MD_DISKADDR_ERROR)
2390 		start_block += MD_SP_START;
2391 
2392 	return (start_block);
2393 }
2394 
2395 /*
2396  * FUNCTION:	meta_sp_update_wm_common()
2397  * INPUT:	sp	- the operating set
2398  *		msp	- a pointer to the XDR unit structure
2399  *		extlist	- the extent list specifying watermarks to update
2400  *		iocval	- either MD_IOC_SPUPDATEWM or MD_MN_IOC_SPUPDATEWM
2401  * OUTPUT:	ep	- return error pointer
2402  * RETURNS:	int	- -1 if error, 0 on success
2403  * PURPOSE:	steps backwards through the extent list updating
2404  *		watermarks for all extents with the EXTFLG_UPDATE flag
2405  *		set.  Writing the watermarks guarantees consistency when
2406  *		extents must be broken into pieces since the original
2407  *		watermark will be the last to be updated, and will be
2408  *		changed to point to a new watermark that is already
2409  *		known to be consistent.  If one of the writes fails, the
2410  *		original watermark stays intact and none of the changes
2411  *		are realized.
2412  */
2413 static int
2414 meta_sp_update_wm_common(
2415 	mdsetname_t	*sp,
2416 	md_sp_t		*msp,
2417 	sp_ext_node_t	*extlist,
2418 	int		iocval,
2419 	md_error_t	*ep
2420 )
2421 {
2422 	sp_ext_node_t	*ext;
2423 	sp_ext_node_t	*tail;
2424 	mp_watermark_t	*wmp, *watermarks;
2425 	xsp_offset_t	*osp, *offsets;
2426 	int		update_count = 0;
2427 	int		rval = 0;
2428 	md_unit_t	*mdp;
2429 	md_sp_update_wm_t	update_params;
2430 
2431 	if (getenv(META_SP_DEBUG)) {
2432 		meta_sp_debug("meta_sp_update_wm: Updating watermarks:\n");
2433 		meta_sp_list_dump(extlist);
2434 	}
2435 
2436 	/*
2437 	 * find the last node so we can write the watermarks backwards
2438 	 * and count watermarks to update so we can allocate space
2439 	 */
2440 	for (ext = extlist; ext != NULL; ext = ext->ext_next) {
2441 		if ((ext->ext_flags & EXTFLG_UPDATE) != 0) {
2442 			update_count++;
2443 		}
2444 
2445 		if (ext->ext_next == NULL) {
2446 			tail = ext;
2447 		}
2448 	}
2449 	ext = tail;
2450 
2451 	wmp = watermarks =
2452 	    Zalloc(update_count * sizeof (mp_watermark_t));
2453 	osp = offsets =
2454 	    Zalloc(update_count * sizeof (sp_ext_offset_t));
2455 
2456 	while (ext != NULL) {
2457 		if ((ext->ext_flags & EXTFLG_UPDATE) != 0) {
2458 			/* update watermark */
2459 			wmp->wm_magic = MD_SP_MAGIC;
2460 			wmp->wm_version = MD_SP_VERSION;
2461 			wmp->wm_type = ext->ext_type;
2462 			wmp->wm_seq = ext->ext_seq;
2463 			wmp->wm_length = ext->ext_length - MD_SP_WMSIZE;
2464 
2465 			/* fill in the volume name and set name */
2466 			if (ext->ext_namep != NULL)
2467 				(void) strcpy(wmp->wm_mdname,
2468 				    ext->ext_namep->cname);
2469 			else
2470 				(void) strcpy(wmp->wm_mdname, MD_SP_FREEWMNAME);
2471 			if (ext->ext_setp != NULL &&
2472 			    ext->ext_setp->setno != MD_LOCAL_SET)
2473 				(void) strcpy(wmp->wm_setname,
2474 				    ext->ext_setp->setname);
2475 			else
2476 				(void) strcpy(wmp->wm_setname,
2477 				    MD_SP_LOCALSETNAME);
2478 
2479 			/* Generate the checksum */
2480 			wmp->wm_checksum = 0;
2481 			crcgen((uchar_t *)wmp, (uint_t *)&wmp->wm_checksum,
2482 			    sizeof (*wmp), NULL);
2483 
2484 			/* record the extent offset */
2485 			*osp = ext->ext_offset;
2486 
2487 			/* Advance the placeholders */
2488 			osp++; wmp++;
2489 		}
2490 		ext = ext->ext_prev;
2491 	}
2492 
2493 	mdp = meta_get_mdunit(sp, msp->common.namep, ep);
2494 	if (mdp == NULL) {
2495 		rval = -1;
2496 		goto out;
2497 	}
2498 
2499 	(void) memset(&update_params, 0, sizeof (update_params));
2500 	update_params.mnum = MD_SID(mdp);
2501 	update_params.count = update_count;
2502 	update_params.wmp = (uintptr_t)watermarks;
2503 	update_params.osp = (uintptr_t)offsets;
2504 	MD_SETDRIVERNAME(&update_params, MD_SP,
2505 	    MD_MIN2SET(update_params.mnum));
2506 
2507 	if (metaioctl(iocval, &update_params, &update_params.mde,
2508 	    msp->common.namep->cname) != 0) {
2509 		(void) mdstealerror(ep, &update_params.mde);
2510 		rval = -1;
2511 		goto out;
2512 	}
2513 
2514 out:
2515 	Free(watermarks);
2516 	Free(offsets);
2517 
2518 	return (rval);
2519 }
2520 
2521 static int
2522 meta_sp_update_wm(
2523 	mdsetname_t	*sp,
2524 	md_sp_t		*msp,
2525 	sp_ext_node_t	*extlist,
2526 	md_error_t	*ep
2527 )
2528 {
2529 	return (meta_sp_update_wm_common(sp, msp, extlist, MD_IOC_SPUPDATEWM,
2530 	    ep));
2531 }
2532 
2533 static int
2534 meta_mn_sp_update_wm(
2535 	mdsetname_t	*sp,
2536 	md_sp_t		*msp,
2537 	sp_ext_node_t	*extlist,
2538 	md_error_t	*ep
2539 )
2540 {
2541 	return (meta_sp_update_wm_common(sp, msp, extlist, MD_MN_IOC_SPUPDATEWM,
2542 	    ep));
2543 }
2544 
2545 /*
2546  * FUNCTION:	meta_sp_clear_wm()
2547  * INPUT:	sp	- the operating set
2548  *		msp	- the unit structure for the soft partition to clear
2549  * OUTPUT:	ep	- return error pointer
2550  * RETURNS:	int	- -1 if error, 0 on success
2551  * PURPOSE:	steps through the extents for a soft partition unit and
2552  *		creates an extent list designed to mark all of the
2553  *		watermarks for those extents as free.  The extent list
2554  *		is then passed to meta_sp_update_wm() to actually write
2555  *		the watermarks out.
2556  */
2557 static int
2558 meta_sp_clear_wm(
2559 	mdsetname_t	*sp,
2560 	md_sp_t		*msp,
2561 	md_error_t	*ep
2562 )
2563 {
2564 	sp_ext_node_t	*extlist = NULL;
2565 	int		numexts = msp->ext.ext_len;
2566 	uint_t		i;
2567 	int		rval = 0;
2568 
2569 	/* for each watermark must set the flag to SP_FREE */
2570 	for (i = 0; i < numexts; i++) {
2571 		md_sp_ext_t	*extp = &msp->ext.ext_val[i];
2572 
2573 		meta_sp_list_insert(NULL, NULL, &extlist,
2574 		    extp->poff - MD_SP_WMSIZE, extp->len + MD_SP_WMSIZE,
2575 		    EXTTYP_FREE, 0, EXTFLG_UPDATE, meta_sp_cmp_by_offset);
2576 	}
2577 
2578 	/* update watermarks */
2579 	rval = meta_sp_update_wm(sp, msp, extlist, ep);
2580 
2581 	meta_sp_list_free(&extlist);
2582 	return (rval);
2583 }
2584 
2585 /*
2586  * FUNCTION:	meta_sp_read_wm()
2587  * INPUT:	sp	- setname for component
2588  *		compnp	- mdname_t for component
2589  *		offset	- the offset of the watermark to read (sectors)
2590  * OUTPUT:	wm	- the watermark structure to read into
2591  *		ep	- return error pointer
2592  * RETURNS:	int	- -1 if error, 0 on success
2593  * PURPOSE:	seeks out to the requested offset and reads a watermark.
2594  *		It then verifies that the magic number is correct and
2595  *		that the checksum is valid, returning an error if either
2596  *		is wrong.
2597  */
2598 static int
2599 meta_sp_read_wm(
2600 	mdsetname_t	*sp,
2601 	mdname_t	*compnp,
2602 	mp_watermark_t	*wm,
2603 	sp_ext_offset_t	offset,
2604 	md_error_t	*ep
2605 )
2606 {
2607 	md_sp_read_wm_t	read_params;
2608 
2609 	/*
2610 	 * make sure block offset does not overflow 2^64 bytes and it's a
2611 	 * multiple of the block size.
2612 	 */
2613 	assert(offset <= (1LL << (64 - DEV_BSHIFT)));
2614 	/* LINTED */
2615 	assert((sizeof (*wm) % DEV_BSIZE) == 0);
2616 
2617 	(void) memset(wm, 0, sizeof (*wm));
2618 
2619 	(void) memset(&read_params, 0, sizeof (read_params));
2620 	read_params.rdev = compnp->dev;
2621 	read_params.wmp = (uintptr_t)wm;
2622 	read_params.offset = offset;
2623 	MD_SETDRIVERNAME(&read_params, MD_SP, sp->setno);
2624 
2625 	if (metaioctl(MD_IOC_SPREADWM, &read_params,
2626 	    &read_params.mde, compnp->cname) != 0) {
2627 
2628 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
2629 		    "Extent header read failed, block %llu.\n"), offset);
2630 		return (mdstealerror(ep, &read_params.mde));
2631 	}
2632 
2633 	/* make sure magic number is correct */
2634 	if (wm->wm_magic != MD_SP_MAGIC) {
2635 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
2636 		    "found incorrect magic number %x, expected %x.\n"),
2637 		    wm->wm_magic, MD_SP_MAGIC);
2638 		/*
2639 		 * Pass NULL for the device name as we don't have
2640 		 * valid watermark contents.
2641 		 */
2642 		return (mdmderror(ep, MDE_SP_BADWMMAGIC, 0, NULL));
2643 	}
2644 
2645 	if (crcchk((uchar_t *)wm, (uint_t *)&wm->wm_checksum,
2646 	    sizeof (*wm), NULL)) {
2647 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
2648 		    "found incorrect checksum %x.\n"),
2649 		    wm->wm_checksum);
2650 		return (mdmderror(ep, MDE_SP_BADWMCRC, 0, wm->wm_mdname));
2651 	}
2652 
2653 	return (0);
2654 }
2655 
2656 /*
2657  * **************************************************************************
2658  *                  Query Functions
2659  * **************************************************************************
2660  */
2661 
2662 /*
2663  * IMPORTANT NOTE: This is a static function that assumes that
2664  *		   its input parameters have been checked and
2665  *		   have valid values that lie within acceptable
2666  *		   ranges.
2667  *
2668  * FUNCTION:	meta_sp_enough_space()
2669  * INPUT:	desired_number_of_sps - the number of soft partitions desired;
2670  *					must be > 0
2671  *		desired_sp_size - the desired soft partition size in blocks;
2672  *				  must be > 0
2673  *		extent_listpp - a reference to a reference to an extent
2674  *				list that lists the extents on a device;
2675  *				must be a reference to a reference to a
2676  *				valid extent list
2677  *		alignment - the desired data space alignment for the sp's
2678  * OUTPUT:	boolean_t return value
2679  * RETURNS:	boolean_t - B_TRUE if there's enough space in the extent
2680  *			    list to create the desired soft partitions,
2681  *			    B_FALSE if there's not enough space
2682  * PURPOSE:	determines whether there's enough free space in an extent
2683  *		list to allow creation of a set of soft partitions
2684  */
2685 static boolean_t
2686 meta_sp_enough_space(
2687 	int		desired_number_of_sps,
2688 	blkcnt_t	desired_sp_size,
2689 	sp_ext_node_t	**extent_listpp,
2690 	sp_ext_length_t	alignment
2691 )
2692 {
2693 	boolean_t		enough_space;
2694 	int			number_of_sps;
2695 	int			number_of_extents_used;
2696 	sp_ext_length_t		desired_ext_length = desired_sp_size;
2697 
2698 	enough_space = B_TRUE;
2699 	number_of_sps = 0;
2700 	while ((enough_space == B_TRUE) &&
2701 	    (number_of_sps < desired_number_of_sps)) {
2702 		/*
2703 		 * Use the extent allocation algorithm implemented by
2704 		 * meta_sp_alloc_by_len() to test whether the free
2705 		 * extents in the extent list referenced by *extent_listpp
2706 		 * contain enough space to accomodate a soft partition
2707 		 * of size desired_ext_length.
2708 		 *
2709 		 * Repeat the test <desired_number_of_sps> times
2710 		 * or until it fails, whichever comes first,
2711 		 * each time allocating the extents required to
2712 		 * create the soft partition without actually
2713 		 * creating the soft partition.
2714 		 */
2715 		number_of_extents_used = meta_sp_alloc_by_len(
2716 		    TEST_SETNAMEP, TEST_SOFT_PARTITION_NAMEP,
2717 		    extent_listpp, &desired_ext_length,
2718 		    NO_OFFSET, alignment);
2719 		if (number_of_extents_used == -1) {
2720 			enough_space = B_FALSE;
2721 		} else {
2722 			number_of_sps++;
2723 		}
2724 	}
2725 	return (enough_space);
2726 }
2727 
2728 /*
2729  * IMPORTANT NOTE: This is a static function that calls other functions
2730  *		   that check its mdsetnamep and device_mdnamep
2731  *		   input parameters, but expects extent_listpp to
2732  *		   be a initialized to a valid address to which
2733  *		   it can write a reference to the extent list that
2734  *		   it creates.
2735  *
2736  * FUNCTION:	meta_sp_get_extent_list()
2737  * INPUT:	mdsetnamep - a reference to the mdsetname_t structure
2738  *			     for the set containing the device for
2739  *			     which the extents are to be listed
2740  *		device_mdnamep - a reference to the mdname_t structure
2741  *				 for the device for which the extents
2742  *				 are to be listed
2743  * OUTPUT:	*extent_listpp - a reference to the extent list for
2744  *				 the device; NULL if the function fails
2745  *		*ep - the libmeta error encountered, if any
2746  * RETURNS:	boolean_t - B_TRUE if the function call was successful,
2747  *			    B_FALSE if not
2748  * PURPOSE:	gets the extent list for a device
2749  */
2750 static boolean_t
2751 meta_sp_get_extent_list(
2752 	mdsetname_t	*mdsetnamep,
2753 	mdname_t	*device_mdnamep,
2754 	sp_ext_node_t	**extent_listpp,
2755 	md_error_t	*ep
2756 )
2757 {
2758 	diskaddr_t		device_size_in_blocks;
2759 	mdnamelist_t		*sp_name_listp;
2760 	diskaddr_t		start_block_address_in_blocks;
2761 
2762 	*extent_listpp = NULL;
2763 	sp_name_listp = NULL;
2764 
2765 	start_block_address_in_blocks = meta_sp_get_start(mdsetnamep,
2766 	    device_mdnamep, ep);
2767 	if (start_block_address_in_blocks == MD_DISKADDR_ERROR) {
2768 		if (getenv(META_SP_DEBUG)) {
2769 			mde_perror(ep,
2770 			    "meta_sp_get_extent_list:meta_sp_get_start");
2771 		}
2772 		return (B_FALSE);
2773 	}
2774 
2775 	device_size_in_blocks = metagetsize(device_mdnamep, ep);
2776 	if (device_size_in_blocks == MD_DISKADDR_ERROR) {
2777 		if (getenv(META_SP_DEBUG)) {
2778 			mde_perror(ep,
2779 			    "meta_sp_get_extent_list:metagetsize");
2780 		}
2781 		return (B_FALSE);
2782 	}
2783 
2784 	/*
2785 	 * Sanity check: the start block will have skipped an integer
2786 	 * number of cylinders, C.  C will usually be zero.  If (C > 0),
2787 	 * and the disk slice happens to only be C cylinders in total
2788 	 * size, we'll fail this check.
2789 	 */
2790 	if (device_size_in_blocks <=
2791 	    (start_block_address_in_blocks + MD_SP_WMSIZE)) {
2792 		(void) mdmderror(ep, MDE_SP_NOSPACE, 0, device_mdnamep->cname);
2793 		return (B_FALSE);
2794 	}
2795 
2796 	/*
2797 	 * After this point, we will have allocated resources, so any
2798 	 * failure returns must be through the supplied "fail" label
2799 	 * to properly deallocate things.
2800 	 */
2801 
2802 	/*
2803 	 * Create an empty extent list that starts one watermark past
2804 	 * the start block of the device and ends one watermark before
2805 	 * the end of the device.
2806 	 */
2807 	meta_sp_list_insert(TEST_SETNAMEP, TEST_SOFT_PARTITION_NAMEP,
2808 	    extent_listpp, NO_OFFSET,
2809 	    (sp_ext_length_t)start_block_address_in_blocks,
2810 	    EXTTYP_RESERVED, NO_SEQUENCE_NUMBER, NO_FLAGS,
2811 	    meta_sp_cmp_by_offset);
2812 	meta_sp_list_insert(TEST_SETNAMEP, TEST_SOFT_PARTITION_NAMEP,
2813 	    extent_listpp, (sp_ext_offset_t)(device_size_in_blocks -
2814 	    MD_SP_WMSIZE), MD_SP_WMSIZE, EXTTYP_END, NO_SEQUENCE_NUMBER,
2815 	    NO_FLAGS, meta_sp_cmp_by_offset);
2816 
2817 	/*
2818 	 * Get the list of soft partitions that are already on the
2819 	 * device.
2820 	 */
2821 	if (meta_sp_get_by_component(mdsetnamep, device_mdnamep,
2822 	    &sp_name_listp, FORCE_RELOAD_CACHE, ep) < 1) {
2823 		if (getenv(META_SP_DEBUG)) {
2824 			mde_perror(ep,
2825 			    "meta_sp_get_extent_list:meta_sp_get_by_component");
2826 		}
2827 		goto fail;
2828 	}
2829 
2830 	if (sp_name_listp != NULL) {
2831 		/*
2832 		 * If there are soft partitions on the device, add the
2833 		 * extents used in them to the extent list.
2834 		 */
2835 		if (meta_sp_extlist_from_namelist(mdsetnamep, sp_name_listp,
2836 		    extent_listpp, ep) == -1) {
2837 			if (getenv(META_SP_DEBUG)) {
2838 				mde_perror(ep, "meta_sp_get_extent_list:"
2839 				    "meta_sp_extlist_from_namelist");
2840 			}
2841 			goto fail;
2842 		}
2843 		metafreenamelist(sp_name_listp);
2844 	}
2845 
2846 	/*
2847 	 * Add free extents to the extent list to represent
2848 	 * the remaining regions of free space on the
2849 	 * device.
2850 	 */
2851 	meta_sp_list_freefill(extent_listpp, device_size_in_blocks);
2852 	return (B_TRUE);
2853 
2854 fail:
2855 	if (sp_name_listp != NULL) {
2856 		metafreenamelist(sp_name_listp);
2857 	}
2858 
2859 	if (*extent_listpp != NULL) {
2860 		/*
2861 		 * meta_sp_list_free sets *extent_listpp to NULL.
2862 		 */
2863 		meta_sp_list_free(extent_listpp);
2864 	}
2865 	return (B_FALSE);
2866 }
2867 
2868 /*
2869  * IMPORTANT NOTE: This is a static function that calls other functions
2870  *		   that check its mdsetnamep and mddrivenamep
2871  *		   input parameters, but expects extent_listpp to
2872  *		   be a initialized to a valid address to which
2873  *		   it can write a reference to the extent list that
2874  *		   it creates.
2875  *
2876  * FUNCTION:	meta_sp_get_extent_list_for_drive()
2877  * INPUT:	mdsetnamep - a reference to the mdsetname_t structure
2878  *			     for the set containing the drive for
2879  *			     which the extents are to be listed
2880  *		mddrivenamep   - a reference to the mddrivename_t structure
2881  *				 for the drive for which the extents
2882  *				 are to be listed
2883  * OUTPUT:	*extent_listpp - a reference to the extent list for
2884  *				 the drive; NULL if the function fails
2885  * RETURNS:	boolean_t - B_TRUE if the function call was successful,
2886  *			    B_FALSE if not
2887  * PURPOSE:	gets the extent list for a drive when the entire drive
2888  *		is to be soft partitioned
2889  */
2890 static boolean_t
2891 meta_sp_get_extent_list_for_drive(
2892 	mdsetname_t	*mdsetnamep,
2893 	mddrivename_t	*mddrivenamep,
2894 	sp_ext_node_t	**extent_listpp
2895 )
2896 {
2897 	boolean_t		can_use;
2898 	diskaddr_t		free_space;
2899 	md_error_t		mderror;
2900 	mdvtoc_t		proposed_vtoc;
2901 	int			repartition_options;
2902 	int			return_value;
2903 	md_sp_t			test_sp_struct;
2904 
2905 	can_use = B_TRUE;
2906 	*extent_listpp = NULL;
2907 	mderror = mdnullerror;
2908 	test_sp_struct.compnamep = metaslicename(mddrivenamep, MD_SLICE0,
2909 	    &mderror);
2910 	if (test_sp_struct.compnamep == NULL) {
2911 		can_use = B_FALSE;
2912 	}
2913 
2914 	if (can_use == B_TRUE) {
2915 		mderror = mdnullerror;
2916 		repartition_options = 0;
2917 		return_value = meta_check_sp(mdsetnamep, &test_sp_struct,
2918 		    MDCMD_USE_WHOLE_DISK, &repartition_options, &mderror);
2919 		if (return_value != 0) {
2920 			can_use = B_FALSE;
2921 		}
2922 	}
2923 
2924 	if (can_use == B_TRUE) {
2925 		mderror = mdnullerror;
2926 		repartition_options = repartition_options |
2927 		    (MD_REPART_FORCE | MD_REPART_DONT_LABEL);
2928 		return_value = meta_repartition_drive(mdsetnamep, mddrivenamep,
2929 		    repartition_options, &proposed_vtoc, &mderror);
2930 		if (return_value != 0) {
2931 			can_use = B_FALSE;
2932 		}
2933 	}
2934 
2935 	if (can_use == B_TRUE) {
2936 		free_space = proposed_vtoc.parts[MD_SLICE0].size;
2937 		if (free_space <= (MD_SP_START + MD_SP_WMSIZE)) {
2938 			can_use = B_FALSE;
2939 		}
2940 	}
2941 
2942 	if (can_use == B_TRUE) {
2943 		/*
2944 		 * Create an extent list that starts with
2945 		 * a reserved extent that ends at the start
2946 		 * of the usable space on slice zero of the
2947 		 * proposed VTOC, ends with an extent that
2948 		 * reserves space for a watermark at the end
2949 		 * of slice zero, and contains a single free
2950 		 * extent that occupies the rest of the space
2951 		 * on the slice.
2952 		 *
2953 		 * NOTE:
2954 		 *
2955 		 * Don't use metagetstart() or metagetsize() to
2956 		 * find the usable space.  They query the mdname_t
2957 		 * structure that represents an actual device to
2958 		 * determine the amount of space on the device that
2959 		 * contains metadata and the total amount of space
2960 		 * on the device.  Since this function creates a
2961 		 * proposed extent list that doesn't reflect the
2962 		 * state of an actual device, there's no mdname_t
2963 		 * structure to be queried.
2964 		 *
2965 		 * When a drive is reformatted to prepare for
2966 		 * soft partitioning, all of slice seven is
2967 		 * reserved for metadata, all of slice zero is
2968 		 * available for soft partitioning, and all other
2969 		 * slices on the drive are empty.  The proposed
2970 		 * extent list for the drive therefore contains
2971 		 * only three extents: a reserved extent that ends
2972 		 * at the start of the usable space on slice zero,
2973 		 * a single free extent that occupies all the usable
2974 		 * space on slice zero, and an ending extent that
2975 		 * reserves space for a watermark at the end of
2976 		 * slice zero.
2977 		 */
2978 		meta_sp_list_insert(TEST_SETNAMEP, TEST_SOFT_PARTITION_NAMEP,
2979 		    extent_listpp, NO_OFFSET, (sp_ext_length_t)(MD_SP_START),
2980 		    EXTTYP_RESERVED, NO_SEQUENCE_NUMBER, NO_FLAGS,
2981 		    meta_sp_cmp_by_offset);
2982 		meta_sp_list_insert(TEST_SETNAMEP, TEST_SOFT_PARTITION_NAMEP,
2983 		    extent_listpp, (sp_ext_offset_t)(free_space - MD_SP_WMSIZE),
2984 		    MD_SP_WMSIZE, EXTTYP_END, NO_SEQUENCE_NUMBER, NO_FLAGS,
2985 		    meta_sp_cmp_by_offset);
2986 		meta_sp_list_freefill(extent_listpp, free_space);
2987 	}
2988 	return (can_use);
2989 }
2990 
2991 /*
2992  * FUNCTION:	meta_sp_can_create_sps()
2993  * INPUT:	mdsetnamep - a reference to the mdsetname_t structure
2994  *			     for the set containing the device for
2995  *			     which the extents are to be listed
2996  *		mdnamep - a reference to the mdname_t of the device
2997  *			  on which the soft partitions are to be created
2998  *		number_of_sps - the desired number of soft partitions
2999  *		sp_size - the desired soft partition size
3000  * OUTPUT:	boolean_t return value
3001  * RETURNS:	boolean_t - B_TRUE if the soft partitions can be created,
3002  *			    B_FALSE if not
3003  * PURPOSE:	determines whether a set of soft partitions can be created
3004  *		on a device
3005  */
3006 boolean_t
3007 meta_sp_can_create_sps(
3008 	mdsetname_t	*mdsetnamep,
3009 	mdname_t	*mdnamep,
3010 	int		number_of_sps,
3011 	blkcnt_t	sp_size
3012 )
3013 {
3014 	sp_ext_node_t	*extent_listp;
3015 	boolean_t	succeeded;
3016 	md_error_t	mde;
3017 
3018 	if ((number_of_sps > 0) && (sp_size > 0)) {
3019 		succeeded = meta_sp_get_extent_list(mdsetnamep, mdnamep,
3020 		    &extent_listp, &mde);
3021 	} else {
3022 		succeeded = B_FALSE;
3023 	}
3024 
3025 	/*
3026 	 * We don't really care about an error return from the
3027 	 * alignment call; that will just result in passing zero,
3028 	 * which will be interpreted as no alignment.
3029 	 */
3030 
3031 	if (succeeded == B_TRUE) {
3032 		succeeded = meta_sp_enough_space(number_of_sps,
3033 		    sp_size, &extent_listp,
3034 		    meta_sp_get_default_alignment(mdsetnamep, mdnamep, &mde));
3035 		meta_sp_list_free(&extent_listp);
3036 	}
3037 	return (succeeded);
3038 }
3039 
3040 /*
3041  * FUNCTION:	meta_sp_can_create_sps_on_drive()
3042  * INPUT:	mdsetnamep - a reference to the mdsetname_t structure
3043  *			     for the set containing the drive for
3044  *			     which the extents are to be listed
3045  *		mddrivenamep - a reference to the mddrivename_t of the drive
3046  *			       on which the soft partitions are to be created
3047  *		number_of_sps - the desired number of soft partitions
3048  *		sp_size - the desired soft partition size
3049  * OUTPUT:	boolean_t return value
3050  * RETURNS:	boolean_t - B_TRUE if the soft partitions can be created,
3051  *			    B_FALSE if not
3052  * PURPOSE:	determines whether a set of soft partitions can be created
3053  *		on a drive if the entire drive is soft partitioned
3054  */
3055 boolean_t
3056 meta_sp_can_create_sps_on_drive(
3057 	mdsetname_t	*mdsetnamep,
3058 	mddrivename_t	*mddrivenamep,
3059 	int		number_of_sps,
3060 	blkcnt_t	sp_size
3061 )
3062 {
3063 	sp_ext_node_t	*extent_listp;
3064 	boolean_t	succeeded;
3065 
3066 	if ((number_of_sps > 0) && (sp_size > 0)) {
3067 		succeeded = meta_sp_get_extent_list_for_drive(mdsetnamep,
3068 		    mddrivenamep, &extent_listp);
3069 	} else {
3070 		succeeded = B_FALSE;
3071 	}
3072 
3073 	/*
3074 	 * We don't care about alignment on the space call because
3075 	 * we're specifically dealing with a drive, which will have no
3076 	 * inherent alignment.
3077 	 */
3078 
3079 	if (succeeded == B_TRUE) {
3080 		succeeded = meta_sp_enough_space(number_of_sps, sp_size,
3081 		    &extent_listp, SP_UNALIGNED);
3082 		meta_sp_list_free(&extent_listp);
3083 	}
3084 	return (succeeded);
3085 }
3086 
3087 /*
3088  * FUNCTION:	meta_sp_get_free_space()
3089  * INPUT:	mdsetnamep - a reference to the mdsetname_t structure
3090  *			     for the set containing the device for
3091  *			     which the free space is to be returned
3092  *		mdnamep - a reference to the mdname_t of the device
3093  *			  for which the free space is to be returned
3094  * OUTPUT:	blkcnt_t return value
3095  * RETURNS:	blkcnt_t - the number of blocks of free space on the device
3096  * PURPOSE:	returns the number of blocks of free space on a device
3097  */
3098 blkcnt_t
3099 meta_sp_get_free_space(
3100 	mdsetname_t	*mdsetnamep,
3101 	mdname_t	*mdnamep
3102 )
3103 {
3104 	sp_ext_node_t		*extent_listp;
3105 	sp_ext_length_t		free_blocks;
3106 	boolean_t		succeeded;
3107 	md_error_t		mde;
3108 
3109 	extent_listp = NULL;
3110 	free_blocks = 0;
3111 	succeeded = meta_sp_get_extent_list(mdsetnamep, mdnamep,
3112 	    &extent_listp, &mde);
3113 	if (succeeded == B_TRUE) {
3114 		free_blocks = meta_sp_list_size(extent_listp,
3115 		    EXTTYP_FREE, INCLUDE_WM);
3116 		meta_sp_list_free(&extent_listp);
3117 		if (free_blocks > (10 * MD_SP_WMSIZE)) {
3118 			/*
3119 			 * Subtract a safety margin for watermarks when
3120 			 * computing the number of blocks available for
3121 			 * use.  The actual number of watermarks can't
3122 			 * be calculated without knowing the exact numbers
3123 			 * and sizes of both the free extents and the soft
3124 			 * partitions to be created.  The calculation is
3125 			 * highly complex and error-prone even if those
3126 			 * quantities are known.  The approximate value
3127 			 * 10 * MD_SP_WMSIZE is within a few blocks of the
3128 			 * correct value in all practical cases.
3129 			 */
3130 			free_blocks = free_blocks - (10 * MD_SP_WMSIZE);
3131 		} else {
3132 			free_blocks = 0;
3133 		}
3134 	} else {
3135 		mdclrerror(&mde);
3136 	}
3137 
3138 	return (free_blocks);
3139 }
3140 
3141 /*
3142  * FUNCTION:	meta_sp_get_free_space_on_drive()
3143  * INPUT:	mdsetnamep - a reference to the mdsetname_t structure
3144  *			     for the set containing the drive for
3145  *			     which the free space is to be returned
3146  *		mddrivenamep - a reference to the mddrivename_t of the drive
3147  *			       for which the free space is to be returned
3148  * OUTPUT:	blkcnt_t return value
3149  * RETURNS:	blkcnt_t - the number of blocks of free space on the drive
3150  * PURPOSE:	returns the number of blocks of space usable for soft
3151  *		partitions on an entire drive, if the entire drive is
3152  *		soft partitioned
3153  */
3154 blkcnt_t
3155 meta_sp_get_free_space_on_drive(
3156 	mdsetname_t	*mdsetnamep,
3157 	mddrivename_t	*mddrivenamep
3158 )
3159 {
3160 	sp_ext_node_t		*extent_listp;
3161 	sp_ext_length_t		free_blocks;
3162 	boolean_t		succeeded;
3163 
3164 	extent_listp = NULL;
3165 	free_blocks = 0;
3166 	succeeded = meta_sp_get_extent_list_for_drive(mdsetnamep,
3167 	    mddrivenamep, &extent_listp);
3168 	if (succeeded == B_TRUE) {
3169 		free_blocks = meta_sp_list_size(extent_listp,
3170 		    EXTTYP_FREE, INCLUDE_WM);
3171 		meta_sp_list_free(&extent_listp);
3172 		if (free_blocks > (10 * MD_SP_WMSIZE)) {
3173 			/*
3174 			 * Subtract a safety margin for watermarks when
3175 			 * computing the number of blocks available for
3176 			 * use.  The actual number of watermarks can't
3177 			 * be calculated without knowing the exact numbers
3178 			 * and sizes of both the free extents and the soft
3179 			 * partitions to be created.  The calculation is
3180 			 * highly complex and error-prone even if those
3181 			 * quantities are known.  The approximate value
3182 			 * 10 * MD_SP_WMSIZE is within a few blocks of the
3183 			 * correct value in all practical cases.
3184 			 */
3185 			free_blocks = free_blocks - (10 * MD_SP_WMSIZE);
3186 		} else {
3187 			free_blocks = 0;
3188 		}
3189 	}
3190 	return (free_blocks);
3191 }
3192 
3193 /*
3194  * FUNCTION:	meta_sp_get_number_of_possible_sps()
3195  * INPUT:	mdsetnamep - a reference to the mdsetname_t structure
3196  *			     for the set containing the device for
3197  *			     which the number of possible soft partitions
3198  *			     is to be returned
3199  *		mdnamep - a reference to the mdname_t of the device
3200  *			  for which the number of possible soft partitions
3201  *			  is to be returned
3202  * OUTPUT:	int return value
3203  * RETURNS:	int - the number of soft partitions of the desired size
3204  *		      that can be created on the device
3205  * PURPOSE:	returns the number of soft partitions of a given size
3206  *		that can be created on a device
3207  */
3208 int
3209 meta_sp_get_number_of_possible_sps(
3210 	mdsetname_t	*mdsetnamep,
3211 	mdname_t	*mdnamep,
3212 	blkcnt_t	sp_size
3213 )
3214 {
3215 	sp_ext_node_t	*extent_listp;
3216 	int		number_of_possible_sps;
3217 	boolean_t	succeeded;
3218 	md_error_t	mde;
3219 	sp_ext_length_t	alignment;
3220 
3221 	extent_listp = NULL;
3222 	number_of_possible_sps = 0;
3223 	if (sp_size > 0) {
3224 		if ((succeeded = meta_sp_get_extent_list(mdsetnamep,
3225 		    mdnamep, &extent_listp, &mde)) == B_FALSE)
3226 			mdclrerror(&mde);
3227 	} else {
3228 		succeeded = B_FALSE;
3229 	}
3230 
3231 	if (succeeded == B_TRUE) {
3232 		alignment = meta_sp_get_default_alignment(mdsetnamep,
3233 		    mdnamep, &mde);
3234 	}
3235 
3236 	while (succeeded == B_TRUE) {
3237 		/*
3238 		 * Keep allocating space from the extent list
3239 		 * for soft partitions of the desired size until
3240 		 * there's not enough free space left in the list
3241 		 * for another soft partiition of that size.
3242 		 * Add one to the number of possible soft partitions
3243 		 * for each soft partition for which there is
3244 		 * enough free space left.
3245 		 */
3246 		succeeded = meta_sp_enough_space(ONE_SOFT_PARTITION,
3247 		    sp_size, &extent_listp, alignment);
3248 		if (succeeded == B_TRUE) {
3249 			number_of_possible_sps++;
3250 		}
3251 	}
3252 	if (extent_listp != NULL) {
3253 		meta_sp_list_free(&extent_listp);
3254 	}
3255 	return (number_of_possible_sps);
3256 }
3257 
3258 /*
3259  * FUNCTION:	meta_sp_get_number_of_possible_sps_on_drive()
3260  * INPUT:	mdsetnamep - a reference to the mdsetname_t structure
3261  *			     for the set containing the drive for
3262  *			     which the number of possible soft partitions
3263  *			     is to be returned
3264  *		mddrivenamep - a reference to the mddrivename_t of the drive
3265  *			       for which the number of possible soft partitions
3266  *			       is to be returned
3267  *		sp_size - the size in blocks of the proposed soft partitions
3268  * OUTPUT:	int return value
3269  * RETURNS:	int - the number of soft partitions of the desired size
3270  *		      that can be created on the drive
3271  * PURPOSE:	returns the number of soft partitions of a given size
3272  *		that can be created on a drive, if the entire drive is
3273  *		soft partitioned
3274  */
3275 int
3276 meta_sp_get_number_of_possible_sps_on_drive(
3277 	mdsetname_t	*mdsetnamep,
3278 	mddrivename_t	*mddrivenamep,
3279 	blkcnt_t	sp_size
3280 )
3281 {
3282 	sp_ext_node_t	*extent_listp;
3283 	int		number_of_possible_sps;
3284 	boolean_t	succeeded;
3285 
3286 	extent_listp = NULL;
3287 	number_of_possible_sps = 0;
3288 	if (sp_size > 0) {
3289 		succeeded = meta_sp_get_extent_list_for_drive(mdsetnamep,
3290 		    mddrivenamep, &extent_listp);
3291 	} else {
3292 		succeeded = B_FALSE;
3293 	}
3294 	while (succeeded == B_TRUE) {
3295 		/*
3296 		 * Keep allocating space from the extent list
3297 		 * for soft partitions of the desired size until
3298 		 * there's not enough free space left in the list
3299 		 * for another soft partition of that size.
3300 		 * Add one to the number of possible soft partitions
3301 		 * for each soft partition for which there is
3302 		 * enough free space left.
3303 		 *
3304 		 * Since it's a drive, not a metadevice, make no
3305 		 * assumptions about alignment.
3306 		 */
3307 		succeeded = meta_sp_enough_space(ONE_SOFT_PARTITION,
3308 		    sp_size, &extent_listp, SP_UNALIGNED);
3309 		if (succeeded == B_TRUE) {
3310 			number_of_possible_sps++;
3311 		}
3312 	}
3313 	if (extent_listp != NULL) {
3314 		meta_sp_list_free(&extent_listp);
3315 	}
3316 	return (number_of_possible_sps);
3317 }
3318 
3319 /*
3320  * FUNCTION:	meta_sp_get_possible_sp_size()
3321  * INPUT:	mdsetnamep - a reference to the mdsetname_t structure
3322  *			     for the set containing the device for
3323  *			     which the possible soft partition size
3324  *			     is to be returned
3325  *		mdnamep - a reference to the mdname_t of the device
3326  *			  for which the possible soft partition size
3327  *			  is to be returned
3328  *		number_of_sps - the desired number of soft partitions
3329  * OUTPUT:	blkcnt_t return value
3330  * RETURNS:	blkcnt_t - the possible soft partition size in blocks
3331  * PURPOSE:	returns the maximum possible size of each of a given number of
3332  *		soft partitions of equal size that can be created on a device
3333  */
3334 blkcnt_t
3335 meta_sp_get_possible_sp_size(
3336 	mdsetname_t	*mdsetnamep,
3337 	mdname_t	*mdnamep,
3338 	int		number_of_sps
3339 )
3340 {
3341 	blkcnt_t	free_blocks;
3342 	blkcnt_t	sp_size;
3343 	boolean_t	succeeded;
3344 
3345 	sp_size = 0;
3346 	if (number_of_sps > 0) {
3347 		free_blocks = meta_sp_get_free_space(mdsetnamep, mdnamep);
3348 		sp_size = free_blocks / number_of_sps;
3349 		succeeded = meta_sp_can_create_sps(mdsetnamep, mdnamep,
3350 		    number_of_sps, sp_size);
3351 		while ((succeeded == B_FALSE) && (sp_size > 0)) {
3352 			/*
3353 			 * To compensate for space that may have been
3354 			 * occupied by watermarks, reduce sp_size by a
3355 			 * number of blocks equal to the number of soft
3356 			 * partitions desired, and test again to see
3357 			 * whether the desired number of soft partitions
3358 			 * can be created.
3359 			 */
3360 			sp_size = sp_size - ((blkcnt_t)number_of_sps);
3361 			succeeded = meta_sp_can_create_sps(mdsetnamep, mdnamep,
3362 			    number_of_sps, sp_size);
3363 		}
3364 		if (sp_size < 0) {
3365 			sp_size = 0;
3366 		}
3367 	}
3368 	return (sp_size);
3369 }
3370 
3371 /*
3372  * FUNCTION:	meta_sp_get_possible_sp_size_on_drive()
3373  * INPUT:	mdsetnamep - a reference to the mdsetname_t structure
3374  *			     for the set containing the drive for
3375  *			     which the possible soft partition size
3376  *			     is to be returned
3377  *		mddrivenamep - a reference to the mddrivename_t of the drive
3378  *			       for which the possible soft partition size
3379  *			       is to be returned
3380  *		number_of_sps - the desired number of soft partitions
3381  * OUTPUT:	blkcnt_t return value
3382  * RETURNS:	blkcnt_t - the possible soft partition size in blocks
3383  * PURPOSE:	returns the maximum possible size of each of a given number of
3384  *		soft partitions of equal size that can be created on a drive
3385  *              if the entire drive is soft partitioned
3386  */
3387 blkcnt_t
3388 meta_sp_get_possible_sp_size_on_drive(
3389 	mdsetname_t	*mdsetnamep,
3390 	mddrivename_t	*mddrivenamep,
3391 	int		number_of_sps
3392 )
3393 {
3394 	blkcnt_t	free_blocks;
3395 	blkcnt_t	sp_size;
3396 	boolean_t	succeeded;
3397 
3398 	sp_size = 0;
3399 	if (number_of_sps > 0) {
3400 		free_blocks = meta_sp_get_free_space_on_drive(mdsetnamep,
3401 		    mddrivenamep);
3402 		sp_size = free_blocks / number_of_sps;
3403 		succeeded = meta_sp_can_create_sps_on_drive(mdsetnamep,
3404 		    mddrivenamep, number_of_sps, sp_size);
3405 		while ((succeeded == B_FALSE) && (sp_size > 0)) {
3406 			/*
3407 			 * To compensate for space that may have been
3408 			 * occupied by watermarks, reduce sp_size by a
3409 			 * number of blocks equal to the number of soft
3410 			 * partitions desired, and test again to see
3411 			 * whether the desired number of soft partitions
3412 			 * can be created.
3413 			 */
3414 			sp_size = sp_size - ((blkcnt_t)number_of_sps);
3415 			succeeded = meta_sp_can_create_sps_on_drive(mdsetnamep,
3416 			    mddrivenamep, number_of_sps, sp_size);
3417 		}
3418 		if (sp_size < 0) {
3419 			sp_size = 0;
3420 		}
3421 	}
3422 	return (sp_size);
3423 }
3424 
3425 /*
3426  * **************************************************************************
3427  *                  Unit Structure Manipulation Functions                   *
3428  * **************************************************************************
3429  */
3430 
3431 /*
3432  * FUNCTION:	meta_sp_fillextarray()
3433  * INPUT:	mp	- the unit structure to fill
3434  *		extlist	- the list of extents to fill with
3435  * OUTPUT:	none
3436  * RETURNS:	void
3437  * PURPOSE:	fills in the unit structure extent list with the extents
3438  *		specified by extlist.  Only extents in extlist with the
3439  *		EXTFLG_UPDATE flag are changed in the unit structure,
3440  *		and the index into the unit structure is the sequence
3441  *		number in the extent list.  After all of the nodes have
3442  *		been updated the virtual offsets in the unit structure
3443  *		are updated to reflect the new lengths.
3444  */
3445 static void
3446 meta_sp_fillextarray(
3447 	mp_unit_t	*mp,
3448 	sp_ext_node_t	*extlist
3449 )
3450 {
3451 	int	i;
3452 	sp_ext_node_t	*ext;
3453 	sp_ext_offset_t	curvoff = 0LL;
3454 
3455 	assert(mp != NULL);
3456 
3457 	/* go through the allocation list and fill in our unit structure */
3458 	for (ext = extlist; ext != NULL; ext = ext->ext_next) {
3459 		if ((ext->ext_type == EXTTYP_ALLOC) &&
3460 		    (ext->ext_flags & EXTFLG_UPDATE) != 0) {
3461 			mp->un_ext[ext->ext_seq].un_poff =
3462 			    ext->ext_offset + MD_SP_WMSIZE;
3463 			mp->un_ext[ext->ext_seq].un_len =
3464 			    ext->ext_length - MD_SP_WMSIZE;
3465 		}
3466 	}
3467 
3468 	for (i = 0; i < mp->un_numexts; i++) {
3469 		assert(mp->un_ext[i].un_poff != 0);
3470 		assert(mp->un_ext[i].un_len  != 0);
3471 		mp->un_ext[i].un_voff = curvoff;
3472 		curvoff += mp->un_ext[i].un_len;
3473 	}
3474 }
3475 
3476 /*
3477  * FUNCTION:	meta_sp_createunit()
3478  * INPUT:	np	- the name of the device to create a unit structure for
3479  *		compnp	- the name of the device the soft partition is on
3480  *		extlist	- the extent list to populate the new unit with
3481  *		numexts	- the number of extents in the extent list
3482  *		len	- the total size of the soft partition (sectors)
3483  *		status	- the initial status of the unit structure
3484  * OUTPUT:	ep	- return error pointer
3485  * RETURNS:	mp_unit_t * - the new unit structure.
3486  * PURPOSE:	allocates and fills in a new soft partition unit
3487  *		structure to be passed to the soft partitioning driver
3488  *		for creation.
3489  */
3490 static mp_unit_t *
3491 meta_sp_createunit(
3492 	mdname_t	*np,
3493 	mdname_t	*compnp,
3494 	sp_ext_node_t	*extlist,
3495 	int		numexts,
3496 	sp_ext_length_t	len,
3497 	sp_status_t	status,
3498 	md_error_t	*ep
3499 )
3500 {
3501 	mp_unit_t	*mp;
3502 	uint_t		ms_size;
3503 
3504 	ms_size = (sizeof (*mp) - sizeof (mp->un_ext[0])) +
3505 	    (numexts * sizeof (mp->un_ext[0]));
3506 
3507 	mp = Zalloc(ms_size);
3508 
3509 	/* fill in fields in common unit structure */
3510 	mp->c.un_type = MD_METASP;
3511 	mp->c.un_size = ms_size;
3512 	MD_SID(mp) = meta_getminor(np->dev);
3513 	mp->c.un_total_blocks = len;
3514 	mp->c.un_actual_tb = len;
3515 
3516 	/* set up geometry */
3517 	(void) meta_sp_setgeom(np, compnp, mp, ep);
3518 
3519 	/* if we're building on metadevice we can't parent */
3520 	if (metaismeta(compnp))
3521 		MD_CAPAB(mp) = MD_CANT_PARENT;
3522 	else
3523 		MD_CAPAB(mp) = MD_CAN_PARENT;
3524 
3525 	/* fill soft partition-specific fields */
3526 	mp->un_dev = compnp->dev;
3527 	mp->un_key = compnp->key;
3528 
3529 	/* mdname_t start_blk field is not 64-bit! */
3530 	mp->un_start_blk = (sp_ext_offset_t)compnp->start_blk;
3531 	mp->un_status = status;
3532 	mp->un_numexts = numexts;
3533 	mp->un_length = len;
3534 
3535 	/* fill in the extent array */
3536 	meta_sp_fillextarray(mp, extlist);
3537 
3538 	return (mp);
3539 }
3540 
3541 /*
3542  * FUNCTION:	meta_sp_updateunit()
3543  * INPUT:	np       - name structure for the metadevice being updated
3544  *		old_un	 - the original unit structure that is being updated
3545  *		extlist	 - the extent list to populate the new unit with
3546  *		grow_len - the amount by which the partition is being grown
3547  *		numexts	 - the number of extents in the extent list
3548  *		ep       - return error pointer
3549  * OUTPUT:	none
3550  * RETURNS:	mp_unit_t * - the updated unit structure
3551  * PURPOSE:	allocates and fills in a new soft partition unit structure to
3552  *		be passed to the soft partitioning driver for creation.  The
3553  *		old unit structure is first copied in, and then the updated
3554  *		extents are changed in the new unit structure.  This is
3555  *		typically used when the size of an existing unit is changed.
3556  */
3557 static mp_unit_t *
3558 meta_sp_updateunit(
3559 	mdname_t	*np,
3560 	mp_unit_t	*old_un,
3561 	sp_ext_node_t	*extlist,
3562 	sp_ext_length_t	grow_len,
3563 	int		numexts,
3564 	md_error_t	*ep
3565 )
3566 {
3567 	mp_unit_t	*new_un;
3568 	sp_ext_length_t	new_len;
3569 	uint_t		new_size;
3570 
3571 	assert(old_un != NULL);
3572 	assert(extlist != NULL);
3573 
3574 	/* allocate new unit structure and copy in old unit */
3575 	new_size = (sizeof (*old_un) - sizeof (old_un->un_ext[0])) +
3576 	    ((old_un->un_numexts + numexts) * sizeof (old_un->un_ext[0]));
3577 	new_len = old_un->un_length + grow_len;
3578 	new_un = Zalloc(new_size);
3579 	bcopy(old_un, new_un, old_un->c.un_size);
3580 
3581 	/* update size and geometry information */
3582 	new_un->c.un_size = new_size;
3583 	new_un->un_length = new_len;
3584 	new_un->c.un_total_blocks = new_len;
3585 	new_un->c.un_actual_tb = new_len;
3586 	if (meta_adjust_geom((md_unit_t *)new_un, np,
3587 	    old_un->c.un_wr_reinstruct, old_un->c.un_rd_reinstruct,
3588 	    0, ep) != 0) {
3589 		Free(new_un);
3590 		return (NULL);
3591 	}
3592 
3593 	/* update extent information */
3594 	new_un->un_numexts += numexts;
3595 
3596 	meta_sp_fillextarray(new_un, extlist);
3597 
3598 	return (new_un);
3599 }
3600 
3601 /*
3602  * FUNCTION:	meta_get_sp()
3603  * INPUT:	sp	- the set name for the device to get
3604  *		np	- the name of the device to get
3605  * OUTPUT:	ep	- return error pointer
3606  * RETURNS:	md_sp_t * - the XDR unit structure for the soft partition
3607  * PURPOSE:	interface to the rest of libmeta for fetching a unit structure
3608  *		for the named device.  Just a wrapper for meta_get_sp_common().
3609  */
3610 md_sp_t *
3611 meta_get_sp(
3612 	mdsetname_t	*sp,
3613 	mdname_t	*np,
3614 	md_error_t	*ep
3615 )
3616 {
3617 	return (meta_get_sp_common(sp, np, 0, ep));
3618 }
3619 
3620 /*
3621  * FUNCTION:	meta_get_sp_common()
3622  * INPUT:	sp	- the set name for the device to get
3623  *		np	- the name of the device to get
3624  *		fast	- whether to use the cache or not (NOT IMPLEMENTED!)
3625  * OUTPUT:	ep	- return error pointer
3626  * RETURNS:	md_sp_t * - the XDR unit structure for the soft partition,
3627  *			    NULL if np is not a soft partition
3628  * PURPOSE:	common routine for fetching a soft partition unit structure
3629  */
3630 md_sp_t *
3631 meta_get_sp_common(
3632 	mdsetname_t	*sp,
3633 	mdname_t	*np,
3634 	int		fast,
3635 	md_error_t	*ep
3636 )
3637 {
3638 	mddrivename_t	*dnp = np->drivenamep;
3639 	char		*miscname;
3640 	mp_unit_t	*mp;
3641 	md_sp_t		*msp;
3642 	int		i;
3643 
3644 	/* must have set */
3645 	assert(sp != NULL);
3646 
3647 	/* short circuit */
3648 	if (dnp->unitp != NULL) {
3649 		if (dnp->unitp->type != MD_METASP)
3650 			return (NULL);
3651 		return ((md_sp_t *)dnp->unitp);
3652 	}
3653 	/* get miscname and unit */
3654 	if ((miscname = metagetmiscname(np, ep)) == NULL)
3655 		return (NULL);
3656 
3657 	if (strcmp(miscname, MD_SP) != 0) {
3658 		(void) mdmderror(ep, MDE_NOT_SP, 0, np->cname);
3659 		return (NULL);
3660 	}
3661 
3662 	if ((mp = (mp_unit_t *)meta_get_mdunit(sp, np, ep)) == NULL)
3663 		return (NULL);
3664 
3665 	assert(mp->c.un_type == MD_METASP);
3666 
3667 	/* allocate soft partition */
3668 	msp = Zalloc(sizeof (*msp));
3669 
3670 	/* get the common information */
3671 	msp->common.namep = np;
3672 	msp->common.type = mp->c.un_type;
3673 	msp->common.state = mp->c.un_status;
3674 	msp->common.capabilities = mp->c.un_capabilities;
3675 	msp->common.parent = mp->c.un_parent;
3676 	msp->common.size = mp->c.un_total_blocks;
3677 	msp->common.user_flags = mp->c.un_user_flags;
3678 	msp->common.revision = mp->c.un_revision;
3679 
3680 	/* get soft partition information */
3681 	if ((msp->compnamep = metakeyname(&sp, mp->un_key, fast, ep)) == NULL)
3682 		goto out;
3683 
3684 	/*
3685 	 * Fill in the key and the start block.  Note that the start
3686 	 * block in the unit structure is 64 bits but the name pointer
3687 	 * only supports 32 bits.
3688 	 */
3689 	msp->compnamep->key = mp->un_key;
3690 	msp->compnamep->start_blk = mp->un_start_blk;
3691 
3692 	/* fill in status field */
3693 	msp->status = mp->un_status;
3694 
3695 	/* allocate the extents */
3696 	msp->ext.ext_val = Zalloc(mp->un_numexts * sizeof (*msp->ext.ext_val));
3697 	msp->ext.ext_len = mp->un_numexts;
3698 
3699 	/* do the extents for this soft partition */
3700 	for (i = 0; i < mp->un_numexts; i++) {
3701 		struct mp_ext	*mde = &mp->un_ext[i];
3702 		md_sp_ext_t	*extp = &msp->ext.ext_val[i];
3703 
3704 		extp->voff = mde->un_voff;
3705 		extp->poff = mde->un_poff;
3706 		extp->len = mde->un_len;
3707 	}
3708 
3709 	/* cleanup, return success */
3710 	Free(mp);
3711 	dnp->unitp = (md_common_t *)msp;
3712 	return (msp);
3713 
3714 out:
3715 	/* clean up and return error */
3716 	Free(mp);
3717 	Free(msp);
3718 	return (NULL);
3719 }
3720 
3721 
3722 /*
3723  * FUNCTION:	meta_init_sp()
3724  * INPUT:	spp	- the set name for the new device
3725  *		argc	- the remaining argument count for the metainit cmdline
3726  *		argv	- the remainder of the unparsed command line
3727  *		options	- global options parsed by metainit
3728  * OUTPUT:	ep	- return error pointer
3729  * RETURNS:	int	- -1 failure, 0 success
3730  * PURPOSE:	provides the command line parsing and name management overhead
3731  *		for creating a new soft partition.  Ultimately this calls
3732  *		meta_create_sp() which does the real work of allocating space
3733  *		for the new soft partition.
3734  */
3735 int
3736 meta_init_sp(
3737 	mdsetname_t	**spp,
3738 	int		argc,
3739 	char		*argv[],
3740 	mdcmdopts_t	options,
3741 	md_error_t	*ep
3742 )
3743 {
3744 	char		*compname = NULL;
3745 	mdname_t	*spcompnp = NULL;	/* name of component volume */
3746 	char		*devname = argv[0];	/* unit name */
3747 	mdname_t	*np = NULL;		/* name of soft partition */
3748 	md_sp_t		*msp = NULL;
3749 	int		c;
3750 	int		old_optind;
3751 	sp_ext_length_t	len = 0LL;
3752 	int		rval = -1;
3753 	uint_t		seq;
3754 	int		oflag;
3755 	int		failed;
3756 	mddrivename_t	*dnp = NULL;
3757 	sp_ext_length_t	alignment = 0LL;
3758 	sp_ext_node_t	*extlist = NULL;
3759 
3760 	assert(argc > 0);
3761 
3762 	/* expect sp name, -p, optional -e, compname, and size parameters */
3763 	/* grab soft partition name */
3764 	if ((np = metaname(spp, devname, META_DEVICE, ep)) == NULL)
3765 		goto out;
3766 
3767 	/* see if it exists already */
3768 	if (metagetmiscname(np, ep) != NULL) {
3769 		(void) mdmderror(ep, MDE_UNIT_ALREADY_SETUP,
3770 		    meta_getminor(np->dev), devname);
3771 		goto out;
3772 	} else if (! mdismderror(ep, MDE_UNIT_NOT_SETUP)) {
3773 		goto out;
3774 	} else {
3775 		mdclrerror(ep);
3776 	}
3777 	--argc, ++argv;
3778 
3779 	if (argc == 0)
3780 		goto syntax;
3781 
3782 	/* grab -p */
3783 	if (strcmp(argv[0], "-p") != 0)
3784 		goto syntax;
3785 	--argc, ++argv;
3786 
3787 	if (argc == 0)
3788 		goto syntax;
3789 
3790 	/* see if -e is there */
3791 	if (strcmp(argv[0], "-e") == 0) {
3792 		/* use the whole disk */
3793 		options |= MDCMD_USE_WHOLE_DISK;
3794 		--argc, ++argv;
3795 	}
3796 
3797 	if (argc == 0)
3798 		goto syntax;
3799 
3800 	/* get component name */
3801 	compname = Strdup(argv[0]);
3802 
3803 	if (options & MDCMD_USE_WHOLE_DISK) {
3804 		if ((dnp = metadrivename(spp, compname, ep)) == NULL) {
3805 			goto out;
3806 		}
3807 		if ((spcompnp = metaslicename(dnp, 0, ep)) == NULL) {
3808 			goto out;
3809 		}
3810 	} else if ((spcompnp = metaname(spp, compname, UNKNOWN, ep)) == NULL) {
3811 		goto out;
3812 	}
3813 	assert(*spp != NULL);
3814 
3815 	if (!(options & MDCMD_NOLOCK)) {
3816 		/* grab set lock */
3817 		if (meta_lock(*spp, TRUE, ep))
3818 			goto out;
3819 
3820 		if (meta_check_ownership(*spp, ep) != 0)
3821 			goto out;
3822 	}
3823 
3824 	/* allocate the soft partition */
3825 	msp = Zalloc(sizeof (*msp));
3826 
3827 	/* setup common */
3828 	msp->common.namep = np;
3829 	msp->common.type = MD_METASP;
3830 
3831 	compname = spcompnp->cname;
3832 
3833 	assert(spcompnp->rname != NULL);
3834 	--argc, ++argv;
3835 
3836 	if (argc == 0) {
3837 		goto syntax;
3838 	}
3839 
3840 	if (*argv[0] == '-') {
3841 		/*
3842 		 * parse any other command line options, this includes
3843 		 * the recovery options -o and -b. The special thing
3844 		 * with these options is that the len needs to be
3845 		 * kept track of otherwise when the geometry of the
3846 		 * "device" is built it will create an invalid geometry
3847 		 */
3848 		old_optind = optind = 0;
3849 		opterr = 0;
3850 		oflag = 0;
3851 		seq = 0;
3852 		failed = 0;
3853 		while ((c = getopt(argc, argv, "A:o:b:")) != -1) {
3854 			sp_ext_offset_t	offset;
3855 			sp_ext_length_t	length;
3856 			longlong_t	tmp_size;
3857 
3858 			switch (c) {
3859 			case 'A':	/* data alignment */
3860 				if (meta_sp_parsesizestring(optarg,
3861 				    &alignment) == -1) {
3862 					failed = 1;
3863 				}
3864 				break;
3865 			case 'o':	/* offset in the partition */
3866 				if (oflag == 1) {
3867 					failed = 1;
3868 				} else {
3869 					tmp_size = atoll(optarg);
3870 					if (tmp_size <= 0) {
3871 						failed = 1;
3872 					} else {
3873 						oflag = 1;
3874 						options |= MDCMD_DIRECT;
3875 
3876 						offset = tmp_size;
3877 					}
3878 				}
3879 
3880 				break;
3881 			case 'b':	/* number of blocks */
3882 				if (oflag == 0) {
3883 					failed = 1;
3884 				} else {
3885 					tmp_size = atoll(optarg);
3886 					if (tmp_size <= 0) {
3887 						failed = 1;
3888 					} else {
3889 						oflag = 0;
3890 
3891 						length = tmp_size;
3892 
3893 						/* we have a pair of values */
3894 						meta_sp_list_insert(*spp, np,
3895 						    &extlist, offset, length,
3896 						    EXTTYP_ALLOC, seq++,
3897 						    EXTFLG_UPDATE,
3898 						    meta_sp_cmp_by_offset);
3899 						len += length;
3900 					}
3901 				}
3902 
3903 				break;
3904 			default:
3905 				argc -= old_optind;
3906 				argv += old_optind;
3907 				goto options;
3908 			}
3909 
3910 			if (failed) {
3911 				argc -= old_optind;
3912 				argv += old_optind;
3913 				goto syntax;
3914 			}
3915 
3916 			old_optind = optind;
3917 		}
3918 		argc -= optind;
3919 		argv += optind;
3920 
3921 		/*
3922 		 * Must have matching pairs of -o and -b flags
3923 		 */
3924 		if (oflag != 0)
3925 			goto syntax;
3926 
3927 		/*
3928 		 * Can't specify both layout (indicated indirectly by
3929 		 * len being set by the -o/-b cases above) AND
3930 		 * alignment
3931 		 */
3932 		if ((len > 0LL) && (alignment > 0LL))
3933 			goto syntax;
3934 
3935 		/*
3936 		 * sanity check the allocation list
3937 		 */
3938 		if ((extlist != NULL) && meta_sp_list_overlaps(extlist))
3939 			goto syntax;
3940 	}
3941 
3942 	if (len == 0LL) {
3943 		if (argc == 0)
3944 			goto syntax;
3945 		if (meta_sp_parsesize(argv[0], &len) == -1)
3946 			goto syntax;
3947 		--argc, ++argv;
3948 	}
3949 
3950 	msp->ext.ext_val = Zalloc(sizeof (*msp->ext.ext_val));
3951 	msp->ext.ext_val->len = len;
3952 	msp->compnamep = spcompnp;
3953 
3954 	/* we should be at the end */
3955 	if (argc != 0)
3956 		goto syntax;
3957 
3958 	/* create soft partition */
3959 	if (meta_create_sp(*spp, msp, extlist, options, alignment, ep) != 0)
3960 		goto out;
3961 	rval = 0;
3962 
3963 	/* let em know */
3964 	if (options & MDCMD_PRINT) {
3965 		(void) printf(dgettext(TEXT_DOMAIN,
3966 		    "%s: Soft Partition is setup\n"),
3967 		    devname);
3968 		(void) fflush(stdout);
3969 	}
3970 	goto out;
3971 
3972 syntax:
3973 	/* syntax error */
3974 	rval = meta_cook_syntax(ep, MDE_SYNTAX, compname, argc, argv);
3975 	goto out;
3976 
3977 options:
3978 	/* options error */
3979 	rval = meta_cook_syntax(ep, MDE_OPTION, compname, argc, argv);
3980 	goto out;
3981 
3982 out:
3983 	if (msp != NULL) {
3984 		if (msp->ext.ext_val != NULL) {
3985 			Free(msp->ext.ext_val);
3986 		}
3987 		Free(msp);
3988 	}
3989 
3990 	return (rval);
3991 }
3992 
3993 /*
3994  * FUNCTION:	meta_free_sp()
3995  * INPUT:	msp	- the soft partition unit to free
3996  * OUTPUT:	none
3997  * RETURNS:	void
3998  * PURPOSE:	provides an interface from the rest of libmeta for freeing a
3999  *		soft partition unit
4000  */
4001 void
4002 meta_free_sp(md_sp_t *msp)
4003 {
4004 	Free(msp);
4005 }
4006 
4007 /*
4008  * FUNCTION:	meta_sp_issp()
4009  * INPUT:	sp	- the set name to check
4010  *		np	- the name to check
4011  * OUTPUT:	ep	- return error pointer
4012  * RETURNS:	int	- 0 means sp,np is a soft partition
4013  *			  1 means sp,np is not a soft partition
4014  * PURPOSE:	determines whether the given device is a soft partition
4015  *		device.  This is called by other metadevice check routines.
4016  */
4017 int
4018 meta_sp_issp(
4019 	mdsetname_t	*sp,
4020 	mdname_t	*np,
4021 	md_error_t	*ep
4022 )
4023 {
4024 	if (meta_get_sp_common(sp, np, 0, ep) == NULL)
4025 		return (1);
4026 
4027 	return (0);
4028 }
4029 
4030 /*
4031  * FUNCTION:	meta_check_sp()
4032  * INPUT:	sp	- the set name to check
4033  *		msp	- the unit structure to check
4034  *		options	- creation options
4035  * OUTPUT:	repart_options - options to be passed to
4036  *				meta_repartition_drive()
4037  *		ep	- return error pointer
4038  * RETURNS:	int	-  0 ok to create on this component
4039  *			  -1 error or not ok to create on this component
4040  * PURPOSE:	Checks to determine whether the rules for creation of
4041  *		soft partitions allow creation of a soft partition on
4042  *		the device described by the mdname_t structure referred
4043  *		to by msp->compnamep.
4044  *
4045  *		NOTE: Does NOT check to determine whether the extents
4046  *		      described in the md_sp_t structure referred to by
4047  *		      msp will fit on the device described by the mdname_t
4048  *		      structure located at msp->compnamep.
4049  */
4050 static int
4051 meta_check_sp(
4052 	mdsetname_t	*sp,
4053 	md_sp_t		*msp,
4054 	mdcmdopts_t	options,
4055 	int		*repart_options,
4056 	md_error_t	*ep
4057 )
4058 {
4059 	md_common_t	*mdp;
4060 	mdname_t	*compnp = msp->compnamep;
4061 	uint_t		slice;
4062 	mddrivename_t	*dnp;
4063 	mdname_t	*slicenp;
4064 	mdvtoc_t	*vtocp;
4065 
4066 	/* make sure it is in the set */
4067 	if (meta_check_inset(sp, compnp, ep) != 0)
4068 		return (-1);
4069 
4070 	if ((options & MDCMD_USE_WHOLE_DISK) != 0) {
4071 		uint_t	rep_slice;
4072 
4073 		/*
4074 		 * check to make sure we can partition this drive.
4075 		 * we cannot continue if any of the following are
4076 		 * true:
4077 		 * The drive is a metadevice.
4078 		 * The drive contains a mounted slice.
4079 		 * The drive contains a slice being swapped to.
4080 		 * The drive contains slices which are part of other
4081 		 * metadevices.
4082 		 * The drive contains a metadb.
4083 		 */
4084 		if (metaismeta(compnp))
4085 			return (mddeverror(ep, MDE_IS_META, compnp->dev,
4086 			    compnp->cname));
4087 
4088 		assert(compnp->drivenamep != NULL);
4089 
4090 		/*
4091 		 * ensure that we have slice 0 since the disk will be
4092 		 * repartitioned in the USE_WHOLE_DISK case.  this check
4093 		 * is redundant unless the user incorrectly specifies a
4094 		 * a fully qualified drive AND slice name (i.e.,
4095 		 * /dev/dsk/cXtXdXsX), which will be incorrectly
4096 		 * recognized as a drive name by the metaname code.
4097 		 */
4098 
4099 		if ((vtocp = metagetvtoc(compnp, FALSE, &slice, ep)) == NULL)
4100 			return (-1);
4101 		if (slice != MD_SLICE0)
4102 			return (mderror(ep, MDE_NOT_DRIVENAME, compnp->cname));
4103 
4104 		dnp = compnp->drivenamep;
4105 		if (meta_replicaslice(dnp, &rep_slice, ep) != 0)
4106 			return (-1);
4107 
4108 		for (slice = 0; slice < vtocp->nparts; slice++) {
4109 
4110 			/* only check if the slice really exists */
4111 			if (vtocp->parts[slice].size == 0)
4112 				continue;
4113 
4114 			slicenp = metaslicename(dnp, slice, ep);
4115 			if (slicenp == NULL)
4116 				return (-1);
4117 
4118 			/* check to ensure that it is not already in use */
4119 			if (meta_check_inuse(sp,
4120 			    slicenp, MDCHK_INUSE, ep) != 0) {
4121 				return (-1);
4122 			}
4123 
4124 			/*
4125 			 * Up to this point, tests are applied to all
4126 			 * slices uniformly.
4127 			 */
4128 
4129 			if (slice == rep_slice) {
4130 				/*
4131 				 * Tests inside the body of this
4132 				 * conditional are applied only to
4133 				 * slice seven.
4134 				 */
4135 				if (meta_check_inmeta(sp, slicenp,
4136 				    options | MDCHK_ALLOW_MDDB |
4137 				    MDCHK_ALLOW_REPSLICE, 0, -1, ep) != 0)
4138 					return (-1);
4139 
4140 				/*
4141 				 * For slice seven, a metadb is NOT an
4142 				 * automatic failure. It merely means
4143 				 * that we're not allowed to muck
4144 				 * about with the partitioning of that
4145 				 * slice.  We indicate this by masking
4146 				 * in the MD_REPART_LEAVE_REP flag.
4147 				 */
4148 				if (metahasmddb(sp, slicenp, ep)) {
4149 					assert(repart_options !=
4150 					    NULL);
4151 					*repart_options |=
4152 					    MD_REPART_LEAVE_REP;
4153 				}
4154 
4155 				/*
4156 				 * Skip the remaining tests for slice
4157 				 * seven
4158 				 */
4159 				continue;
4160 			}
4161 
4162 			/*
4163 			 * Tests below this point will be applied to
4164 			 * all slices EXCEPT for the replica slice.
4165 			 */
4166 
4167 
4168 			/* check if component is in a metadevice */
4169 			if (meta_check_inmeta(sp, slicenp, options, 0,
4170 			    -1, ep) != 0)
4171 				return (-1);
4172 
4173 			/* check to see if component has a metadb */
4174 			if (metahasmddb(sp, slicenp, ep))
4175 				return (mddeverror(ep, MDE_HAS_MDDB,
4176 				    slicenp->dev, slicenp->cname));
4177 		}
4178 		/*
4179 		 * This should be all of the testing necessary when
4180 		 * the MDCMD_USE_WHOLE_DISK flag is set; the rest of
4181 		 * meta_check_sp() is oriented towards component
4182 		 * arguments instead of disks.
4183 		 */
4184 		goto meta_check_sp_ok;
4185 
4186 	}
4187 
4188 	/* check to ensure that it is not already in use */
4189 	if (meta_check_inuse(sp, compnp, MDCHK_INUSE, ep) != 0) {
4190 		return (-1);
4191 	}
4192 
4193 	if (!metaismeta(compnp)) {	/* handle non-metadevices */
4194 
4195 		/*
4196 		 * The component can have one or more soft partitions on it
4197 		 * already, but can't be part of any other type of metadevice,
4198 		 * so if it is used for a metadevice, but the metadevice
4199 		 * isn't a soft partition, return failure.
4200 		 */
4201 
4202 		if (meta_check_inmeta(sp, compnp, options, 0, -1, ep) != 0 &&
4203 		    meta_check_insp(sp, compnp, 0, -1, ep) == 0) {
4204 			return (-1);
4205 		}
4206 	} else {			/* handle metadevices */
4207 		/* get underlying unit & check capabilities */
4208 		if ((mdp = meta_get_unit(sp, compnp, ep)) == NULL)
4209 			return (-1);
4210 
4211 		if ((! (mdp->capabilities & MD_CAN_PARENT)) ||
4212 		    (! (mdp->capabilities & MD_CAN_SP)))
4213 			return (mdmderror(ep, MDE_INVAL_UNIT,
4214 			    meta_getminor(compnp->dev), compnp->cname));
4215 	}
4216 
4217 meta_check_sp_ok:
4218 	mdclrerror(ep);
4219 	return (0);
4220 }
4221 
4222 /*
4223  * FUNCTION:	meta_create_sp()
4224  * INPUT:	sp	- the set name to create in
4225  *		msp	- the unit structure to create
4226  *		oblist	- an optional list of requested extents (-o/-b options)
4227  *		options	- creation options
4228  *		alignment - data alignment
4229  * OUTPUT:	ep	- return error pointer
4230  * RETURNS:	int	-  0 success, -1 error
4231  * PURPOSE:	does most of the work for creating a soft partition.  If
4232  *		metainit -p -e was used, first partition the drive.  Then
4233  *		create an extent list based on the existing soft partitions
4234  *		and assume all space not used by them is free.  Storage for
4235  *		the new soft partition is allocated from the free extents
4236  *		based on the length specified on the command line or the
4237  *		oblist passed in.  The unit structure is then committed and
4238  *		the watermarks are updated.  Finally, the status is changed to
4239  *		Okay and the process is complete.
4240  */
4241 static int
4242 meta_create_sp(
4243 	mdsetname_t	*sp,
4244 	md_sp_t		*msp,
4245 	sp_ext_node_t	*oblist,
4246 	mdcmdopts_t	options,
4247 	sp_ext_length_t	alignment,
4248 	md_error_t	*ep
4249 )
4250 {
4251 	mdname_t	*np = msp->common.namep;
4252 	mdname_t	*compnp = msp->compnamep;
4253 	mp_unit_t	*mp = NULL;
4254 	mdnamelist_t	*keynlp = NULL, *spnlp = NULL;
4255 	md_set_params_t	set_params;
4256 	int		rval = -1;
4257 	diskaddr_t	comp_size;
4258 	diskaddr_t	sp_start;
4259 	sp_ext_node_t	*extlist = NULL;
4260 	int		numexts = 0;	/* number of extents */
4261 	int		count = 0;
4262 	int		committed = 0;
4263 	int		repart_options = MD_REPART_FORCE;
4264 	int		create_flag = MD_CRO_32BIT;
4265 	int		mn_set_master = 0;
4266 
4267 	md_set_desc	*sd;
4268 	md_set_mmown_params_t	*ownpar = NULL;
4269 	int		comp_is_mirror = 0;
4270 
4271 	/* validate soft partition */
4272 	if (meta_check_sp(sp, msp, options, &repart_options, ep) != 0)
4273 		return (-1);
4274 
4275 	if ((options & MDCMD_USE_WHOLE_DISK) != 0) {
4276 		if ((options & MDCMD_DOIT) != 0) {
4277 			if (meta_repartition_drive(sp,
4278 			    compnp->drivenamep,
4279 			    repart_options,
4280 			    NULL, /* Don't return the VTOC */
4281 			    ep) != 0)
4282 
4283 				return (-1);
4284 		} else {
4285 			/*
4286 			 * If -n and -e are both specified, it doesn't make
4287 			 * sense to continue without actually partitioning
4288 			 * the drive.
4289 			 */
4290 			return (0);
4291 		}
4292 	}
4293 
4294 	/* populate the start_blk field of the component name */
4295 	if ((sp_start = meta_sp_get_start(sp, compnp, ep)) ==
4296 	    MD_DISKADDR_ERROR) {
4297 		rval = -1;
4298 		goto out;
4299 	}
4300 
4301 	if (options & MDCMD_DOIT) {
4302 		/* store name in namespace */
4303 		if (add_key_name(sp, compnp, &keynlp, ep) != 0) {
4304 			rval = -1;
4305 			goto out;
4306 		}
4307 	}
4308 
4309 	/*
4310 	 * Get a list of the soft partitions that currently reside on
4311 	 * the component.  We should ALWAYS force reload the cache,
4312 	 * because if this is a single creation, there will not BE a
4313 	 * cached list, and if we're using the md.tab, we must rebuild
4314 	 * the list because it won't contain the previous (if any)
4315 	 * soft partition.
4316 	 */
4317 	count = meta_sp_get_by_component(sp, compnp, &spnlp, 1, ep);
4318 	if (count < 0) {
4319 		/* error occured */
4320 		rval = -1;
4321 		goto out;
4322 	}
4323 
4324 	/*
4325 	 * get the size of the underlying device.  if the size is smaller
4326 	 * than or equal to the watermark size, we know there isn't
4327 	 * enough space.
4328 	 */
4329 	if ((comp_size = metagetsize(compnp, ep)) == MD_DISKADDR_ERROR) {
4330 		rval = -1;
4331 		goto out;
4332 	} else if (comp_size <= MD_SP_WMSIZE) {
4333 		(void) mdmderror(ep, MDE_SP_NOSPACE, 0, compnp->cname);
4334 		rval = -1;
4335 		goto out;
4336 	}
4337 	/*
4338 	 * seed extlist with reserved space at the beginning of the volume and
4339 	 * enough space for the end watermark.  The end watermark always gets
4340 	 * updated, but if the underlying device changes size it may not be
4341 	 * pointed to until the extent before it is updated.  Since the
4342 	 * end of the reserved space is where the first watermark starts,
4343 	 * the reserved extent should never be marked for updating.
4344 	 */
4345 
4346 	meta_sp_list_insert(NULL, NULL, &extlist,
4347 	    0ULL, sp_start, EXTTYP_RESERVED, 0, 0, meta_sp_cmp_by_offset);
4348 	meta_sp_list_insert(NULL, NULL, &extlist,
4349 	    (sp_ext_offset_t)(comp_size - MD_SP_WMSIZE), MD_SP_WMSIZE,
4350 	    EXTTYP_END, 0, EXTFLG_UPDATE, meta_sp_cmp_by_offset);
4351 
4352 	if (meta_sp_extlist_from_namelist(sp, spnlp, &extlist, ep) == -1) {
4353 		rval = -1;
4354 		goto out;
4355 	}
4356 
4357 	metafreenamelist(spnlp);
4358 
4359 	if (getenv(META_SP_DEBUG)) {
4360 		meta_sp_debug("meta_create_sp: list of used extents:\n");
4361 		meta_sp_list_dump(extlist);
4362 	}
4363 
4364 	meta_sp_list_freefill(&extlist, metagetsize(compnp, ep));
4365 
4366 	/* get extent list from -o/-b options or from free space */
4367 	if (options & MDCMD_DIRECT) {
4368 		if (getenv(META_SP_DEBUG)) {
4369 			meta_sp_debug("meta_create_sp: Dumping -o/-b list:\n");
4370 			meta_sp_list_dump(oblist);
4371 		}
4372 
4373 		numexts = meta_sp_alloc_by_list(sp, np, &extlist, oblist);
4374 		if (numexts == -1) {
4375 			(void) mdmderror(ep, MDE_SP_OVERLAP, 0, np->cname);
4376 			rval = -1;
4377 			goto out;
4378 		}
4379 	} else {
4380 		numexts = meta_sp_alloc_by_len(sp, np, &extlist,
4381 		    &msp->ext.ext_val->len, 0LL, (alignment > 0) ? alignment :
4382 		    meta_sp_get_default_alignment(sp, compnp, ep));
4383 		if (numexts == -1) {
4384 			(void) mdmderror(ep, MDE_SP_NOSPACE, 0, np->cname);
4385 			rval = -1;
4386 			goto out;
4387 		}
4388 	}
4389 
4390 	assert(extlist != NULL);
4391 
4392 	/* create soft partition */
4393 	mp = meta_sp_createunit(msp->common.namep, msp->compnamep,
4394 	    extlist, numexts, msp->ext.ext_val->len, MD_SP_CREATEPEND, ep);
4395 
4396 	create_flag = meta_check_devicesize(mp->c.un_total_blocks);
4397 
4398 	/* if we're not doing anything (metainit -n), return success */
4399 	if (! (options & MDCMD_DOIT)) {
4400 		rval = 0;	/* success */
4401 		goto out;
4402 	}
4403 
4404 	(void) memset(&set_params, 0, sizeof (set_params));
4405 
4406 	if (create_flag == MD_CRO_64BIT) {
4407 		mp->c.un_revision |= MD_64BIT_META_DEV;
4408 		set_params.options = MD_CRO_64BIT;
4409 	} else {
4410 		mp->c.un_revision &= ~MD_64BIT_META_DEV;
4411 		set_params.options = MD_CRO_32BIT;
4412 	}
4413 
4414 	if (getenv(META_SP_DEBUG)) {
4415 		meta_sp_debug("meta_create_sp: printing unit structure\n");
4416 		meta_sp_printunit(mp);
4417 	}
4418 
4419 	/*
4420 	 * Check to see if we're trying to create a partition on a mirror. If so
4421 	 * we may have to enforce an ownership change before writing the
4422 	 * watermark out.
4423 	 */
4424 	if (metaismeta(compnp)) {
4425 		char *miscname;
4426 
4427 		miscname = metagetmiscname(compnp, ep);
4428 		if (miscname != NULL)
4429 			comp_is_mirror = (strcmp(miscname, MD_MIRROR) == 0);
4430 		else
4431 			comp_is_mirror = 0;
4432 	} else {
4433 		comp_is_mirror = 0;
4434 	}
4435 
4436 	/*
4437 	 * For a multi-node environment we have to ensure that the master
4438 	 * node owns an underlying mirror before we issue the MD_IOCSET ioctl.
4439 	 * If the master does not own the device we will deadlock as the
4440 	 * implicit write of the watermarks (in sp_ioctl.c) will cause an
4441 	 * ownership change that will block as the MD_IOCSET is still in
4442 	 * progress. To close this window we force an owner change to occur
4443 	 * before issuing the MD_IOCSET. We cannot simply open the device and
4444 	 * write to it as this will only work for the first soft-partition
4445 	 * creation.
4446 	 */
4447 
4448 	if (comp_is_mirror && !metaislocalset(sp)) {
4449 
4450 		if ((sd = metaget_setdesc(sp, ep)) == NULL) {
4451 			rval = -1;
4452 			goto out;
4453 		}
4454 		if (MD_MNSET_DESC(sd) && sd->sd_mn_am_i_master) {
4455 			mn_set_master = 1;
4456 		}
4457 	}
4458 
4459 	set_params.mnum = MD_SID(mp);
4460 	set_params.size = mp->c.un_size;
4461 	set_params.mdp = (uintptr_t)mp;
4462 	MD_SETDRIVERNAME(&set_params, MD_SP, MD_MIN2SET(set_params.mnum));
4463 
4464 	/* first phase of commit. */
4465 	if (metaioctl(MD_IOCSET, &set_params, &set_params.mde,
4466 	    np->cname) != 0) {
4467 		(void) mdstealerror(ep, &set_params.mde);
4468 		rval = -1;
4469 		goto out;
4470 	}
4471 
4472 	/* we've successfully committed the record */
4473 	committed = 1;
4474 
4475 	/* write watermarks */
4476 	/*
4477 	 * Special-case for Multi-node sets. As we now have a distributed DRL
4478 	 * update mechanism, we _will_ hit the ioctl-within-ioctl deadlock case
4479 	 * unless we use a 'special' MN-capable ioctl to stage the watermark
4480 	 * update. This only affects the master-node in an MN set.
4481 	 */
4482 	if (mn_set_master) {
4483 		if (meta_mn_sp_update_wm(sp, msp, extlist, ep) < 0) {
4484 			rval = -1;
4485 			goto out;
4486 		}
4487 	} else {
4488 		if (meta_sp_update_wm(sp, msp, extlist, ep) < 0) {
4489 			rval = -1;
4490 			goto out;
4491 		}
4492 	}
4493 
4494 	/* second phase of commit, set status to MD_SP_OK */
4495 	if (meta_sp_setstatus(sp, &(MD_SID(mp)), 1, MD_SP_OK, ep) < 0) {
4496 		rval = -1;
4497 		goto out;
4498 	}
4499 	rval = 0;
4500 out:
4501 	Free(mp);
4502 	if (ownpar)
4503 		Free(ownpar);
4504 
4505 	if (extlist != NULL)
4506 		meta_sp_list_free(&extlist);
4507 
4508 	if (rval != 0 && keynlp != NULL && committed != 1)
4509 		(void) del_key_names(sp, keynlp, NULL);
4510 
4511 	metafreenamelist(keynlp);
4512 
4513 	return (rval);
4514 }
4515 
4516 /*
4517  * **************************************************************************
4518  *                      Reset (metaclear) Functions                         *
4519  * **************************************************************************
4520  */
4521 
4522 /*
4523  * FUNCTION:	meta_sp_reset_common()
4524  * INPUT:	sp	- the set name of the device to reset
4525  *		np	- the name of the device to reset
4526  *		msp	- the unit structure to reset
4527  *		options	- metaclear options
4528  * OUTPUT:	ep	- return error pointer
4529  * RETURNS:	int	-  0 success, -1 error
4530  * PURPOSE:	"resets", or more accurately deletes, the soft partition
4531  *		specified.  First the state is set to "deleting" and then the
4532  *		watermarks are all cleared out.  Once the watermarks have been
4533  *		updated, the unit structure is deleted from the metadb.
4534  */
4535 static int
4536 meta_sp_reset_common(
4537 	mdsetname_t	*sp,
4538 	mdname_t	*np,
4539 	md_sp_t		*msp,
4540 	md_sp_reset_t	reset_params,
4541 	mdcmdopts_t	options,
4542 	md_error_t	*ep
4543 )
4544 {
4545 	char	*miscname;
4546 	int	rval = -1;
4547 	int	is_open = 0;
4548 
4549 	/* make sure that nobody owns us */
4550 	if (MD_HAS_PARENT(msp->common.parent))
4551 		return (mdmderror(ep, MDE_IN_USE, meta_getminor(np->dev),
4552 		    np->cname));
4553 
4554 	/* make sure that the soft partition isn't open */
4555 	if ((is_open = meta_isopen(sp, np, ep, options)) < 0)
4556 		return (-1);
4557 	else if (is_open)
4558 		return (mdmderror(ep, MDE_IS_OPEN, meta_getminor(np->dev),
4559 		    np->cname));
4560 
4561 	/* get miscname */
4562 	if ((miscname = metagetmiscname(np, ep)) == NULL)
4563 		return (-1);
4564 
4565 	/* fill in reset params */
4566 	MD_SETDRIVERNAME(&reset_params, miscname, sp->setno);
4567 	reset_params.mnum = meta_getminor(np->dev);
4568 	reset_params.force = (options & MDCMD_FORCE) ? 1 : 0;
4569 
4570 	/*
4571 	 * clear soft partition - phase one.
4572 	 * place the soft partition into the "delete pending" state.
4573 	 */
4574 	if (meta_sp_setstatus(sp, &reset_params.mnum, 1, MD_SP_DELPEND, ep) < 0)
4575 		return (-1);
4576 
4577 	/*
4578 	 * Now clear the watermarks.  If the force flag is specified,
4579 	 * ignore any errors writing the watermarks and delete the unit
4580 	 * structure anyway.  An error may leave the on-disk format in a
4581 	 * corrupt state.  If force is not specified and we fail here,
4582 	 * the soft partition will remain in the "delete pending" state.
4583 	 */
4584 	if ((meta_sp_clear_wm(sp, msp, ep) < 0) &&
4585 	    ((options & MDCMD_FORCE) == 0))
4586 		goto out;
4587 
4588 	/*
4589 	 * clear soft partition - phase two.
4590 	 * the driver removes the soft partition from the metadb and
4591 	 * zeros out incore version.
4592 	 */
4593 	if (metaioctl(MD_IOCRESET, &reset_params,
4594 	    &reset_params.mde, np->cname) != 0) {
4595 		(void) mdstealerror(ep, &reset_params.mde);
4596 		goto out;
4597 	}
4598 
4599 	/*
4600 	 * Wait for the /dev to be cleaned up. Ignore the return
4601 	 * value since there's not much we can do.
4602 	 */
4603 	(void) meta_update_devtree(meta_getminor(np->dev));
4604 
4605 	rval = 0;	/* success */
4606 
4607 	if (options & MDCMD_PRINT) {
4608 		(void) printf(dgettext(TEXT_DOMAIN,
4609 		    "%s: Soft Partition is cleared\n"),
4610 		    np->cname);
4611 		(void) fflush(stdout);
4612 	}
4613 
4614 	/*
4615 	 * if told to recurse and on a metadevice, then attempt to
4616 	 * clear the subdevices.  Indicate failure if the clear fails.
4617 	 */
4618 	if ((options & MDCMD_RECURSE) &&
4619 	    (metaismeta(msp->compnamep)) &&
4620 	    (meta_reset_by_name(sp, msp->compnamep, options, ep) != 0))
4621 		rval = -1;
4622 
4623 out:
4624 	meta_invalidate_name(np);
4625 	return (rval);
4626 }
4627 
4628 /*
4629  * FUNCTION:	meta_sp_reset()
4630  * INPUT:	sp	- the set name of the device to reset
4631  *		np	- the name of the device to reset
4632  *		options	- metaclear options
4633  * OUTPUT:	ep	- return error pointer
4634  * RETURNS:	int	-  0 success, -1 error
4635  * PURPOSE:	provides the entry point to the rest of libmeta for deleting a
4636  *		soft partition.  If np is NULL, then soft partitions are
4637  *		all deleted at the current level and then recursively deleted.
4638  *		Otherwise, if a name is specified either directly or as a
4639  *		result of a recursive operation, it deletes only that name.
4640  *		Since something sitting under a soft partition may be parented
4641  *		to it, we have to reparent that other device to another soft
4642  *		partition on the same component if we're deleting the one it's
4643  *		parented to.
4644  */
4645 int
4646 meta_sp_reset(
4647 	mdsetname_t	*sp,
4648 	mdname_t	*np,
4649 	mdcmdopts_t	options,
4650 	md_error_t	*ep
4651 )
4652 {
4653 	md_sp_t		*msp;
4654 	int		rval = -1;
4655 	mdnamelist_t	*spnlp = NULL, *nlp = NULL;
4656 	md_sp_reset_t	reset_params;
4657 	int		num_sp;
4658 
4659 	assert(sp != NULL);
4660 
4661 	/* reset/delete all soft paritions */
4662 	if (np == NULL) {
4663 		/*
4664 		 * meta_reset_all sets MDCMD_RECURSE, but this behavior
4665 		 * is incorrect for soft partitions.  We want to clear
4666 		 * all soft partitions at a particular level in the
4667 		 * metadevice stack before moving to the next level.
4668 		 * Thus, we clear MDCMD_RECURSE from the options.
4669 		 */
4670 		options &= ~MDCMD_RECURSE;
4671 
4672 		/* for each soft partition */
4673 		rval = 0;
4674 		if (meta_get_sp_names(sp, &spnlp, 0, ep) < 0)
4675 			rval = -1;
4676 
4677 		for (nlp = spnlp; (nlp != NULL); nlp = nlp->next) {
4678 			np = nlp->namep;
4679 			if ((msp = meta_get_sp(sp, np, ep)) == NULL) {
4680 				rval = -1;
4681 				break;
4682 			}
4683 			/*
4684 			 * meta_reset_all calls us twice to get soft
4685 			 * partitions at the top and bottom of the stack.
4686 			 * thus, if we have a parent, we'll get deleted
4687 			 * on the next call.
4688 			 */
4689 			if (MD_HAS_PARENT(msp->common.parent))
4690 				continue;
4691 			/*
4692 			 * If this is a multi-node set, we send a series
4693 			 * of individual metaclear commands.
4694 			 */
4695 			if (meta_is_mn_set(sp, ep)) {
4696 				if (meta_mn_send_metaclear_command(sp,
4697 				    np->cname, options, 0, ep) != 0) {
4698 					rval = -1;
4699 					break;
4700 				}
4701 			} else {
4702 				if (meta_sp_reset(sp, np, options, ep) != 0) {
4703 					rval = -1;
4704 					break;
4705 				}
4706 			}
4707 		}
4708 		/* cleanup return status */
4709 		metafreenamelist(spnlp);
4710 		return (rval);
4711 	}
4712 
4713 	/* check the name */
4714 	if (metachkmeta(np, ep) != 0)
4715 		return (-1);
4716 
4717 	/* get the unit structure */
4718 	if ((msp = meta_get_sp(sp, np, ep)) == NULL)
4719 		return (-1);
4720 
4721 	/* clear out reset parameters */
4722 	(void) memset(&reset_params, 0, sizeof (reset_params));
4723 
4724 	/* if our child is a metadevice, we need to deparent/reparent it */
4725 	if (metaismeta(msp->compnamep)) {
4726 		/* get sp's on this component */
4727 		if ((num_sp = meta_sp_get_by_component(sp, msp->compnamep,
4728 		    &spnlp, 1, ep)) <= 0)
4729 			/* no sp's on this device.  error! */
4730 			return (-1);
4731 		else if (num_sp == 1)
4732 			/* last sp on this device, so we deparent */
4733 			reset_params.new_parent = MD_NO_PARENT;
4734 		else {
4735 			/* have to reparent this metadevice */
4736 			for (nlp = spnlp; nlp != NULL; nlp = nlp->next) {
4737 				if (meta_getminor(nlp->namep->dev) ==
4738 				    meta_getminor(np->dev))
4739 					continue;
4740 				/*
4741 				 * this isn't the softpart we are deleting,
4742 				 * so use this device as the new parent.
4743 				 */
4744 				reset_params.new_parent =
4745 				    meta_getminor(nlp->namep->dev);
4746 				break;
4747 			}
4748 		}
4749 		metafreenamelist(spnlp);
4750 	}
4751 
4752 	if (meta_sp_reset_common(sp, np, msp, reset_params, options, ep) != 0)
4753 		return (-1);
4754 
4755 	return (0);
4756 }
4757 
4758 /*
4759  * FUNCTION:	meta_sp_reset_component()
4760  * INPUT:	sp	- the set name of the device to reset
4761  *		name	- the string name of the device to reset
4762  *		options	- metaclear options
4763  * OUTPUT:	ep	- return error pointer
4764  * RETURNS:	int	-  0 success, -1 error
4765  * PURPOSE:	provides the ability to delete all soft partitions on a
4766  *		specified device (metaclear -p).  It first gets all of the
4767  *		soft partitions on the component and then deletes each one
4768  *		individually.
4769  */
4770 int
4771 meta_sp_reset_component(
4772 	mdsetname_t	*sp,
4773 	char		*name,
4774 	mdcmdopts_t	options,
4775 	md_error_t	*ep
4776 )
4777 {
4778 	mdname_t	*compnp, *np;
4779 	mdnamelist_t	*spnlp = NULL;
4780 	mdnamelist_t	*nlp = NULL;
4781 	md_sp_t		*msp;
4782 	int		count;
4783 	md_sp_reset_t	reset_params;
4784 
4785 	if ((compnp = metaname(&sp, name, UNKNOWN, ep)) == NULL)
4786 		return (-1);
4787 
4788 	/* If we're starting out with no soft partitions, it's an error */
4789 	count = meta_sp_get_by_component(sp, compnp, &spnlp, 1, ep);
4790 	if (count == 0)
4791 		return (mdmderror(ep, MDE_SP_NOSP, 0, compnp->cname));
4792 	else if (count < 0)
4793 		return (-1);
4794 
4795 	/*
4796 	 * clear all soft partitions on this component.
4797 	 * NOTE: we reparent underlying metadevices as we go so that
4798 	 * things stay sane.  Also, if we encounter an error, we stop
4799 	 * and go no further in case recovery might be needed.
4800 	 */
4801 	for (nlp = spnlp; nlp != NULL; nlp = nlp->next) {
4802 		/* clear out reset parameters */
4803 		(void) memset(&reset_params, 0, sizeof (reset_params));
4804 
4805 		/* check the name */
4806 		np = nlp->namep;
4807 
4808 		if (metachkmeta(np, ep) != 0) {
4809 			metafreenamelist(spnlp);
4810 			return (-1);
4811 		}
4812 
4813 		/* get the unit structure */
4814 		if ((msp = meta_get_sp(sp, np, ep)) == NULL) {
4815 			metafreenamelist(spnlp);
4816 			return (-1);
4817 		}
4818 
4819 		/* have to deparent/reparent metadevices */
4820 		if (metaismeta(compnp)) {
4821 			if (nlp->next == NULL)
4822 				reset_params.new_parent = MD_NO_PARENT;
4823 			else
4824 				reset_params.new_parent =
4825 				    meta_getminor(spnlp->next->namep->dev);
4826 		}
4827 
4828 		/* clear soft partition */
4829 		if (meta_sp_reset_common(sp, np, msp, reset_params,
4830 		    options, ep) < 0) {
4831 			metafreenamelist(spnlp);
4832 			return (-1);
4833 		}
4834 	}
4835 	metafreenamelist(spnlp);
4836 	return (0);
4837 }
4838 
4839 /*
4840  * **************************************************************************
4841  *                      Grow (metattach) Functions                          *
4842  * **************************************************************************
4843  */
4844 
4845 /*
4846  * FUNCTION:	meta_sp_attach()
4847  * INPUT:	sp	- the set name of the device to attach to
4848  *		np	- the name of the device to attach to
4849  *		addsize	- the unparsed string holding the amount of space to add
4850  *		options	- metattach options
4851  *		alignment - data alignment
4852  * OUTPUT:	ep	- return error pointer
4853  * RETURNS:	int	-  0 success, -1 error
4854  * PURPOSE:	grows a soft partition by reading in the existing unit
4855  *		structure and setting its state to Growing, allocating more
4856  *		space (similar to meta_create_sp()), updating the watermarks,
4857  *		and then writing out the new unit structure in the Okay state.
4858  */
4859 int
4860 meta_sp_attach(
4861 	mdsetname_t	*sp,
4862 	mdname_t	*np,
4863 	char		*addsize,
4864 	mdcmdopts_t	options,
4865 	sp_ext_length_t	alignment,
4866 	md_error_t	*ep
4867 )
4868 {
4869 	md_grow_params_t	grow_params;
4870 	sp_ext_length_t		grow_len;	/* amount to grow */
4871 	mp_unit_t		*mp, *new_un;
4872 	mdname_t		*compnp = NULL;
4873 
4874 	sp_ext_node_t		*extlist = NULL;
4875 	int			numexts;
4876 	mdnamelist_t		*spnlp = NULL;
4877 	int			count;
4878 	md_sp_t			*msp;
4879 	daddr_t			start_block;
4880 
4881 	/* should have the same set */
4882 	assert(sp != NULL);
4883 	assert(sp->setno == MD_MIN2SET(meta_getminor(np->dev)));
4884 
4885 	/* check name */
4886 	if (metachkmeta(np, ep) != 0)
4887 		return (-1);
4888 
4889 	if (meta_sp_parsesize(addsize, &grow_len) == -1) {
4890 		return (mdmderror(ep, MDE_SP_BAD_LENGTH, 0, np->cname));
4891 	}
4892 
4893 	if ((mp = (mp_unit_t *)meta_get_mdunit(sp, np, ep)) == NULL)
4894 		return (-1);
4895 
4896 	/* make sure we don't have a parent */
4897 	if (MD_HAS_PARENT(mp->c.un_parent)) {
4898 		Free(mp);
4899 		return (mdmderror(ep, MDE_INVAL_UNIT, 0, np->cname));
4900 	}
4901 
4902 	if (getenv(META_SP_DEBUG)) {
4903 		meta_sp_debug("meta_sp_attach: Unit structure before new "
4904 		    "space:\n");
4905 		meta_sp_printunit(mp);
4906 	}
4907 
4908 	/*
4909 	 * NOTE: the fast option to metakeyname is 0 as opposed to 1
4910 	 * If this was not the case we would suffer the following
4911 	 * assertion failure:
4912 	 * Assertion failed: type1 != MDT_FAST_META && type1 != MDT_FAST_COMP
4913 	 * file meta_check.x, line 315
4914 	 * I guess this is because we have not "seen" this drive before
4915 	 * and hence hit the failure - this is of course the attach routine
4916 	 */
4917 	if ((compnp = metakeyname(&sp, mp->un_key, 0, ep)) == NULL) {
4918 		Free(mp);
4919 		return (-1);
4920 	}
4921 
4922 	/* metakeyname does not fill in the key. */
4923 	compnp->key = mp->un_key;
4924 
4925 	/* work out the space on the component that we are dealing with */
4926 	count = meta_sp_get_by_component(sp, compnp, &spnlp, 0, ep);
4927 
4928 	/*
4929 	 * see if the component has been soft partitioned yet, or if an
4930 	 * error occurred.
4931 	 */
4932 	if (count == 0) {
4933 		Free(mp);
4934 		return (mdmderror(ep, MDE_NOT_SP, 0, np->cname));
4935 	} else if (count < 0) {
4936 		Free(mp);
4937 		return (-1);
4938 	}
4939 
4940 	/*
4941 	 * seed extlist with reserved space at the beginning of the volume and
4942 	 * enough space for the end watermark.  The end watermark always gets
4943 	 * updated, but if the underlying device changes size it may not be
4944 	 * pointed to until the extent before it is updated.  Since the
4945 	 * end of the reserved space is where the first watermark starts,
4946 	 * the reserved extent should never be marked for updating.
4947 	 */
4948 	if ((start_block = meta_sp_get_start(sp, compnp, ep)) ==
4949 	    MD_DISKADDR_ERROR) {
4950 		Free(mp);
4951 		return (-1);
4952 	}
4953 
4954 	meta_sp_list_insert(NULL, NULL, &extlist, 0ULL, start_block,
4955 	    EXTTYP_RESERVED, 0, 0, meta_sp_cmp_by_offset);
4956 	meta_sp_list_insert(NULL, NULL, &extlist,
4957 	    metagetsize(compnp, ep) - MD_SP_WMSIZE, MD_SP_WMSIZE,
4958 	    EXTTYP_END, 0, EXTFLG_UPDATE, meta_sp_cmp_by_offset);
4959 
4960 	if (meta_sp_extlist_from_namelist(sp, spnlp, &extlist, ep) == -1) {
4961 		Free(mp);
4962 		return (-1);
4963 	}
4964 
4965 	metafreenamelist(spnlp);
4966 
4967 	if (getenv(META_SP_DEBUG)) {
4968 		meta_sp_debug("meta_sp_attach: list of used extents:\n");
4969 		meta_sp_list_dump(extlist);
4970 	}
4971 
4972 	meta_sp_list_freefill(&extlist, metagetsize(compnp, ep));
4973 
4974 	assert(mp->un_numexts >= 1);
4975 	numexts = meta_sp_alloc_by_len(sp, np, &extlist, &grow_len,
4976 	    mp->un_ext[mp->un_numexts - 1].un_poff,
4977 	    (alignment > 0) ? alignment :
4978 	    meta_sp_get_default_alignment(sp, compnp, ep));
4979 
4980 	if (numexts == -1) {
4981 		Free(mp);
4982 		return (mdmderror(ep, MDE_SP_NOSPACE, 0, np->cname));
4983 	}
4984 
4985 	/* allocate new unit structure and copy in old unit */
4986 	if ((new_un = meta_sp_updateunit(np, mp, extlist,
4987 	    grow_len, numexts, ep)) == NULL) {
4988 		Free(mp);
4989 		return (-1);
4990 	}
4991 	Free(mp);
4992 
4993 	/* If running in dryrun mode (-n option), we're done here */
4994 	if ((options & MDCMD_DOIT) == 0) {
4995 		if (options & MDCMD_PRINT) {
4996 			(void) printf(dgettext(TEXT_DOMAIN,
4997 			    "%s: Soft Partition would grow\n"),
4998 			    np->cname);
4999 			(void) fflush(stdout);
5000 		}
5001 		return (0);
5002 	}
5003 
5004 	if (getenv(META_SP_DEBUG)) {
5005 		meta_sp_debug("meta_sp_attach: updated unit structure:\n");
5006 		meta_sp_printunit(new_un);
5007 	}
5008 
5009 	assert(new_un != NULL);
5010 
5011 	(void) memset(&grow_params, 0, sizeof (grow_params));
5012 	if (new_un->c.un_total_blocks > MD_MAX_BLKS_FOR_SMALL_DEVS) {
5013 		grow_params.options = MD_CRO_64BIT;
5014 		new_un->c.un_revision |= MD_64BIT_META_DEV;
5015 	} else {
5016 		grow_params.options = MD_CRO_32BIT;
5017 		new_un->c.un_revision &= ~MD_64BIT_META_DEV;
5018 	}
5019 	grow_params.mnum = MD_SID(new_un);
5020 	grow_params.size = new_un->c.un_size;
5021 	grow_params.mdp = (uintptr_t)new_un;
5022 	MD_SETDRIVERNAME(&grow_params, MD_SP, MD_MIN2SET(grow_params.mnum));
5023 
5024 	if (metaioctl(MD_IOCGROW, &grow_params, &grow_params.mde,
5025 	    np->cname) != 0) {
5026 		(void) mdstealerror(ep, &grow_params.mde);
5027 		return (-1);
5028 	}
5029 
5030 	/* update all watermarks */
5031 
5032 	if ((msp = meta_get_sp(sp, np, ep)) == NULL)
5033 		return (-1);
5034 	if (meta_sp_update_wm(sp, msp, extlist, ep) < 0)
5035 		return (-1);
5036 
5037 
5038 	/* second phase of commit, set status to MD_SP_OK */
5039 	if (meta_sp_setstatus(sp, &(MD_SID(new_un)), 1, MD_SP_OK, ep) < 0)
5040 		return (-1);
5041 
5042 	meta_invalidate_name(np);
5043 
5044 	if (options & MDCMD_PRINT) {
5045 		(void) printf(dgettext(TEXT_DOMAIN,
5046 		    "%s: Soft Partition has been grown\n"),
5047 		    np->cname);
5048 		(void) fflush(stdout);
5049 	}
5050 
5051 	return (0);
5052 }
5053 
5054 /*
5055  * **************************************************************************
5056  *                    Recovery (metarecover) Functions                      *
5057  * **************************************************************************
5058  */
5059 
5060 /*
5061  * FUNCTION:	meta_recover_sp()
5062  * INPUT:	sp	- the name of the set we are recovering on
5063  *		compnp	- name pointer for device we are recovering on
5064  *		argc	- argument count
5065  *		argv	- left over arguments not parsed by metarecover command
5066  *		options	- metarecover options
5067  * OUTPUT:	ep	- return error pointer
5068  * RETURNS:	int	- 0 - success, -1 - error
5069  * PURPOSE:	parse soft partitioning-specific metarecover options and
5070  *		dispatch to the appropriate function to handle recovery.
5071  */
5072 int
5073 meta_recover_sp(
5074 	mdsetname_t	*sp,
5075 	mdname_t	*compnp,
5076 	int		argc,
5077 	char		*argv[],
5078 	mdcmdopts_t	options,
5079 	md_error_t	*ep
5080 )
5081 {
5082 	md_set_desc	*sd;
5083 
5084 	if (argc > 1) {
5085 		(void) meta_cook_syntax(ep, MDE_SYNTAX, compnp->cname,
5086 		    argc, argv);
5087 		return (-1);
5088 	}
5089 
5090 	/*
5091 	 * For a MN set, this operation must be performed on the master
5092 	 * as it is responsible for maintaining the watermarks
5093 	 */
5094 	if (!metaislocalset(sp)) {
5095 		if ((sd = metaget_setdesc(sp, ep)) == NULL)
5096 			return (-1);
5097 		if (MD_MNSET_DESC(sd) && !sd->sd_mn_am_i_master) {
5098 			(void) mddserror(ep, MDE_DS_MASTER_ONLY, sp->setno,
5099 			    sd->sd_mn_master_nodenm, NULL, NULL);
5100 			return (-1);
5101 		}
5102 	}
5103 	if (argc == 0) {
5104 		/*
5105 		 * if no additional arguments are passed, metarecover should
5106 		 * validate both on-disk and metadb structures as well as
5107 		 * checking that both are consistent with each other
5108 		 */
5109 		if (meta_sp_validate_wm(sp, compnp, options, ep) < 0)
5110 			return (-1);
5111 		if (meta_sp_validate_unit(sp, compnp, options, ep) < 0)
5112 			return (-1);
5113 		if (meta_sp_validate_wm_and_unit(sp, compnp, options, ep) < 0)
5114 			return (-1);
5115 	} else if (strcmp(argv[0], "-d") == 0) {
5116 		/*
5117 		 * Ensure that there is no existing valid record for this
5118 		 * soft-partition. If there is we have nothing to do.
5119 		 */
5120 		if (meta_sp_validate_unit(sp, compnp, options, ep) == 0)
5121 			return (-1);
5122 		/* validate and recover from on-disk structures */
5123 		if (meta_sp_validate_wm(sp, compnp, options, ep) < 0)
5124 			return (-1);
5125 		if (meta_sp_recover_from_wm(sp, compnp, options, ep) < 0)
5126 			return (-1);
5127 	} else if (strcmp(argv[0], "-m") == 0) {
5128 		/* validate and recover from metadb structures */
5129 		if (meta_sp_validate_unit(sp, compnp, options, ep) < 0)
5130 			return (-1);
5131 		if (meta_sp_recover_from_unit(sp, compnp, options, ep) < 0)
5132 			return (-1);
5133 	} else {
5134 		/* syntax error */
5135 		(void) meta_cook_syntax(ep, MDE_SYNTAX, compnp->cname,
5136 		    argc, argv);
5137 		return (-1);
5138 	}
5139 
5140 	return (0);
5141 }
5142 
5143 /*
5144  * FUNCTION:	meta_sp_display_exthdr()
5145  * INPUT:	none
5146  * OUTPUT:	none
5147  * RETURNS:	void
5148  * PURPOSE:	print header line for sp_ext_node_t information.  to be used
5149  *		in conjunction with meta_sp_display_ext().
5150  */
5151 static void
5152 meta_sp_display_exthdr(void)
5153 {
5154 	(void) printf("%20s %5s %7s %20s %20s\n",
5155 	    dgettext(TEXT_DOMAIN, "Name"),
5156 	    dgettext(TEXT_DOMAIN, "Seq#"),
5157 	    dgettext(TEXT_DOMAIN, "Type"),
5158 	    dgettext(TEXT_DOMAIN, "Offset"),
5159 	    dgettext(TEXT_DOMAIN, "Length"));
5160 }
5161 
5162 
5163 /*
5164  * FUNCTION:	meta_sp_display_ext()
5165  * INPUT:	ext	- extent to display
5166  * OUTPUT:	none
5167  * RETURNS:	void
5168  * PURPOSE:	print selected fields from sp_ext_node_t.
5169  */
5170 static void
5171 meta_sp_display_ext(sp_ext_node_t *ext)
5172 {
5173 	/* print extent information */
5174 	if (ext->ext_namep != NULL)
5175 		(void) printf("%20s ", ext->ext_namep->cname);
5176 	else
5177 		(void) printf("%20s ", "NONE");
5178 
5179 	(void) printf("%5u ", ext->ext_seq);
5180 
5181 	switch (ext->ext_type) {
5182 	case EXTTYP_ALLOC:
5183 		(void) printf("%7s ", "ALLOC");
5184 		break;
5185 	case EXTTYP_FREE:
5186 		(void) printf("%7s ", "FREE");
5187 		break;
5188 	case EXTTYP_RESERVED:
5189 		(void) printf("%7s ", "RESV");
5190 		break;
5191 	case EXTTYP_END:
5192 		(void) printf("%7s ", "END");
5193 		break;
5194 	default:
5195 		(void) printf("%7s ", "INVLD");
5196 		break;
5197 	}
5198 
5199 	(void) printf("%20llu %20llu\n", ext->ext_offset, ext->ext_length);
5200 }
5201 
5202 
5203 /*
5204  * FUNCTION:	meta_sp_checkseq()
5205  * INPUT:	extlist	- list of extents to be checked
5206  * OUTPUT:	none
5207  * RETURNS:	int	- 0 - success, -1 - error
5208  * PURPOSE:	check soft partition sequence numbers.  this function assumes
5209  *		that a list of extents representing 1 or more soft partitions
5210  *		is passed in sorted in sequence number order.  within a
5211  *		single soft partition, there may not be any missing or
5212  *		duplicate sequence numbers.
5213  */
5214 static int
5215 meta_sp_checkseq(sp_ext_node_t *extlist)
5216 {
5217 	sp_ext_node_t *ext;
5218 
5219 	assert(extlist != NULL);
5220 
5221 	for (ext = extlist;
5222 	    ext->ext_next != NULL && ext->ext_next->ext_type == EXTTYP_ALLOC;
5223 	    ext = ext->ext_next) {
5224 		if (ext->ext_next->ext_namep != NULL &&
5225 		    strcmp(ext->ext_next->ext_namep->cname,
5226 		    ext->ext_namep->cname) != 0)
5227 				continue;
5228 
5229 		if (ext->ext_next->ext_seq != ext->ext_seq + 1) {
5230 			(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5231 			    "%s: sequence numbers are "
5232 			    "incorrect: %d should be %d\n"),
5233 			    ext->ext_next->ext_namep->cname,
5234 			    ext->ext_next->ext_seq, ext->ext_seq + 1);
5235 			return (-1);
5236 		}
5237 	}
5238 	return (0);
5239 }
5240 
5241 
5242 /*
5243  * FUNCTION:	meta_sp_resolve_name_conflict()
 * INPUT:	sp	- name of set we are recovering in.
5245  *		old_np	- name pointer of soft partition we found on disk.
5246  * OUTPUT:	new_np	- name pointer for new soft partition name.
5247  *		ep	- error pointer returned.
 * RETURNS:	int	- 0 - name not replaced, 1 - name replaced, -1 - error
5249  * PURPOSE:	Check to see if the name of one of the soft partitions we found
5250  *		on disk already exists in the metadb.  If so, prompt for a new
5251  *		name.  In addition, we keep a static array of names that
5252  *		will be recovered from this device since these names don't
5253  *		exist in the configuration at this point but cannot be
5254  *		recovered more than once.
5255  */
5256 static int
5257 meta_sp_resolve_name_conflict(
5258 	mdsetname_t	*sp,
5259 	mdname_t	*old_np,
5260 	mdname_t	**new_np,
5261 	md_error_t	*ep
5262 )
5263 {
5264 	char		yesno[255];
5265 	char		*yes;
5266 	char		newname[MD_SP_MAX_DEVNAME_PLUS_1];
5267 	int		nunits;
5268 	static int	*used_names = NULL;
5269 
5270 	assert(old_np != NULL);
5271 
5272 	if (used_names == NULL) {
5273 		if ((nunits = meta_get_nunits(ep)) < 0)
5274 			return (-1);
5275 		used_names = Zalloc(nunits * sizeof (int));
5276 	}
5277 
5278 	/* see if it exists already */
5279 	if (used_names[MD_MIN2UNIT(meta_getminor(old_np->dev))] == 0 &&
5280 	    metagetmiscname(old_np, ep) == NULL) {
5281 		if (! mdismderror(ep, MDE_UNIT_NOT_SETUP))
5282 			return (-1);
5283 		else {
5284 			used_names[MD_MIN2UNIT(meta_getminor(old_np->dev))] = 1;
5285 			mdclrerror(ep);
5286 			return (0);
5287 		}
5288 	}
5289 
5290 	/* name exists, ask the user for a new one */
5291 	(void) printf(dgettext(TEXT_DOMAIN,
5292 	    "WARNING: A soft partition named %s was found in the extent\n"
5293 	    "headers, but this name already exists in the metadb "
5294 	    "configuration.\n"
5295 	    "In order to continue recovery you must supply\n"
5296 	    "a new name for this soft partition.\n"), old_np->cname);
5297 	(void) printf(dgettext(TEXT_DOMAIN,
5298 	    "Would you like to continue and supply a new name? (yes/no) "));
5299 
5300 	(void) fflush(stdout);
5301 	if ((fgets(yesno, sizeof (yesno), stdin) == NULL) ||
5302 	    (strlen(yesno) == 1))
5303 		(void) snprintf(yesno, sizeof (yesno), "%s\n",
5304 		    dgettext(TEXT_DOMAIN, "no"));
5305 	yes = dgettext(TEXT_DOMAIN, "yes");
5306 	if (strncasecmp(yesno, yes, strlen(yesno) - 1) != 0) {
5307 		return (-1);
5308 	}
5309 
5310 	(void) fflush(stdin);
5311 
5312 	/* get the new name */
5313 	for (;;) {
5314 		(void) printf(dgettext(TEXT_DOMAIN, "Please enter a new name "
5315 		    "for this soft partition (dXXXX) "));
5316 		(void) fflush(stdout);
5317 		if (fgets(newname, MD_SP_MAX_DEVNAME_PLUS_1, stdin) == NULL)
5318 			(void) strcpy(newname, "");
5319 
5320 		/* remove newline character */
5321 		if (newname[strlen(newname) - 1] == '\n')
5322 			newname[strlen(newname) - 1] = '\0';
5323 
5324 		if (!(is_metaname(newname)) ||
5325 		    (meta_init_make_device(&sp, newname, ep) <= 0)) {
5326 			(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5327 			    "Invalid metadevice name\n"));
5328 			(void) fflush(stderr);
5329 			continue;
5330 		}
5331 
5332 		if ((*new_np = metaname(&sp, newname,
5333 		    META_DEVICE, ep)) == NULL) {
5334 			(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5335 			    "Invalid metadevice name\n"));
5336 			(void) fflush(stderr);
5337 			continue;
5338 		}
5339 
5340 		assert(MD_MIN2UNIT(meta_getminor((*new_np)->dev)) < nunits);
5341 		/* make sure the name isn't already being used */
5342 		if (used_names[MD_MIN2UNIT(meta_getminor((*new_np)->dev))] ||
5343 		    metagetmiscname(*new_np, ep) != NULL) {
5344 			(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5345 			    "That name already exists\n"));
5346 			continue;
5347 		} else if (! mdismderror(ep, MDE_UNIT_NOT_SETUP))
5348 			return (-1);
5349 
5350 		break;
5351 	}
5352 
5353 	/* got a new name, place in used array and return */
5354 	used_names[MD_MIN2UNIT(meta_getminor((*new_np)->dev))] = 1;
5355 	mdclrerror(ep);
5356 	return (1);
5357 }
5358 
5359 /*
5360  * FUNCTION:	meta_sp_validate_wm()
5361  * INPUT:	sp	- set name we are recovering in
5362  *		compnp	- name pointer for device we are recovering from
5363  *		options	- metarecover options
5364  * OUTPUT:	ep	- error pointer returned
5365  * RETURNS:	int	- 0 - success, -1 - error
5366  * PURPOSE:	validate and display watermark configuration.  walk the
5367  *		on-disk watermark structures and validate the information
5368  *		found within.  since a watermark configuration is
5369  *		"self-defining", the act of traversing the watermarks
5370  *		is part of the validation process.
5371  */
5372 static int
5373 meta_sp_validate_wm(
5374 	mdsetname_t	*sp,
5375 	mdname_t	*compnp,
5376 	mdcmdopts_t	options,
5377 	md_error_t	*ep
5378 )
5379 {
5380 	sp_ext_node_t	*extlist = NULL;
5381 	sp_ext_node_t	*ext;
5382 	int		num_sps = 0;
5383 	int		rval;
5384 
5385 	if ((options & MDCMD_VERBOSE) != 0)
5386 		(void) printf(dgettext(TEXT_DOMAIN,
5387 		    "Verifying on-disk structures on %s.\n"),
5388 		    compnp->cname);
5389 
5390 	/*
5391 	 * for each watermark, build an ext_node, place on list.
5392 	 */
5393 	rval = meta_sp_extlist_from_wm(sp, compnp, &extlist,
5394 	    meta_sp_cmp_by_nameseq, ep);
5395 
5396 	if ((options & MDCMD_VERBOSE) != 0) {
5397 		/* print out what we found */
5398 		if (extlist == NULL)
5399 			(void) printf(dgettext(TEXT_DOMAIN,
5400 			    "No extent headers found on %s.\n"),
5401 			    compnp->cname);
5402 		else {
5403 			(void) printf(dgettext(TEXT_DOMAIN,
5404 			    "The following extent headers were found on %s.\n"),
5405 			    compnp->cname);
5406 			meta_sp_display_exthdr();
5407 		}
5408 		for (ext = extlist; ext != NULL; ext = ext->ext_next)
5409 			meta_sp_display_ext(ext);
5410 	}
5411 
5412 	if (rval < 0) {
5413 		(void) printf(dgettext(TEXT_DOMAIN,
5414 		    "%s: On-disk structures invalid or "
5415 		    "no soft partitions found.\n"),
5416 		    compnp->cname);
5417 		return (-1);
5418 	}
5419 
5420 	assert(extlist != NULL);
5421 
5422 	/* count number of soft partitions */
5423 	for (ext = extlist;
5424 	    ext != NULL && ext->ext_type == EXTTYP_ALLOC;
5425 	    ext = ext->ext_next) {
5426 		if (ext->ext_next != NULL &&
5427 		    ext->ext_next->ext_namep != NULL &&
5428 		    strcmp(ext->ext_next->ext_namep->cname,
5429 		    ext->ext_namep->cname) == 0)
5430 				continue;
5431 		num_sps++;
5432 	}
5433 
5434 	if ((options & MDCMD_VERBOSE) != 0)
5435 		(void) printf(dgettext(TEXT_DOMAIN,
5436 		    "Found %d soft partition(s) on %s.\n"), num_sps,
5437 		    compnp->cname);
5438 
5439 	if (num_sps == 0) {
5440 		(void) printf(dgettext(TEXT_DOMAIN,
5441 		    "%s: No soft partitions.\n"), compnp->cname);
5442 		return (mdmderror(ep, MDE_RECOVER_FAILED, 0, compnp->cname));
5443 	}
5444 
5445 	/* check sequence numbers */
5446 	if ((options & MDCMD_VERBOSE) != 0)
5447 		(void) printf(dgettext(TEXT_DOMAIN,
5448 		    "Checking sequence numbers.\n"));
5449 
5450 	if (meta_sp_checkseq(extlist) != 0)
5451 		return (mdmderror(ep, MDE_RECOVER_FAILED, 0, compnp->cname));
5452 
5453 	return (0);
5454 }
5455 
5456 /*
5457  * FUNCTION:	meta_sp_validate_unit()
5458  * INPUT:	sp	- name of set we are recovering in
5459  *		compnp	- name of component we are recovering from
5460  *		options	- metarecover options
5461  * OUTPUT:	ep	- error pointer returned
5462  * RETURNS:	int	- 0 - success, -1 - error
5463  * PURPOSE:	validate and display metadb configuration.  begin by getting
5464  *		all soft partitions built on the specified component.  get
5465  *		the unit structure for each one and validate the fields within.
5466  */
5467 static int
5468 meta_sp_validate_unit(
5469 	mdsetname_t	*sp,
5470 	mdname_t	*compnp,
5471 	mdcmdopts_t	options,
5472 	md_error_t	*ep
5473 )
5474 {
5475 	md_sp_t		*msp;
5476 	mdnamelist_t	*spnlp = NULL;
5477 	mdnamelist_t	*namep = NULL;
5478 	int		count;
5479 	uint_t		extn;
5480 	sp_ext_length_t	size;
5481 
5482 	if ((options & MDCMD_VERBOSE) != 0)
5483 		(void) printf(dgettext(TEXT_DOMAIN,
5484 		    "%s: Validating soft partition metadb entries.\n"),
5485 		    compnp->cname);
5486 
5487 	if ((size = metagetsize(compnp, ep)) == MD_DISKADDR_ERROR)
5488 		return (-1);
5489 
5490 	/* get all soft partitions on component */
5491 	count = meta_sp_get_by_component(sp, compnp, &spnlp, 0, ep);
5492 
5493 	if (count == 0) {
5494 		(void) printf(dgettext(TEXT_DOMAIN,
5495 		    "%s: No soft partitions.\n"), compnp->cname);
5496 		return (mdmderror(ep, MDE_RECOVER_FAILED, 0, compnp->cname));
5497 	} else if (count < 0) {
5498 		return (-1);
5499 	}
5500 
5501 	/* Now go through the soft partitions and check each one */
5502 	for (namep = spnlp; namep != NULL; namep = namep->next) {
5503 		mdname_t	*curnp = namep->namep;
5504 		sp_ext_offset_t	curvoff;
5505 
5506 		/* get the unit structure */
5507 		if ((msp = meta_get_sp_common(sp, curnp, 0, ep)) == NULL)
5508 			return (-1);
5509 
5510 		/* verify generic unit structure parameters */
5511 		if ((options & MDCMD_VERBOSE) != 0)
5512 			(void) printf(dgettext(TEXT_DOMAIN,
5513 			    "\nVerifying device %s.\n"),
5514 			    curnp->cname);
5515 
5516 		/*
5517 		 * MD_SP_LAST is an invalid state and is always the
5518 		 * highest numbered.
5519 		 */
5520 		if (msp->status >= MD_SP_LAST) {
5521 			(void) printf(dgettext(TEXT_DOMAIN,
5522 			    "%s: status value %u is out of range.\n"),
5523 			    curnp->cname, msp->status);
5524 			return (mdmderror(ep, MDE_RECOVER_FAILED,
5525 			    0, curnp->cname));
5526 		} else if ((options & MDCMD_VERBOSE) != 0) {
5527 			uint_t	tstate = 0;
5528 
5529 			if (metaismeta(msp->compnamep)) {
5530 				if (meta_get_tstate(msp->common.namep->dev,
5531 				    &tstate, ep) != 0)
5532 					return (-1);
5533 			}
5534 			(void) printf(dgettext(TEXT_DOMAIN,
5535 			    "%s: Status \"%s\" is valid.\n"),
5536 			    curnp->cname, meta_sp_status_to_name(msp->status,
5537 			    tstate & MD_DEV_ERRORED));
5538 		}
5539 
5540 		/* Now verify each extent */
5541 		if ((options & MDCMD_VERBOSE) != 0)
5542 			(void) printf("%14s %21s %21s %21s\n",
5543 			    dgettext(TEXT_DOMAIN, "Extent Number"),
5544 			    dgettext(TEXT_DOMAIN, "Virtual Offset"),
5545 			    dgettext(TEXT_DOMAIN, "Physical Offset"),
5546 			    dgettext(TEXT_DOMAIN, "Length"));
5547 
5548 		curvoff = 0ULL;
5549 		for (extn = 0; extn < msp->ext.ext_len; extn++) {
5550 			md_sp_ext_t	*extp = &msp->ext.ext_val[extn];
5551 
5552 			if ((options & MDCMD_VERBOSE) != 0)
5553 				(void) printf("%14u %21llu %21llu %21llu\n",
5554 				    extn, extp->voff, extp->poff, extp->len);
5555 
5556 			if (extp->voff != curvoff) {
5557 				(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5558 				    "%s: virtual offset for extent %u "
5559 				    "is inconsistent, expected %llu, "
5560 				    "got %llu.\n"), curnp->cname, extn,
5561 				    curvoff, extp->voff);
5562 				return (mdmderror(ep, MDE_RECOVER_FAILED,
5563 				    0, compnp->cname));
5564 			}
5565 
5566 			/* make sure extent does not drop off the end */
5567 			if ((extp->poff + extp->len) == size) {
5568 				(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5569 				    "%s: extent %u at offset %llu, "
5570 				    "length %llu exceeds the size of the "
5571 				    "device, %llu.\n"), curnp->cname,
5572 				    extn, extp->poff, extp->len, size);
5573 				return (mdmderror(ep, MDE_RECOVER_FAILED,
5574 				    0, compnp->cname));
5575 			}
5576 
5577 			curvoff += extp->len;
5578 		}
5579 	}
5580 	if (options & MDCMD_PRINT) {
5581 		(void) printf(dgettext(TEXT_DOMAIN,
5582 		    "%s: Soft Partition metadb configuration is valid\n"),
5583 		    compnp->cname);
5584 	}
5585 	return (0);
5586 }
5587 
5588 /*
5589  * FUNCTION:	meta_sp_validate_wm_and_unit()
5590  * INPUT:	sp	- name of set we are recovering in
 *		np	- name of device we are recovering from
5592  *		options	- metarecover options
5593  * OUTPUT:	ep	- error pointer returned
5594  * RETURNS:	int	- 0 - success, -1 error
5595  * PURPOSE:	cross-validate and display watermarks and metadb records.
5596  *		get both the unit structures for the soft partitions built
5597  *		on the specified component and the watermarks found on that
5598  *		component and check to make sure they are consistent with
5599  *		each other.
5600  */
5601 static int
5602 meta_sp_validate_wm_and_unit(
5603 	mdsetname_t	*sp,
5604 	mdname_t	*np,
5605 	mdcmdopts_t	options,
5606 	md_error_t	*ep
5607 )
5608 {
5609 	sp_ext_node_t	*wmlist = NULL;
5610 	sp_ext_node_t	*unitlist = NULL;
5611 	sp_ext_node_t	*unitext;
5612 	sp_ext_node_t	*wmext;
5613 	sp_ext_offset_t	tmpunitoff;
5614 	mdnamelist_t	*spnlp = NULL;
5615 	int		count;
5616 	int		rval = 0;
5617 	int		verbose = (options & MDCMD_VERBOSE);
5618 
5619 	/* get unit structure list */
5620 	count = meta_sp_get_by_component(sp, np, &spnlp, 0, ep);
5621 	if (count <= 0)
5622 		return (-1);
5623 
5624 	meta_sp_list_insert(NULL, NULL, &unitlist,
5625 	    metagetsize(np, ep) - MD_SP_WMSIZE, MD_SP_WMSIZE,
5626 	    EXTTYP_END, 0, EXTFLG_UPDATE, meta_sp_cmp_by_offset);
5627 
5628 	if (meta_sp_extlist_from_namelist(sp, spnlp, &unitlist, ep) == -1) {
5629 		metafreenamelist(spnlp);
5630 		return (-1);
5631 	}
5632 
5633 	metafreenamelist(spnlp);
5634 
5635 	meta_sp_list_freefill(&unitlist, metagetsize(np, ep));
5636 
5637 	if (meta_sp_extlist_from_wm(sp, np, &wmlist,
5638 	    meta_sp_cmp_by_offset, ep) < 0) {
5639 		meta_sp_list_free(&unitlist);
5640 		return (-1);
5641 	}
5642 
5643 	if (getenv(META_SP_DEBUG)) {
5644 		meta_sp_debug("meta_sp_validate_wm_and_unit: unit list:\n");
5645 		meta_sp_list_dump(unitlist);
5646 		meta_sp_debug("meta_sp_validate_wm_and_unit: wm list:\n");
5647 		meta_sp_list_dump(wmlist);
5648 	}
5649 
5650 	/*
5651 	 * step through both lists and compare allocated nodes.  Free
5652 	 * nodes and end watermarks may differ between the two but
5653 	 * that's generally ok, and if they're wrong will typically
5654 	 * cause misplaced allocated extents.
5655 	 */
5656 	if (verbose)
5657 		(void) printf(dgettext(TEXT_DOMAIN, "\n%s: Verifying metadb "
5658 		    "allocations match extent headers.\n"), np->cname);
5659 
5660 	unitext = unitlist;
5661 	wmext = wmlist;
5662 	while ((wmext != NULL) && (unitext != NULL)) {
5663 		/* find next allocated extents in each list */
5664 		while (wmext != NULL && wmext->ext_type != EXTTYP_ALLOC)
5665 			wmext = wmext->ext_next;
5666 
5667 		while (unitext != NULL && unitext->ext_type != EXTTYP_ALLOC)
5668 			unitext = unitext->ext_next;
5669 
5670 		if (wmext == NULL || unitext == NULL)
5671 			break;
5672 
5673 		if (verbose) {
5674 			(void) printf(dgettext(TEXT_DOMAIN,
5675 			    "Metadb extent:\n"));
5676 			meta_sp_display_exthdr();
5677 			meta_sp_display_ext(unitext);
5678 			(void) printf(dgettext(TEXT_DOMAIN,
5679 			    "Extent header extent:\n"));
5680 			meta_sp_display_exthdr();
5681 			meta_sp_display_ext(wmext);
5682 			(void) printf("\n");
5683 		}
5684 
5685 		if (meta_sp_validate_exts(np, wmext, unitext, ep) < 0)
5686 			rval = -1;
5687 
5688 		/*
5689 		 * if the offsets aren't equal, only increment the
5690 		 * lowest one in hopes of getting the lists back in sync.
5691 		 */
5692 		tmpunitoff = unitext->ext_offset;
5693 		if (unitext->ext_offset <= wmext->ext_offset)
5694 			unitext = unitext->ext_next;
5695 		if (wmext->ext_offset <= tmpunitoff)
5696 			wmext = wmext->ext_next;
5697 	}
5698 
5699 	/*
5700 	 * if both lists aren't at the end then there are extra
5701 	 * allocated nodes in one of them.
5702 	 */
5703 	if (wmext != NULL) {
5704 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5705 		    "%s: extent headers contain allocations not in "
5706 		    "the metadb\n\n"), np->cname);
5707 		rval = -1;
5708 	}
5709 
5710 	if (unitext != NULL) {
5711 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5712 		    "%s: metadb contains allocations not in the extent "
5713 		    "headers\n\n"), np->cname);
5714 		rval = -1;
5715 	}
5716 
5717 	if (options & MDCMD_PRINT) {
5718 		if (rval == 0) {
5719 			(void) printf(dgettext(TEXT_DOMAIN,
5720 			    "%s: Soft Partition metadb matches extent "
5721 			    "header configuration\n"), np->cname);
5722 		} else {
5723 			(void) printf(dgettext(TEXT_DOMAIN,
5724 			    "%s: Soft Partition metadb does not match extent "
5725 			    "header configuration\n"), np->cname);
5726 		}
5727 	}
5728 
5729 	return (rval);
5730 }
5731 
5732 /*
5733  * FUNCTION:	meta_sp_validate_exts()
5734  * INPUT:	compnp	- name pointer for device we are recovering from
5735  *		wmext	- extent node representing watermark
5736  *		unitext	- extent node from unit structure
5737  * OUTPUT:	ep	- return error pointer
 * RETURNS:	int	- 0 - success, mdmderror return code - error
5739  * PURPOSE:	Takes two extent nodes and checks them against each other.
5740  *		offset, length, sequence number, set, and name are compared.
5741  */
5742 static int
5743 meta_sp_validate_exts(
5744 	mdname_t	*compnp,
5745 	sp_ext_node_t	*wmext,
5746 	sp_ext_node_t	*unitext,
5747 	md_error_t	*ep
5748 )
5749 {
5750 	if (wmext->ext_offset != unitext->ext_offset) {
5751 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5752 		    "%s: unit structure and extent header offsets differ.\n"),
5753 		    compnp->cname);
5754 		return (mdmderror(ep, MDE_RECOVER_FAILED, 0, compnp->cname));
5755 	}
5756 
5757 	if (wmext->ext_length != unitext->ext_length) {
5758 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5759 		    "%s: unit structure and extent header lengths differ.\n"),
5760 		    compnp->cname);
5761 		return (mdmderror(ep, MDE_RECOVER_FAILED, 0, compnp->cname));
5762 	}
5763 
5764 	if (wmext->ext_seq != unitext->ext_seq) {
5765 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5766 		    "%s: unit structure and extent header sequence numbers "
5767 		    "differ.\n"), compnp->cname);
5768 		return (mdmderror(ep, MDE_RECOVER_FAILED, 0, compnp->cname));
5769 	}
5770 
5771 	if (wmext->ext_type != unitext->ext_type) {
5772 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5773 		    "%s: unit structure and extent header types differ.\n"),
5774 		    compnp->cname);
5775 		return (mdmderror(ep, MDE_RECOVER_FAILED, 0, compnp->cname));
5776 	}
5777 
5778 	/*
5779 	 * If one has a set pointer and the other doesn't, error.
5780 	 * If both extents have setnames, then make sure they match
5781 	 * If both are NULL, it's ok, they match.
5782 	 */
5783 	if ((unitext->ext_setp == NULL) ^ (wmext->ext_setp == NULL)) {
5784 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5785 		    "%s: unit structure and extent header set values "
5786 		    "differ.\n"), compnp->cname);
5787 		return (mdmderror(ep, MDE_RECOVER_FAILED, 0, compnp->cname));
5788 	}
5789 
5790 	if (unitext->ext_setp != NULL) {
5791 		if (strcmp(unitext->ext_setp->setname,
5792 		    wmext->ext_setp->setname) != 0) {
5793 			(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5794 			    "%s: unit structure and extent header set names "
5795 			    "differ.\n"), compnp->cname);
5796 			return (mdmderror(ep, MDE_RECOVER_FAILED,
5797 			    0, compnp->cname));
5798 		}
5799 	}
5800 
5801 	/*
5802 	 * If one has a name pointer and the other doesn't, error.
5803 	 * If both extents have names, then make sure they match
5804 	 * If both are NULL, it's ok, they match.
5805 	 */
5806 	if ((unitext->ext_namep == NULL) ^ (wmext->ext_namep == NULL)) {
5807 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5808 		    "%s: unit structure and extent header name values "
5809 		    "differ.\n"), compnp->cname);
5810 		return (mdmderror(ep, MDE_RECOVER_FAILED, 0, compnp->cname));
5811 	}
5812 
5813 	if (unitext->ext_namep != NULL) {
5814 		if (strcmp(wmext->ext_namep->cname,
5815 		    unitext->ext_namep->cname) != 0) {
5816 			(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5817 			    "%s: unit structure and extent header names "
5818 			    "differ.\n"), compnp->cname);
5819 			return (mdmderror(ep, MDE_RECOVER_FAILED,
5820 			    0, compnp->cname));
5821 		}
5822 	}
5823 
5824 	return (0);
5825 }
5826 
5827 /*
5828  * FUNCTION:	update_sp_status()
5829  * INPUT:	sp	- name of set we are recovering in
5830  *		minors	- pointer to an array of soft partition minor numbers
5831  *		num_sps	- number of minor numbers in array
5832  *		status	- new status to be applied to all soft parts in array
5833  *		mn_set	- set if current set is a multi-node set
5834  * OUTPUT:	ep	- return error pointer
5835  * RETURNS:	int	- 0 - success, -1 - error
5836  * PURPOSE:	update  status of soft partitions to new status. minors is an
5837  *		array of minor numbers to apply the new status to.
5838  *		If mn_set is set, a message is sent to all nodes in the
5839  *		cluster to update the status locally.
5840  */
5841 static int
5842 update_sp_status(
5843 	mdsetname_t	*sp,
5844 	minor_t		*minors,
5845 	int		num_sps,
5846 	sp_status_t	status,
5847 	bool_t		mn_set,
5848 	md_error_t	*ep
5849 )
5850 {
5851 	int	i;
5852 	int	err = 0;
5853 
5854 	if (mn_set) {
5855 		md_mn_msg_sp_setstat_t	sp_setstat_params;
5856 		int			result;
5857 		md_mn_result_t		*resp = NULL;
5858 
5859 		for (i = 0; i < num_sps; i++) {
5860 			sp_setstat_params.sp_setstat_mnum = minors[i];
5861 			sp_setstat_params.sp_setstat_status = status;
5862 
5863 			result = mdmn_send_message(sp->setno,
5864 			    MD_MN_MSG_SP_SETSTAT, MD_MSGF_DEFAULT_FLAGS, 0,
5865 			    (char *)&sp_setstat_params,
5866 			    sizeof (sp_setstat_params),
5867 			    &resp, ep);
5868 			if (resp != NULL) {
5869 				if (resp->mmr_exitval != 0)
5870 					err = -1;
5871 				free_result(resp);
5872 			}
5873 			if (result != 0) {
5874 				err = -1;
5875 			}
5876 		}
5877 	} else {
5878 		if (meta_sp_setstatus(sp, minors, num_sps, status, ep) < 0)
5879 			err = -1;
5880 	}
5881 	if (err < 0) {
5882 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5883 		    "Error updating status on recovered soft "
5884 		    "partitions.\n"));
5885 	}
5886 	return (err);
5887 }
5888 
5889 /*
5890  * FUNCTION:	meta_sp_recover_from_wm()
5891  * INPUT:	sp	- name of set we are recovering in
5892  *		compnp	- name pointer for component we are recovering from
5893  *		options	- metarecover options
5894  * OUTPUT:	ep	- return error pointer
5895  * RETURNS:	int	- 0 - success, -1 - error
5896  * PURPOSE:	update metadb records to match watermarks.  begin by getting
5897  *		an extlist representing all soft partitions on the component.
5898  *		then build a unit structure for each soft partition.
5899  *		notify user of changes, then commit each soft partition to
5900  *		the metadb one at a time in the "recovering" state.  update
5901  *		any watermarks that may need it	(to reflect possible name
5902  *		changes), and, finally, set the status of all recovered
5903  *		partitions to the "OK" state at once.
5904  */
5905 static int
5906 meta_sp_recover_from_wm(
5907 	mdsetname_t	*sp,
5908 	mdname_t	*compnp,
5909 	mdcmdopts_t	options,
5910 	md_error_t	*ep
5911 )
5912 {
5913 	sp_ext_node_t		*extlist = NULL;
5914 	sp_ext_node_t		*sp_list = NULL;
5915 	sp_ext_node_t		*update_list = NULL;
5916 	sp_ext_node_t		*ext;
5917 	sp_ext_node_t		*sp_ext;
5918 	mp_unit_t		*mp;
5919 	mp_unit_t		**un_array;
5920 	int			numexts = 0, num_sps = 0, i = 0;
5921 	int			err = 0;
5922 	int			not_recovered = 0;
5923 	int			committed = 0;
5924 	sp_ext_length_t		sp_length = 0LL;
5925 	mdnamelist_t		*keynlp = NULL;
5926 	mdname_t		*np;
5927 	mdname_t		*new_np;
5928 	int			new_name;
5929 	md_set_params_t		set_params;
5930 	minor_t			*minors = NULL;
5931 	char			yesno[255];
5932 	char			*yes;
5933 	bool_t			mn_set = 0;
5934 	md_set_desc		*sd;
5935 	mm_unit_t		*mm;
5936 	md_set_mmown_params_t	*ownpar = NULL;
5937 	int			comp_is_mirror = 0;
5938 
5939 	/*
5940 	 * if this component appears in another metadevice already, do
5941 	 * NOT recover from it.
5942 	 */
5943 	if (meta_check_inmeta(sp, compnp, options, 0, -1, ep) != 0)
5944 		return (-1);
5945 
5946 	/* set flag if dealing with a MN set */
5947 	if (!metaislocalset(sp)) {
5948 		if ((sd = metaget_setdesc(sp, ep)) == NULL) {
5949 			return (-1);
5950 		}
5951 		if (MD_MNSET_DESC(sd))
5952 			mn_set = 1;
5953 	}
5954 	/*
5955 	 * for each watermark, build an ext_node, place on list.
5956 	 */
5957 	if (meta_sp_extlist_from_wm(sp, compnp, &extlist,
5958 	    meta_sp_cmp_by_nameseq, ep) < 0)
5959 		return (mdmderror(ep, MDE_RECOVER_FAILED, 0, compnp->cname));
5960 
5961 	assert(extlist != NULL);
5962 
5963 	/* count number of soft partitions */
5964 	for (ext = extlist;
5965 	    ext != NULL && ext->ext_type == EXTTYP_ALLOC;
5966 	    ext = ext->ext_next) {
5967 		if (ext->ext_next != NULL &&
5968 		    ext->ext_next->ext_namep != NULL &&
5969 		    strcmp(ext->ext_next->ext_namep->cname,
5970 		    ext->ext_namep->cname) == 0)
5971 				continue;
5972 		num_sps++;
5973 	}
5974 
5975 	/* allocate array of unit structure pointers */
5976 	un_array = Zalloc(num_sps * sizeof (mp_unit_t *));
5977 
5978 	/*
5979 	 * build unit structures from list of ext_nodes.
5980 	 */
5981 	for (ext = extlist;
5982 	    ext != NULL && ext->ext_type == EXTTYP_ALLOC;
5983 	    ext = ext->ext_next) {
5984 		meta_sp_list_insert(ext->ext_setp, ext->ext_namep,
5985 		    &sp_list, ext->ext_offset, ext->ext_length,
5986 		    ext->ext_type, ext->ext_seq, ext->ext_flags,
5987 		    meta_sp_cmp_by_nameseq);
5988 
5989 		numexts++;
5990 		sp_length += ext->ext_length - MD_SP_WMSIZE;
5991 
5992 		if (ext->ext_next != NULL &&
5993 		    ext->ext_next->ext_namep != NULL &&
5994 		    strcmp(ext->ext_next->ext_namep->cname,
5995 		    ext->ext_namep->cname) == 0)
5996 				continue;
5997 
5998 		/*
5999 		 * if we made it here, we are at a soft partition
6000 		 * boundary in the list.
6001 		 */
6002 		if (getenv(META_SP_DEBUG)) {
6003 			meta_sp_debug("meta_recover_from_wm: dumping wm "
6004 			    "list:\n");
6005 			meta_sp_list_dump(sp_list);
6006 		}
6007 
6008 		assert(sp_list != NULL);
6009 		assert(sp_list->ext_namep != NULL);
6010 
6011 		if ((new_name = meta_sp_resolve_name_conflict(sp,
6012 		    sp_list->ext_namep, &new_np, ep)) < 0) {
6013 			err = 1;
6014 			goto out;
6015 		} else if (new_name) {
6016 			for (sp_ext = sp_list;
6017 			    sp_ext != NULL;
6018 			    sp_ext = sp_ext->ext_next) {
6019 				/*
6020 				 * insert into the update list for
6021 				 * watermark update.
6022 				 */
6023 				meta_sp_list_insert(sp_ext->ext_setp,
6024 				    new_np, &update_list, sp_ext->ext_offset,
6025 				    sp_ext->ext_length, sp_ext->ext_type,
6026 				    sp_ext->ext_seq, EXTFLG_UPDATE,
6027 				    meta_sp_cmp_by_offset);
6028 			}
6029 
6030 		}
6031 		if (options & MDCMD_DOIT) {
6032 			/* store name in namespace */
6033 			if (mn_set) {
6034 				/* send message to all nodes to return key */
6035 				md_mn_msg_addkeyname_t	*send_params;
6036 				int			result;
6037 				md_mn_result_t		*resp = NULL;
6038 				int			message_size;
6039 
6040 				message_size =  sizeof (*send_params) +
6041 				    strlen(compnp->cname) + 1;
6042 				send_params = Zalloc(message_size);
6043 				send_params->addkeyname_setno = sp->setno;
6044 				(void) strcpy(&send_params->addkeyname_name[0],
6045 				    compnp->cname);
6046 				result = mdmn_send_message(sp->setno,
6047 				    MD_MN_MSG_ADDKEYNAME, MD_MSGF_DEFAULT_FLAGS,
6048 				    0, (char *)send_params, message_size, &resp,
6049 				    ep);
6050 				Free(send_params);
6051 				if (resp != NULL) {
6052 					if (resp->mmr_exitval >= 0) {
6053 						compnp->key =
6054 						    (mdkey_t)resp->mmr_exitval;
6055 					} else {
6056 						err = 1;
6057 						free_result(resp);
6058 						goto out;
6059 					}
6060 					free_result(resp);
6061 				}
6062 				if (result != 0) {
6063 					err = 1;
6064 					goto out;
6065 				}
6066 				(void) metanamelist_append(&keynlp, compnp);
6067 			} else {
6068 				if (add_key_name(sp, compnp, &keynlp,
6069 				    ep) != 0) {
6070 					err = 1;
6071 					goto out;
6072 				}
6073 			}
6074 		}
6075 
6076 		/* create the unit structure */
6077 		if ((mp = meta_sp_createunit(
6078 		    (new_name) ? new_np : sp_list->ext_namep, compnp,
6079 		    sp_list, numexts, sp_length, MD_SP_RECOVER, ep)) == NULL) {
6080 			err = 1;
6081 			goto out;
6082 		}
6083 
6084 		if (getenv(META_SP_DEBUG)) {
6085 			meta_sp_debug("meta_sp_recover_from_wm: "
6086 			    "printing newly created unit structure");
6087 			meta_sp_printunit(mp);
6088 		}
6089 
6090 		/* place in unit structure array */
6091 		un_array[i++] = mp;
6092 
6093 		/* free sp_list */
6094 		meta_sp_list_free(&sp_list);
6095 		sp_list = NULL;
6096 		numexts = 0;
6097 		sp_length = 0LL;
6098 	}
6099 
6100 	/* display configuration updates */
6101 	(void) printf(dgettext(TEXT_DOMAIN,
6102 	    "The following soft partitions were found and will be added to\n"
6103 	    "your metadevice configuration.\n"));
6104 	(void) printf("%5s %15s %18s\n",
6105 	    dgettext(TEXT_DOMAIN, "Name"),
6106 	    dgettext(TEXT_DOMAIN, "Size"),
6107 	    dgettext(TEXT_DOMAIN, "No. of Extents"));
6108 	for (i = 0; i < num_sps; i++) {
6109 		(void) printf("%5s%lu %15llu %9d\n", "d",
6110 		    MD_MIN2UNIT(MD_SID(un_array[i])),
6111 		    un_array[i]->un_length, un_array[i]->un_numexts);
6112 	}
6113 
6114 	if (!(options & MDCMD_DOIT)) {
6115 		not_recovered = 1;
6116 		goto out;
6117 	}
6118 
6119 	/* ask user for confirmation */
6120 	(void) printf(dgettext(TEXT_DOMAIN,
6121 	    "WARNING: You are about to add one or more soft partition\n"
6122 	    "metadevices to your metadevice configuration.  If there\n"
6123 	    "appears to be an error in the soft partition(s) displayed\n"
6124 	    "above, do NOT proceed with this recovery operation.\n"));
6125 	(void) printf(dgettext(TEXT_DOMAIN,
6126 	    "Are you sure you want to do this (yes/no)? "));
6127 
6128 	(void) fflush(stdout);
6129 	if ((fgets(yesno, sizeof (yesno), stdin) == NULL) ||
6130 	    (strlen(yesno) == 1))
6131 		(void) snprintf(yesno, sizeof (yesno), "%s\n",
6132 		    dgettext(TEXT_DOMAIN, "no"));
6133 	yes = dgettext(TEXT_DOMAIN, "yes");
6134 	if (strncasecmp(yesno, yes, strlen(yesno) - 1) != 0) {
6135 		not_recovered = 1;
6136 		goto out;
6137 	}
6138 
6139 	/* commit records one at a time */
6140 	for (i = 0; i < num_sps; i++) {
6141 		(void) memset(&set_params, 0, sizeof (set_params));
6142 		set_params.mnum = MD_SID(un_array[i]);
6143 		set_params.size = (un_array[i])->c.un_size;
6144 		set_params.mdp = (uintptr_t)(un_array[i]);
6145 		set_params.options =
6146 		    meta_check_devicesize(un_array[i]->un_length);
6147 		if (set_params.options == MD_CRO_64BIT) {
6148 			un_array[i]->c.un_revision |= MD_64BIT_META_DEV;
6149 		} else {
6150 			un_array[i]->c.un_revision &= ~MD_64BIT_META_DEV;
6151 		}
6152 		MD_SETDRIVERNAME(&set_params, MD_SP,
6153 		    MD_MIN2SET(set_params.mnum));
6154 
6155 		np = metamnumname(&sp, MD_SID(un_array[i]), 0, ep);
6156 
6157 		/*
6158 		 * If this is an MN set, send the MD_IOCSET ioctl to all nodes
6159 		 */
6160 		if (mn_set) {
6161 			md_mn_msg_iocset_t	send_params;
6162 			int			result;
6163 			md_mn_result_t		*resp = NULL;
6164 			int			mess_size;
6165 
6166 			/*
6167 			 * Calculate message size. md_mn_msg_iocset_t only
6168 			 * contains one extent, so increment the size to
6169 			 * include all extents
6170 			 */
6171 			mess_size = sizeof (send_params) -
6172 			    sizeof (mp_ext_t) +
6173 			    (un_array[i]->un_numexts * sizeof (mp_ext_t));
6174 
6175 			send_params.iocset_params = set_params;
6176 			(void) memcpy(&send_params.unit, un_array[i],
6177 			    sizeof (*un_array[i]) - sizeof (mp_ext_t) +
6178 			    (un_array[i]->un_numexts * sizeof (mp_ext_t)));
6179 			result = mdmn_send_message(sp->setno,
6180 			    MD_MN_MSG_IOCSET, MD_MSGF_DEFAULT_FLAGS, 0,
6181 			    (char *)&send_params, mess_size, &resp,
6182 			    ep);
6183 			if (resp != NULL) {
6184 				if (resp->mmr_exitval != 0)
6185 					err = 1;
6186 				free_result(resp);
6187 			}
6188 			if (result != 0) {
6189 				err = 1;
6190 			}
6191 		} else {
6192 			if (metaioctl(MD_IOCSET, &set_params, &set_params.mde,
6193 			    np->cname) != 0) {
6194 				err = 1;
6195 			}
6196 		}
6197 
6198 		if (err == 1) {
6199 			(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
6200 			    "%s: Error committing record to metadb.\n"),
6201 			    np->cname);
6202 			goto out;
6203 		}
6204 
6205 		/* note that we've committed a record */
6206 		if (!committed)
6207 			committed = 1;
6208 
6209 		/* update any watermarks that need it */
6210 		if (update_list != NULL) {
6211 			md_sp_t *msp;
6212 
6213 			/*
6214 			 * Check to see if we're trying to create a partition
6215 			 * on a mirror. If so we may have to enforce an
6216 			 * ownership change before writing the watermark out.
6217 			 */
6218 			if (metaismeta(compnp)) {
6219 				char *miscname;
6220 
6221 				miscname = metagetmiscname(compnp, ep);
6222 				if (miscname != NULL)
6223 					comp_is_mirror = (strcmp(miscname,
6224 					    MD_MIRROR) == 0);
6225 				else
6226 					comp_is_mirror = 0;
6227 			}
6228 			/*
6229 			 * If this is a MN set and the component is a mirror,
6230 			 * change ownership to this node in order to write the
6231 			 * watermarks
6232 			 */
6233 			if (mn_set && comp_is_mirror) {
6234 				mm = (mm_unit_t *)meta_get_unit(sp, compnp, ep);
6235 				if (mm == NULL) {
6236 					err = 1;
6237 					goto out;
6238 				} else {
6239 					err = meta_mn_change_owner(&ownpar,
6240 					    sp->setno,
6241 					    meta_getminor(compnp->dev),
6242 					    sd->sd_mn_mynode->nd_nodeid,
6243 					    MD_MN_MM_PREVENT_CHANGE |
6244 					    MD_MN_MM_SPAWN_THREAD);
6245 					if (err != 0)
6246 						goto out;
6247 				}
6248 			}
6249 
6250 			if ((msp = meta_get_sp(sp, np, ep)) == NULL) {
6251 				err = 1;
6252 				(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
6253 				    "%s: Error updating extent headers.\n"),
6254 				    np->cname);
6255 				goto out;
6256 			}
6257 			if (meta_sp_update_wm(sp, msp, update_list, ep) < 0) {
6258 				err = 1;
6259 				(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
6260 				    "%s: Error updating extent headers "
6261 				    "on disk.\n"), np->cname);
6262 				goto out;
6263 			}
6264 		}
6265 		/*
6266 		 * If we have changed ownership earlier and prevented any
6267 		 * ownership changes, we can now allow ownership changes
6268 		 * again.
6269 		 */
6270 		if (ownpar) {
6271 			(void) meta_mn_change_owner(&ownpar, sp->setno,
6272 			    ownpar->d.mnum,
6273 			    ownpar->d.owner,
6274 			    MD_MN_MM_ALLOW_CHANGE | MD_MN_MM_SPAWN_THREAD);
6275 		}
6276 	}
6277 
6278 	/* update status of all soft partitions to OK */
6279 	minors = Zalloc(num_sps * sizeof (minor_t));
6280 	for (i = 0; i < num_sps; i++)
6281 		minors[i] = MD_SID(un_array[i]);
6282 
6283 	err = update_sp_status(sp, minors, num_sps, MD_SP_OK, mn_set, ep);
6284 	if (err != 0)
6285 		goto out;
6286 
6287 	if (options & MDCMD_PRINT)
6288 		(void) printf(dgettext(TEXT_DOMAIN, "%s: "
6289 		    "Soft Partitions recovered from device.\n"),
6290 		    compnp->cname);
6291 out:
6292 	/* free memory */
6293 	if (extlist != NULL)
6294 		meta_sp_list_free(&extlist);
6295 	if (sp_list != NULL)
6296 		meta_sp_list_free(&sp_list);
6297 	if (update_list != NULL)
6298 		meta_sp_list_free(&update_list);
6299 	if (un_array != NULL)	{
6300 		for (i = 0; i < num_sps; i++)
6301 			Free(un_array[i]);
6302 		Free(un_array);
6303 	}
6304 	if (minors != NULL)
6305 		Free(minors);
6306 	if (ownpar != NULL)
6307 		Free(ownpar);
6308 	(void) fflush(stdout);
6309 
6310 	if ((keynlp != NULL) && (committed != 1)) {
6311 		/*
6312 		 * if we haven't committed any softparts, either because of an
6313 		 * error or because the user decided not to proceed, delete
6314 		 * namelist key for the component
6315 		 */
6316 		if (mn_set) {
6317 			mdnamelist_t	*p;
6318 
6319 			for (p = keynlp; (p != NULL); p = p->next) {
6320 				mdname_t		*np = p->namep;
6321 				md_mn_msg_delkeyname_t	send_params;
6322 				md_mn_result_t		*resp = NULL;
6323 
6324 				send_params.delkeyname_dev = np->dev;
6325 				send_params.delkeyname_setno = sp->setno;
6326 				send_params.delkeyname_key = np->key;
6327 				(void) mdmn_send_message(sp->setno,
6328 				    MD_MN_MSG_DELKEYNAME, MD_MSGF_DEFAULT_FLAGS,
6329 				    0, (char *)&send_params,
6330 				    sizeof (send_params),
6331 				    &resp, ep);
6332 				if (resp != NULL) {
6333 					free_result(resp);
6334 				}
6335 			}
6336 		} else {
6337 			(void) del_key_names(sp, keynlp, NULL);
6338 		}
6339 	}
6340 
6341 	metafreenamelist(keynlp);
6342 
6343 	if (err)
6344 		return (mdmderror(ep, MDE_RECOVER_FAILED, 0, compnp->cname));
6345 
6346 	if (not_recovered)
6347 		if (options & MDCMD_PRINT)
6348 			(void) printf(dgettext(TEXT_DOMAIN, "%s: "
6349 			    "Soft Partitions NOT recovered from device.\n"),
6350 			    compnp->cname);
6351 	return (0);
6352 }
6353 
6354 /*
6355  * FUNCTION:	meta_sp_recover_from_unit()
6356  * INPUT:	sp	- name of set we are recovering in
6357  *		compnp	- name of component we are recovering from
6358  *		options	- metarecover options
6359  * OUTPUT:	ep	- return error pointer
6360  * RETURNS:	int	- 0 - success, -1 - error
6361  * PURPOSE:	update watermarks to match metadb records.  begin by getting
6362  *		a namelist representing all soft partitions on the specified
6363  *		component.  then, build an extlist representing the soft
6364  *		partitions, filling in the freespace extents.  notify user
6365  *		of changes, place all soft partitions into the "recovering"
6366  *		state and update the watermarks.  finally, return all soft
6367  *		partitions to the "OK" state.
6368  */
6369 static int
6370 meta_sp_recover_from_unit(
6371 	mdsetname_t	*sp,
6372 	mdname_t	*compnp,
6373 	mdcmdopts_t	options,
6374 	md_error_t	*ep
6375 )
6376 {
6377 	mdnamelist_t	*spnlp = NULL;
6378 	mdnamelist_t	*nlp = NULL;
6379 	sp_ext_node_t	*ext = NULL;
6380 	sp_ext_node_t	*extlist = NULL;
6381 	int		count;
6382 	char		yesno[255];
6383 	char		*yes;
6384 	int		rval = 0;
6385 	minor_t		*minors = NULL;
6386 	int		i;
6387 	md_sp_t		*msp;
6388 	md_set_desc	*sd;
6389 	bool_t		mn_set = 0;
6390 	daddr_t		start_block;
6391 
6392 	count = meta_sp_get_by_component(sp, compnp, &spnlp, 0, ep);
6393 	if (count <= 0)
6394 		return (-1);
6395 
6396 	/* set flag if dealing with a MN set */
6397 	if (!metaislocalset(sp)) {
6398 		if ((sd = metaget_setdesc(sp, ep)) == NULL) {
6399 			return (-1);
6400 		}
6401 		if (MD_MNSET_DESC(sd))
6402 			mn_set = 1;
6403 	}
6404 	/*
6405 	 * Save the XDR unit structure for one of the soft partitions;
6406 	 * we'll use this later to provide metadevice context to
6407 	 * update the watermarks so the device can be resolved by
6408 	 * devid instead of dev_t.
6409 	 */
6410 	if ((msp = meta_get_sp(sp, spnlp->namep, ep)) == NULL) {
6411 		metafreenamelist(spnlp);
6412 		return (-1);
6413 	}
6414 
6415 	if ((start_block = meta_sp_get_start(sp, compnp, ep)) ==
6416 	    MD_DISKADDR_ERROR) {
6417 		return (-1);
6418 	}
6419 
6420 	meta_sp_list_insert(NULL, NULL, &extlist, 0ULL, start_block,
6421 	    EXTTYP_RESERVED, 0, 0, meta_sp_cmp_by_offset);
6422 	meta_sp_list_insert(NULL, NULL, &extlist,
6423 	    metagetsize(compnp, ep) - MD_SP_WMSIZE, MD_SP_WMSIZE,
6424 	    EXTTYP_END, 0, EXTFLG_UPDATE, meta_sp_cmp_by_offset);
6425 
6426 	if (meta_sp_extlist_from_namelist(sp, spnlp, &extlist, ep) == -1) {
6427 		metafreenamelist(spnlp);
6428 		return (-1);
6429 	}
6430 
6431 	assert(extlist != NULL);
6432 	if ((options & MDCMD_VERBOSE) != 0) {
6433 		(void) printf(dgettext(TEXT_DOMAIN,
6434 		    "Updating extent headers on device %s from metadb.\n\n"),
6435 		    compnp->cname);
6436 		(void) printf(dgettext(TEXT_DOMAIN,
6437 		    "The following extent headers will be written:\n"));
6438 		meta_sp_display_exthdr();
6439 	}
6440 
6441 	meta_sp_list_freefill(&extlist, metagetsize(compnp, ep));
6442 
6443 	for (ext = extlist; ext != NULL; ext = ext->ext_next) {
6444 
6445 		/* mark every node for updating except the reserved space */
6446 		if (ext->ext_type != EXTTYP_RESERVED) {
6447 			ext->ext_flags |= EXTFLG_UPDATE;
6448 
6449 			/* print extent information */
6450 			if ((options & MDCMD_VERBOSE) != 0)
6451 				meta_sp_display_ext(ext);
6452 		}
6453 	}
6454 
6455 	/* request verification and then update all watermarks */
6456 	if ((options & MDCMD_DOIT) != 0) {
6457 
6458 		(void) printf(dgettext(TEXT_DOMAIN,
6459 		    "\nWARNING: You are about to overwrite portions of %s\n"
6460 		    "with soft partition metadata. The extent headers will be\n"
6461 		    "written to match the existing metadb configuration.  If\n"
6462 		    "the device was not previously setup with this\n"
6463 		    "configuration, data loss may result.\n\n"),
6464 		    compnp->cname);
6465 		(void) printf(dgettext(TEXT_DOMAIN,
6466 		    "Are you sure you want to do this (yes/no)? "));
6467 
6468 		(void) fflush(stdout);
6469 		if ((fgets(yesno, sizeof (yesno), stdin) == NULL) ||
6470 		    (strlen(yesno) == 1))
6471 			(void) snprintf(yesno, sizeof (yesno),
6472 			    "%s\n", dgettext(TEXT_DOMAIN, "no"));
6473 		yes = dgettext(TEXT_DOMAIN, "yes");
6474 		if (strncasecmp(yesno, yes, strlen(yesno) - 1) == 0) {
6475 			/* place soft partitions into recovering state */
6476 			minors = Zalloc(count * sizeof (minor_t));
6477 			for (nlp = spnlp, i = 0;
6478 			    nlp != NULL && i < count;
6479 			    nlp = nlp->next, i++) {
6480 				assert(nlp->namep != NULL);
6481 				minors[i] = meta_getminor(nlp->namep->dev);
6482 			}
6483 			if (update_sp_status(sp, minors, count,
6484 			    MD_SP_RECOVER, mn_set, ep) != 0) {
6485 				rval = -1;
6486 				goto out;
6487 			}
6488 
6489 			/* update the watermarks */
6490 			if (meta_sp_update_wm(sp, msp, extlist, ep) < 0) {
6491 				rval = -1;
6492 				goto out;
6493 			}
6494 
6495 			if (options & MDCMD_PRINT) {
6496 				(void) printf(dgettext(TEXT_DOMAIN, "%s: "
6497 				    "Soft Partitions recovered from metadb\n"),
6498 				    compnp->cname);
6499 			}
6500 
6501 			/* return soft partitions to the OK state */
6502 			if (update_sp_status(sp, minors, count,
6503 			    MD_SP_OK, mn_set, ep) != 0) {
6504 				rval = -1;
6505 				goto out;
6506 			}
6507 
6508 			rval = 0;
6509 			goto out;
6510 		}
6511 	}
6512 
6513 	if (options & MDCMD_PRINT) {
6514 		(void) printf(dgettext(TEXT_DOMAIN,
6515 		    "%s: Soft Partitions NOT recovered from metadb\n"),
6516 		    compnp->cname);
6517 	}
6518 
6519 out:
6520 	if (minors != NULL)
6521 		Free(minors);
6522 	metafreenamelist(spnlp);
6523 	meta_sp_list_free(&extlist);
6524 	(void) fflush(stdout);
6525 	return (rval);
6526 }
6527 
6528 
6529 /*
6530  * FUNCTION:	meta_sp_update_abr()
6531  * INPUT:	sp	- name of set we are recovering in
6532  * OUTPUT:	ep	- return error pointer
6533  * RETURNS:	int	- 0 - success, -1 - error
6534  * PURPOSE:	update the ABR state for all soft partitions in the set. This
6535  *		is called when joining a set. It sends a message to the master
6536  *		node for each soft partition to get the value of tstate and
6537  *		then sets ABR ,if required, by opening the sp, setting ABR
6538  *		and then closing the sp. This approach is taken rather that
6539  *		just issuing the MD_MN_SET_CAP ioctl, in order to deal with
6540  *		the case when we have another node simultaneously unsetting ABR.
6541  */
6542 int
6543 meta_sp_update_abr(
6544 	mdsetname_t	*sp,
6545 	md_error_t	*ep
6546 )
6547 {
6548 	mdnamelist_t	*devnlp = NULL;
6549 	mdnamelist_t	*p;
6550 	mdname_t	*devnp = NULL;
6551 	md_unit_t	*un;
6552 	char		fname[MAXPATHLEN];
6553 	int		mnum, fd;
6554 	volcap_t	vc;
6555 	uint_t		tstate;
6556 
6557 
6558 	if (meta_get_sp_names(sp, &devnlp, 0, ep) < 0) {
6559 		return (-1);
6560 	}
6561 
6562 	/* Exit if no soft partitions in this set */
6563 	if (devnlp == NULL)
6564 		return (0);
6565 
6566 	/* For each soft partition */
6567 	for (p = devnlp; (p != NULL); p = p->next) {
6568 		devnp = p->namep;
6569 
6570 		/* check if this is a top level metadevice */
6571 		if ((un = meta_get_mdunit(sp, devnp, ep)) == NULL)
6572 			goto out;
6573 		if (MD_HAS_PARENT(MD_PARENT(un))) {
6574 			Free(un);
6575 			continue;
6576 		}
6577 		Free(un);
6578 
6579 		/* Get tstate from Master */
6580 		if (meta_mn_send_get_tstate(devnp->dev, &tstate, ep) != 0) {
6581 			mdname_t	*np;
6582 			np = metamnumname(&sp, meta_getminor(devnp->dev), 0,
6583 			    ep);
6584 			if (np) {
6585 				md_perror(dgettext(TEXT_DOMAIN,
6586 				    "Unable to get tstate for %s"), np->cname);
6587 			}
6588 			continue;
6589 		}
6590 		/* If not set on the master, nothing to do */
6591 		if (!(tstate & MD_ABR_CAP))
6592 			continue;
6593 
6594 		mnum = meta_getminor(devnp->dev);
6595 		(void) snprintf(fname, MAXPATHLEN, "/dev/md/%s/rdsk/d%u",
6596 		    sp->setname, (unsigned)MD_MIN2UNIT(mnum));
6597 		if ((fd = open(fname, O_RDWR, 0)) < 0) {
6598 			md_perror(dgettext(TEXT_DOMAIN,
6599 			    "Could not open device %s"), fname);
6600 			continue;
6601 		}
6602 
6603 		/* Set ABR state */
6604 		vc.vc_info = 0;
6605 		vc.vc_set = 0;
6606 		if (ioctl(fd, DKIOCGETVOLCAP, &vc) < 0) {
6607 			(void) close(fd);
6608 			continue;
6609 		}
6610 
6611 		vc.vc_set = DKV_ABR_CAP;
6612 		if (ioctl(fd, DKIOCSETVOLCAP, &vc) < 0) {
6613 			(void) close(fd);
6614 			goto out;
6615 		}
6616 
6617 		(void) close(fd);
6618 	}
6619 	metafreenamelist(devnlp);
6620 	return (0);
6621 out:
6622 	metafreenamelist(devnlp);
6623 	return (-1);
6624 }
6625 
6626 /*
6627  * FUNCTION:	meta_mn_sp_update_abr()
6628  * INPUT:	arg	- Given set.
6629  * PURPOSE:	update the ABR state for all soft partitions in the set by
6630  *		forking a process to call meta_sp_update_abr()
6631  *		This function is only called via rpc.metad when adding a node
6632  *		to a set, ie this node is beong joined to the set by another
6633  *		node.
6634  */
6635 void *
6636 meta_mn_sp_update_abr(void *arg)
6637 {
6638 	set_t		setno = *((set_t *)arg);
6639 	mdsetname_t	*sp;
6640 	md_error_t	mde = mdnullerror;
6641 	int		fval;
6642 
6643 	/* should have a set */
6644 	assert(setno != NULL);
6645 
6646 	if ((sp = metasetnosetname(setno, &mde)) == NULL) {
6647 		mde_perror(&mde, "");
6648 		return (NULL);
6649 	}
6650 
6651 	if (!(meta_is_mn_set(sp, &mde))) {
6652 		mde_perror(&mde, "");
6653 		return (NULL);
6654 	}
6655 
6656 	/* fork a process */
6657 	if ((fval = md_daemonize(sp, &mde)) != 0) {
6658 		/*
6659 		 * md_daemonize will fork off a process.  The is the
6660 		 * parent or error.
6661 		 */
6662 		if (fval > 0) {
6663 			return (NULL);
6664 		}
6665 		mde_perror(&mde, "");
6666 		return (NULL);
6667 	}
6668 	/*
6669 	 * Child process should never return back to rpc.metad, but
6670 	 * should exit.
6671 	 * Flush all internally cached data inherited from parent process
6672 	 * since cached data will be cleared when parent process RPC request
6673 	 * has completed (which is possibly before this child process
6674 	 * can complete).
6675 	 * Child process can retrieve and cache its own copy of data from
6676 	 * rpc.metad that won't be changed by the parent process.
6677 	 *
6678 	 * Reset md_in_daemon since this child will be a client of rpc.metad
6679 	 * not part of the rpc.metad daemon itself.
6680 	 * md_in_daemon is used by rpc.metad so that libmeta can tell if
6681 	 * this thread is rpc.metad or any other thread.  (If this thread
6682 	 * was rpc.metad it could use some short circuit code to get data
6683 	 * directly from rpc.metad instead of doing an RPC call to rpc.metad).
6684 	 */
6685 	md_in_daemon = 0;
6686 	metaflushsetname(sp);
6687 	sr_cache_flush_setno(setno);
6688 	if ((sp = metasetnosetname(setno, &mde)) == NULL) {
6689 		mde_perror(&mde, "");
6690 		md_exit(sp, 1);
6691 	}
6692 
6693 
6694 	/*
6695 	 * Closing stdin/out/err here.
6696 	 */
6697 	(void) close(0);
6698 	(void) close(1);
6699 	(void) close(2);
6700 	assert(fval == 0);
6701 
6702 	(void) meta_sp_update_abr(sp, &mde);
6703 
6704 	md_exit(sp, 0);
6705 	/*NOTREACHED*/
6706 	return (NULL);
6707 }
6708 
6709 int
6710 meta_sp_check_component(
6711 	mdsetname_t	*sp,
6712 	mdname_t	*np,
6713 	md_error_t	*ep
6714 )
6715 {
6716 	md_sp_t	*msp;
6717 	minor_t	mnum = 0;
6718 	md_dev64_t	dev = 0;
6719 	mdnm_params_t	nm;
6720 	md_getdevs_params_t	mgd;
6721 	side_t	sideno;
6722 	char	*miscname;
6723 	md_dev64_t	*mydev = NULL;
6724 	char	*pname = NULL, *t;
6725 	char	*ctd_name = NULL;
6726 	char	*devname = NULL;
6727 	int	len;
6728 	int	rval = -1;
6729 
6730 	(void) memset(&nm, '\0', sizeof (nm));
6731 	if ((msp = meta_get_sp_common(sp, np, 0, ep)) == NULL)
6732 		return (-1);
6733 
6734 	if ((miscname = metagetmiscname(np, ep)) == NULL)
6735 		return (-1);
6736 
6737 	sideno = getmyside(sp, ep);
6738 
6739 	meta_sp_debug("meta_sp_check_component: %s is on %s key: %d"
6740 	    " dev: %llu\n",
6741 	    np->cname, msp->compnamep->cname, msp->compnamep->key,
6742 	    msp->compnamep->dev);
6743 
6744 	/*
6745 	 * Now get the data from the unit structure. The compnamep stuff
6746 	 * contains the data from the namespace and we need the un_dev
6747 	 * from the unit structure.
6748 	 */
6749 	(void) memset(&mgd, '\0', sizeof (mgd));
6750 	MD_SETDRIVERNAME(&mgd, miscname, sp->setno);
6751 	mgd.cnt = 1;		    /* sp's only have one subdevice */
6752 	mgd.mnum = meta_getminor(np->dev);
6753 
6754 	mydev = Zalloc(sizeof (*mydev));
6755 	mgd.devs = (uintptr_t)mydev;
6756 
6757 	if (metaioctl(MD_IOCGET_DEVS, &mgd, &mgd.mde, np->cname) != 0) {
6758 		meta_sp_debug("meta_sp_check_component: ioctl failed\n");
6759 		(void) mdstealerror(ep, &mgd.mde);
6760 		rval = 0;
6761 		goto out;
6762 	} else if (mgd.cnt <= 0) {
6763 		assert(mgd.cnt >= 0);
6764 		rval = 0;
6765 		goto out;
6766 	}
6767 
6768 	/* Get the devname from the name space. */
6769 	if ((devname = meta_getnmentbykey(sp->setno, sideno,
6770 	    msp->compnamep->key, NULL, &mnum, &dev, ep)) == NULL) {
6771 		meta_sp_debug("meta_sp_check_component: key %d not"
6772 		    "found\n", msp->compnamep->key);
6773 		goto out;
6774 	}
6775 
6776 	meta_sp_debug("dev %s from component: (%lu, %lu)\n",
6777 	    devname,
6778 	    meta_getmajor(*mydev),
6779 	    meta_getminor(*mydev));
6780 	meta_sp_debug("minor from the namespace: %lu\n", mnum);
6781 
6782 	if (mnum != meta_getminor(*mydev)) {
6783 		/*
6784 		 * The minor numbers are different. Update the namespace
6785 		 * with the information from the component.
6786 		 */
6787 
6788 		t = strrchr(devname, '/');
6789 		t++;
6790 		ctd_name = Strdup(t);
6791 
6792 		meta_sp_debug("meta_sp_check_component: ctd_name: %s\n",
6793 		    ctd_name);
6794 
6795 		len = strlen(devname);
6796 		t = strrchr(devname, '/');
6797 		t++;
6798 		pname = Zalloc((len - strlen(t)) + 1);
6799 		(void) strncpy(pname, devname, (len - strlen(t)));
6800 		meta_sp_debug("pathname: %s\n", pname);
6801 
6802 		meta_sp_debug("updating the minor number to %lu\n", nm.mnum);
6803 
6804 		if (meta_update_namespace(sp->setno, sideno,
6805 		    ctd_name, *mydev, msp->compnamep->key, pname,
6806 		    ep) != 0) {
6807 			goto out;
6808 		}
6809 	}
6810 out:
6811 	if (pname != NULL)
6812 		Free(pname);
6813 	if (ctd_name != NULL)
6814 		Free(ctd_name);
6815 	if (devname != NULL)
6816 		Free(devname);
6817 	if (mydev != NULL)
6818 		Free(mydev);
6819 	return (rval);
6820 }
6821