xref: /titanic_41/usr/src/lib/lvm/libmeta/common/meta_sp.c (revision e4d060fb4c00d44cd578713eb9a921f594b733b8)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * Just in case we're not in a build environment, make sure that
29  * TEXT_DOMAIN gets set to something.
30  */
31 #if !defined(TEXT_DOMAIN)
32 #define	TEXT_DOMAIN "SYS_TEST"
33 #endif
34 
35 /*
36  * soft partition operations
37  *
38  * Soft Partitions provide a virtual disk mechanism which is used to
39  * divide a large volume into many small pieces, each appearing as a
40  * separate device.  A soft partition consists of a series of extents,
41  * each having an offset and a length.  The extents are logically
42  * contiguous, so where the first extent leaves off the second extent
43  * picks up.  Which extent a given "virtual offset" belongs to is
44  * dependent on the size of all the previous extents in the soft
45  * partition.
46  *
47  * Soft partitions are represented in memory by an extent node
48  * (sp_ext_node_t) which contains all of the information necessary to
49  * create a unit structure and update the on-disk format, called
50  * "watermarks".  These extent nodes are typically kept in a doubly
51  * linked list and are manipulated by list manipulation routines.  A
52  * list of extents may represent all of the soft partitions on a volume,
53  * a single soft partition, or perhaps just a set of extents that need
 * to be updated.  Extent lists may be sorted by offset or by name/seq#,
55  * depending on which compare function is used.  Most of the routines
56  * require the list be sorted by offset to work, and that's the typical
57  * configuration.
58  *
59  * In order to do an allocation, knowledge of all soft partitions on the
60  * volume is required.  Then free space is determined from the space
61  * that is not allocated, and new allocations can be made from the free
62  * space.  Once the new allocations are made, a unit structure is created
63  * and the watermarks are updated.  The status is then changed to "okay"
64  * on the unit structure to commit the transaction.  If updating the
65  * watermarks fails, the unit structure is in an intermediate state and
66  * the driver will not allow access to the device.
67  *
68  * A typical sequence of events is:
69  *     1. Fetch the list of names for all soft partitions on a volume
70  *         meta_sp_get_by_component()
71  *     2. Construct an extent list from the name list
72  *         meta_sp_extlist_from_namelist()
73  *     3. Fill the gaps in the extent list with free extents
74  *         meta_sp_list_freefill()
75  *     4. Allocate from the free extents
76  *         meta_sp_alloc_by_len()
77  *         meta_sp_alloc_by_list()
78  *     5. Create the unit structure from the extent list
79  *         meta_sp_createunit()
80  *         meta_sp_updateunit()
81  *     6. Write out the watermarks
82  *         meta_sp_update_wm()
83  *     7. Set the status to "Okay"
84  *         meta_sp_setstatus()
85  *
86  */
87 
88 #include <stdio.h>
89 #include <meta.h>
90 #include "meta_repartition.h"
91 #include <sys/lvm/md_sp.h>
92 #include <sys/lvm/md_crc.h>
93 #include <strings.h>
94 #include <sys/lvm/md_mirror.h>
95 #include <sys/bitmap.h>
96 
97 extern int	md_in_daemon;
98 
/*
 * In-memory description of one extent.  Nodes are chained into a doubly
 * linked list through ext_next/ext_prev; meta_sp_list_insert() allocates
 * and links them and meta_sp_list_free() releases a whole list.
 */
typedef struct sp_ext_node {
	struct sp_ext_node	*ext_next;	/* next element, NULL at tail */
	struct sp_ext_node	*ext_prev;	/* previous element, NULL at head */
	sp_ext_type_t		ext_type;	/* type of extent */
	sp_ext_offset_t		ext_offset;	/* starting offset, in sectors */
	sp_ext_length_t		ext_length;	/* length of this node, in sectors */
	uint_t			ext_flags;	/* extent flags (EXTFLG_*) */
	uint32_t		ext_seq;	/* watermark seq no */
	mdname_t		*ext_namep;	/* name pointer, may be NULL */
	mdsetname_t		*ext_setp;	/* set pointer */
} sp_ext_node_t;
110 
/*
 * extent flags (stored in sp_ext_node_t.ext_flags)
 * NOTE(review): EXTFLG_UPDATE presumably marks an extent that still has
 * to be written out; its consumers are outside this excerpt -- confirm.
 */
#define	EXTFLG_UPDATE	(1)

/*
 * Extent node compare function for list sorting.  Returns <0 if the first
 * node sorts before the second, >0 if after, and 0 if they are equal
 * (see meta_sp_cmp_by_nameseq() and meta_sp_cmp_by_offset()).
 */
typedef int (*ext_cmpfunc_t)(sp_ext_node_t *, sp_ext_node_t *);
116 
117 
118 /* Function Prototypes */
119 
120 /* Debugging Functions */
121 static void meta_sp_debug(char *format, ...);
122 static void meta_sp_printunit(mp_unit_t *mp);
123 
124 /* Misc Support Functions */
125 int meta_sp_parsesize(char *s, sp_ext_length_t *szp);
126 static int meta_sp_parsesizestring(char *s, sp_ext_length_t *szp);
127 static int meta_sp_setgeom(mdname_t *np, mdname_t *compnp, mp_unit_t *mp,
128 	md_error_t *ep);
129 static int meta_sp_get_by_component(mdsetname_t *sp, mdname_t *compnp,
130     mdnamelist_t **nlpp, int force, md_error_t *ep);
131 static sp_ext_length_t meta_sp_get_default_alignment(mdsetname_t *sp,
132     mdname_t *compnp, md_error_t *ep);
133 
134 /* Extent List Manipulation Functions */
135 static int meta_sp_cmp_by_nameseq(sp_ext_node_t *e1, sp_ext_node_t *e2);
136 static int meta_sp_cmp_by_offset(sp_ext_node_t *e1, sp_ext_node_t *e2);
137 static void meta_sp_list_insert(mdsetname_t *sp, mdname_t *np,
138     sp_ext_node_t **head, sp_ext_offset_t offset, sp_ext_length_t length,
139     sp_ext_type_t type, uint_t seq, uint_t flags, ext_cmpfunc_t compare);
140 static void meta_sp_list_free(sp_ext_node_t **head);
141 static void meta_sp_list_remove(sp_ext_node_t **head, sp_ext_node_t *ext);
142 static sp_ext_length_t meta_sp_list_size(sp_ext_node_t *head,
143     sp_ext_type_t exttype, int exclude_wm);
144 static sp_ext_node_t *meta_sp_list_find(sp_ext_node_t *head,
145     sp_ext_offset_t offset);
146 static void meta_sp_list_freefill(sp_ext_node_t **extlist,
147     sp_ext_length_t size);
148 static void meta_sp_list_dump(sp_ext_node_t *head);
149 static int meta_sp_list_overlaps(sp_ext_node_t *head);
150 
151 /* Extent List Query Functions */
152 static boolean_t meta_sp_enough_space(int desired_number_of_sps,
153 	blkcnt_t desired_sp_size, sp_ext_node_t **extent_listpp,
154 	sp_ext_length_t alignment);
155 static boolean_t meta_sp_get_extent_list(mdsetname_t *mdsetnamep,
156 	mdname_t *device_mdnamep, sp_ext_node_t **extent_listpp,
157 	md_error_t *ep);
158 static boolean_t meta_sp_get_extent_list_for_drive(mdsetname_t *mdsetnamep,
159 	mddrivename_t *mddrivenamep, sp_ext_node_t **extent_listpp);
160 
161 
162 /* Extent Allocation Functions */
163 static void meta_sp_alloc_by_ext(mdsetname_t *sp, mdname_t *np,
164     sp_ext_node_t **extlist, sp_ext_node_t *free_ext,
165     sp_ext_offset_t alloc_offset, sp_ext_length_t alloc_length, uint_t seq);
166 static int meta_sp_alloc_by_len(mdsetname_t *sp, mdname_t *np,
167     sp_ext_node_t **extlist, sp_ext_length_t *lp,
168     sp_ext_offset_t last_off, sp_ext_length_t alignment);
169 static int meta_sp_alloc_by_list(mdsetname_t *sp, mdname_t *np,
170     sp_ext_node_t **extlist, sp_ext_node_t *oblist);
171 
172 /* Extent List Population Functions */
173 static int meta_sp_extlist_from_namelist(mdsetname_t *sp, mdnamelist_t *spnlp,
174     sp_ext_node_t **extlist, md_error_t *ep);
175 static int meta_sp_extlist_from_wm(mdsetname_t *sp, mdname_t *compnp,
176     sp_ext_node_t **extlist, ext_cmpfunc_t compare, md_error_t *ep);
177 
178 /* Print (metastat) Functions */
179 static int meta_sp_short_print(md_sp_t *msp, char *fname, FILE *fp,
180     mdprtopts_t options, md_error_t *ep);
181 static char *meta_sp_status_to_name(xsp_status_t xsp_status, uint_t tstate);
182 static int meta_sp_report(mdsetname_t *sp, md_sp_t *msp, mdnamelist_t **nlpp,
183     char *fname, FILE *fp, mdprtopts_t options, md_error_t *ep);
184 
185 /* Watermark Manipulation Functions */
186 static int meta_sp_update_wm(mdsetname_t *sp, md_sp_t *msp,
187     sp_ext_node_t *extlist, md_error_t *ep);
188 static int meta_sp_clear_wm(mdsetname_t *sp, md_sp_t *msp, md_error_t *ep);
189 static int meta_sp_read_wm(mdsetname_t *sp, mdname_t *compnp,
190     mp_watermark_t *wm, sp_ext_offset_t offset,  md_error_t *ep);
191 static diskaddr_t meta_sp_get_start(mdsetname_t *sp, mdname_t *compnp,
192     md_error_t *ep);
193 
194 /* Unit Structure Manipulation Functions */
195 static void meta_sp_fillextarray(mp_unit_t *mp, sp_ext_node_t *extlist);
196 static mp_unit_t *meta_sp_createunit(mdname_t *np, mdname_t *compnp,
197     sp_ext_node_t *extlist, int numexts, sp_ext_length_t len,
198     sp_status_t status, md_error_t *ep);
199 static mp_unit_t *meta_sp_updateunit(mdname_t *np,  mp_unit_t *old_un,
200     sp_ext_node_t *extlist, sp_ext_length_t grow_len, int numexts,
201     md_error_t *ep);
202 static int meta_create_sp(mdsetname_t *sp, md_sp_t *msp, sp_ext_node_t *oblist,
203     mdcmdopts_t options, sp_ext_length_t alignment, md_error_t *ep);
204 static int meta_check_sp(mdsetname_t *sp, md_sp_t *msp, mdcmdopts_t options,
205     int *repart_options, md_error_t *ep);
206 
207 /* Reset (metaclear) Functions */
208 static int meta_sp_reset_common(mdsetname_t *sp, mdname_t *np, md_sp_t *msp,
209     md_sp_reset_t reset_params, mdcmdopts_t options, md_error_t *ep);
210 
211 /* Recovery (metarecover) Functions */
212 static void meta_sp_display_exthdr(void);
213 static void meta_sp_display_ext(sp_ext_node_t *ext);
214 static int meta_sp_checkseq(sp_ext_node_t *extlist);
215 static int meta_sp_resolve_name_conflict(mdsetname_t *, mdname_t *,
216     mdname_t **, md_error_t *);
217 static int meta_sp_validate_wm(mdsetname_t *sp, mdname_t *np,
218     mdcmdopts_t options, md_error_t *ep);
219 static int meta_sp_validate_unit(mdsetname_t *sp, mdname_t *compnp,
220     mdcmdopts_t options, md_error_t *ep);
221 static int meta_sp_validate_wm_and_unit(mdsetname_t *sp, mdname_t *np,
222     mdcmdopts_t options, md_error_t *ep);
223 static int meta_sp_validate_exts(mdname_t *np, sp_ext_node_t *wmext,
224     sp_ext_node_t *unitext, md_error_t *ep);
225 static int meta_sp_recover_from_wm(mdsetname_t *sp, mdname_t *compnp,
226     mdcmdopts_t options, md_error_t *ep);
227 static int meta_sp_recover_from_unit(mdsetname_t *sp, mdname_t *np,
228     mdcmdopts_t options, md_error_t *ep);
229 
/*
 * Private Constants
 *
 * Named stand-ins for otherwise anonymous literal arguments.  Most of
 * their users are outside this excerpt, so the notes below are hedged.
 */

/* NOTE(review): presumably a "force" value for meta_sp_get_by_component() */
static const int FORCE_RELOAD_CACHE = 1;
/* no extent flags set (see EXTFLG_*) */
static const uint_t NO_FLAGS = 0;
/* NOTE(review): presumably "no previous offset" for allocation -- confirm */
static const sp_ext_offset_t NO_OFFSET = 0ULL;
/* NOTE(review): presumably for extents with no watermark seq no -- confirm */
static const uint_t NO_SEQUENCE_NUMBER = 0;
/* NOTE(review): presumably a count argument of exactly one -- confirm */
static const int ONE_SOFT_PARTITION = 1;

/*
 * One pointer per set (MD_MAXSETS entries).
 * NOTE(review): presumably a per-set bitmap of parents already printed;
 * it is not referenced in this excerpt -- confirm.
 */
static unsigned long *sp_parent_printed[MD_MAXSETS];

/* NULL placeholders; NOTE(review): presumably for trial allocations */
#define	TEST_SOFT_PARTITION_NAMEP NULL
#define	TEST_SETNAMEP NULL

/* NOTE(review): presumably values for meta_sp_list_size()'s exclude_wm */
#define	EXCLUDE_WM	(1)
#define	INCLUDE_WM	(0)

/* "no alignment requirement"; see meta_sp_get_default_alignment() */
#define	SP_UNALIGNED	(0LL)
249 
250 /*
251  * **************************************************************************
252  *                          Debugging Functions                             *
253  * **************************************************************************
254  */
255 
256 /*PRINTFLIKE1*/
257 static void
258 meta_sp_debug(char *format, ...)
259 {
260 	static int debug;
261 	static int debug_set = 0;
262 	va_list ap;
263 
264 	if (!debug_set) {
265 		debug = getenv(META_SP_DEBUG) ? 1 : 0;
266 		debug_set = 1;
267 	}
268 
269 	if (debug) {
270 		va_start(ap, format);
271 		(void) vfprintf(stderr, format, ap);
272 		va_end(ap);
273 	}
274 }
275 
276 static void
277 meta_sp_printunit(mp_unit_t *mp)
278 {
279 	int i;
280 
281 	if (mp == NULL)
282 		return;
283 
284 	/* print the common fields we know about */
285 	(void) fprintf(stderr, "\tmp->c.un_type: %d\n", mp->c.un_type);
286 	(void) fprintf(stderr, "\tmp->c.un_size: %u\n", mp->c.un_size);
287 	(void) fprintf(stderr, "\tmp->c.un_self_id: %lu\n", MD_SID(mp));
288 
289 	/* sp-specific fields */
290 	(void) fprintf(stderr, "\tmp->un_status: %u\n", mp->un_status);
291 	(void) fprintf(stderr, "\tmp->un_numexts: %u\n", mp->un_numexts);
292 	(void) fprintf(stderr, "\tmp->un_length: %llu\n", mp->un_length);
293 	(void) fprintf(stderr, "\tmp->un_dev(32): 0x%llx\n", mp->un_dev);
294 	(void) fprintf(stderr, "\tmp->un_dev(64): 0x%llx\n", mp->un_dev);
295 	(void) fprintf(stderr, "\tmp->un_key: %d\n", mp->un_key);
296 
297 	/* print extent information */
298 	(void) fprintf(stderr, "\tExt#\tvoff\t\tpoff\t\tLen\n");
299 	for (i = 0; i < mp->un_numexts; i++) {
300 		(void) fprintf(stderr, "\t%d\t%llu\t\t%llu\t\t%llu\n", i,
301 		    mp->un_ext[i].un_voff, mp->un_ext[i].un_poff,
302 		    mp->un_ext[i].un_len);
303 	}
304 }
305 
306 /*
307  * FUNCTION:    meta_sp_parsesize()
308  * INPUT:       s       - the string to parse
309  * OUTPUT:      *szp    - disk block count (0 for "all")
310  * RETURNS:     -1 for error, 0 for success
311  * PURPOSE:     parses the command line parameter that specifies the
312  *              requested size of a soft partition.  The input string
313  *              is either the literal "all" or a numeric value
314  *              followed by a single character, b for disk blocks, k
315  *              for kilobytes, m for megabytes, g for gigabytes, or t
316  *              for terabytes.  p for petabytes and e for exabytes
317  *              have been added as undocumented features for future
318  *              expansion.  For example, 100m is 100 megabytes, while
319  *              50g is 50 gigabytes.  All values are rounded up to the
320  *              nearest block size.
321  */
322 int
323 meta_sp_parsesize(char *s, sp_ext_length_t *szp)
324 {
325 	if (s == NULL || szp == NULL) {
326 		return (-1);
327 	}
328 
329 	/* Check for literal "all" */
330 	if (strcasecmp(s, "all") == 0) {
331 		*szp = 0;
332 		return (0);
333 	}
334 
335 	return (meta_sp_parsesizestring(s, szp));
336 }
337 
338 /*
339  * FUNCTION:	meta_sp_parsesizestring()
340  * INPUT:	s	- the string to parse
341  * OUTPUT:	*szp	- disk block count
342  * RETURNS:	-1 for error, 0 for success
343  * PURPOSE:	parses a string that specifies size. The input string is a
344  *		numeric value followed by a single character, b for disk blocks,
345  *		k for kilobytes, m for megabytes, g for gigabytes, or t for
346  *		terabytes.  p for petabytes and e for exabytes have been added
347  *		as undocumented features for future expansion.  For example,
348  *		100m is 100 megabytes, while 50g is 50 gigabytes.  All values
349  *		are rounded up to the nearest block size.
350  */
351 static int
352 meta_sp_parsesizestring(char *s, sp_ext_length_t *szp)
353 {
354 	sp_ext_length_t	len = 0;
355 	char		len_type[2];
356 
357 	if (s == NULL || szp == NULL) {
358 		return (-1);
359 	}
360 
361 	/*
362 	 * make sure block offset does not overflow 2^64 bytes.
363 	 */
364 	if ((sscanf(s, "%llu%1[BbKkMmGgTt]", &len, len_type) != 2) ||
365 	    (len == 0LL) ||
366 	    (len > (1LL << (64 - DEV_BSHIFT))))
367 		return (-1);
368 
369 	switch (len_type[0]) {
370 	case 'B':
371 	case 'b':
372 		len = lbtodb(roundup(len * DEV_BSIZE, DEV_BSIZE));
373 		break;
374 	case 'K':
375 	case 'k':
376 		len = lbtodb(roundup(len * 1024ULL, DEV_BSIZE));
377 		break;
378 	case 'M':
379 	case 'm':
380 		len = lbtodb(roundup(len * 1024ULL*1024ULL, DEV_BSIZE));
381 		break;
382 	case 'g':
383 	case 'G':
384 		len = lbtodb(roundup(len * 1024ULL*1024ULL*1024ULL, DEV_BSIZE));
385 		break;
386 	case 't':
387 	case 'T':
388 		len = lbtodb(roundup(len * 1024ULL*1024ULL*1024ULL*1024ULL,
389 		    DEV_BSIZE));
390 		break;
391 	case 'p':
392 	case 'P':
393 		len = lbtodb(roundup(
394 		    len * 1024ULL*1024ULL*1024ULL*1024ULL*1024ULL,
395 		    DEV_BSIZE));
396 		break;
397 	case 'e':
398 	case 'E':
399 		len = lbtodb(roundup(
400 		    len * 1024ULL*1024ULL*1024ULL*1024ULL*1024ULL*1024ULL,
401 		    DEV_BSIZE));
402 		break;
403 	default:
404 		/* error */
405 		return (-1);
406 	}
407 
408 	*szp = len;
409 	return (0);
410 }
411 
412 /*
413  * FUNCTION:	meta_sp_setgeom()
414  * INPUT:	np      - the underlying device to setup geometry for
415  *		compnp	- the underlying device to setup geometry for
416  *		mp	- the unit structure to set the geometry for
417  * OUTPUT:	ep	- return error pointer
418  * RETURNS:	int	- -1 if error, 0 otherwise
419  * PURPOSE:	establishes geometry information for a device
420  */
421 static int
422 meta_sp_setgeom(
423 	mdname_t	*np,
424 	mdname_t	*compnp,
425 	mp_unit_t	*mp,
426 	md_error_t	*ep
427 )
428 {
429 	mdgeom_t	*geomp;
430 	uint_t		round_cyl = 0;
431 
432 	if ((geomp = metagetgeom(compnp, ep)) == NULL)
433 		return (-1);
434 	if (meta_setup_geom((md_unit_t *)mp, np, geomp, geomp->write_reinstruct,
435 	    geomp->read_reinstruct, round_cyl, ep) != 0)
436 		return (-1);
437 
438 	return (0);
439 }
440 
441 /*
442  * FUNCTION:	meta_sp_setstatus()
443  * INPUT:	sp	- the set name for the devices to set the status on
444  *		minors	- an array of minor numbers of devices to set status on
445  *		num_units - number of entries in the array
446  *		status	- status value to set all units to
447  * OUTPUT:	ep	- return error pointer
448  * RETURNS:	int	- -1 if error, 0 success
449  * PURPOSE:	sets the status of one or more soft partitions to the
450  *		requested value
451  */
452 int
453 meta_sp_setstatus(
454 	mdsetname_t	*sp,
455 	minor_t		*minors,
456 	int		num_units,
457 	sp_status_t	status,
458 	md_error_t	*ep
459 )
460 {
461 	md_sp_statusset_t	status_params;
462 
463 	assert(minors != NULL);
464 
465 	/* update status of all soft partitions to the status passed in */
466 	(void) memset(&status_params, 0, sizeof (status_params));
467 	status_params.num_units = num_units;
468 	status_params.new_status = status;
469 	status_params.size = num_units * sizeof (minor_t);
470 	status_params.minors = (uintptr_t)minors;
471 	MD_SETDRIVERNAME(&status_params, MD_SP, sp->setno);
472 	if (metaioctl(MD_IOC_SPSTATUS, &status_params, &status_params.mde,
473 	    NULL) != 0) {
474 		(void) mdstealerror(ep, &status_params.mde);
475 		return (-1);
476 	}
477 	return (0);
478 }
479 
480 /*
481  * FUNCTION:	meta_get_sp_names()
482  * INPUT:	sp	- the set name to get soft partitions from
483  *		options	- options from the command line
484  * OUTPUT:	nlpp	- list of all soft partition names
485  *		ep	- return error pointer
486  * RETURNS:	int	- -1 if error, 0 success
487  * PURPOSE:	returns a list of all soft partitions in the metadb
488  *		for all devices in the specified set
489  */
490 int
491 meta_get_sp_names(
492 	mdsetname_t	*sp,
493 	mdnamelist_t	**nlpp,
494 	int		options,
495 	md_error_t	*ep
496 )
497 {
498 	return (meta_get_names(MD_SP, sp, nlpp, options, ep));
499 }
500 
501 /*
502  * FUNCTION:	meta_get_by_component()
503  * INPUT:	sp	- the set name to get soft partitions from
504  *		compnp	- the name of the device containing the soft
505  *			  partitions that will be returned
506  *		force	- 0 - reads cached namelist if available,
507  *			  1 - reloads cached namelist, frees old namelist
508  * OUTPUT:	nlpp	- list of all soft partition names
509  *		ep	- return error pointer
510  * RETURNS:	int	- -1 error, otherwise the number of soft partitions
511  *			  found on the component (0 = none found).
512  * PURPOSE:	returns a list of all soft partitions on a given device
513  *		from the metadb information
514  */
515 static int
516 meta_sp_get_by_component(
517 	mdsetname_t	*sp,
518 	mdname_t	*compnp,
519 	mdnamelist_t	**nlpp,
520 	int		force,
521 	md_error_t	*ep
522 )
523 {
524 	static mdnamelist_t	*cached_list = NULL;	/* cached namelist */
525 	static int		cached_count = 0;	/* cached count */
526 	mdnamelist_t		*spnlp = NULL;		/* all sp names */
527 	mdnamelist_t		*namep;			/* list iterator */
528 	mdnamelist_t		**tailpp = nlpp;	/* namelist tail */
529 	mdnamelist_t		**cachetailpp;		/* cache tail */
530 	md_sp_t			*msp;			/* unit structure */
531 	int			count = 0;		/* count of sp's */
532 	int			err;
533 	mdname_t		*curnp;
534 
535 	if ((cached_list != NULL) && (!force)) {
536 		/* return a copy of the cached list */
537 		for (namep = cached_list; namep != NULL; namep = namep->next)
538 			tailpp = meta_namelist_append_wrapper(tailpp,
539 			    namep->namep);
540 		return (cached_count);
541 	}
542 
543 	/* free the cache and reset values to zeros to prepare for a new list */
544 	metafreenamelist(cached_list);
545 	cached_count = 0;
546 	cached_list = NULL;
547 	cachetailpp = &cached_list;
548 	*nlpp = NULL;
549 
550 	/* get all the softpartitions first of all */
551 	if (meta_get_sp_names(sp, &spnlp, 0, ep) < 0)
552 		return (-1);
553 
554 	/*
555 	 * Now for each sp, see if it resides on the component we
556 	 * are interested in, if so then add it to our list
557 	 */
558 	for (namep = spnlp; namep != NULL; namep = namep->next) {
559 		curnp = namep->namep;
560 
561 		/* get the unit structure */
562 		if ((msp = meta_get_sp_common(sp, curnp, 0, ep)) == NULL)
563 			continue;
564 
565 		/*
566 		 * If the current soft partition is not on the same
567 		 * component, continue the search.  If it is on the same
568 		 * component, add it to our namelist.
569 		 */
570 		err = meta_check_samedrive(compnp, msp->compnamep, ep);
571 		if (err <= 0) {
572 			/* not on the same device, check the next one */
573 			continue;
574 		}
575 
576 		/* it's on the same drive */
577 
578 		/*
579 		 * Check for overlapping partitions if the component is not
580 		 * a metadevice.
581 		 */
582 		if (!metaismeta(msp->compnamep)) {
583 			/*
584 			 * if they're on the same drive, neither
585 			 * should be a metadevice if one isn't
586 			 */
587 			assert(!metaismeta(compnp));
588 
589 			if (meta_check_overlap(msp->compnamep->cname,
590 			    compnp, 0, -1, msp->compnamep, 0, -1, ep) == 0)
591 				continue;
592 
593 			/* in this case it's not an error for them to overlap */
594 			mdclrerror(ep);
595 		}
596 
597 		/* Component is on the same device, add to the used list */
598 		tailpp = meta_namelist_append_wrapper(tailpp, curnp);
599 		cachetailpp = meta_namelist_append_wrapper(cachetailpp,
600 		    curnp);
601 
602 		++count;
603 		++cached_count;
604 	}
605 
606 	assert(count == cached_count);
607 	return (count);
608 
609 out:
610 	metafreenamelist(*nlpp);
611 	*nlpp = NULL;
612 	return (-1);
613 }
614 
615 /*
616  * FUNCTION:    meta_sp_get_default_alignment()
617  * INPUT:       sp      - the pertinent set name
618  *              compnp  - the name of the underlying component
619  * OUTPUT:      ep      - return error pointer
620  * RETURNS:     sp_ext_length_t =0: no default alignment
621  *                              >0: default alignment
622  * PURPOSE:     returns the default alignment for soft partitions to
623  *              be built on top of the specified component or
624  *              metadevice
625  */
626 static sp_ext_length_t
627 meta_sp_get_default_alignment(
628 	mdsetname_t	*sp,
629 	mdname_t	*compnp,
630 	md_error_t	*ep
631 )
632 {
633 	sp_ext_length_t	a = SP_UNALIGNED;
634 	char		*mname;
635 
636 	assert(compnp != NULL);
637 
638 	/*
639 	 * We treat raw devices as opaque, and assume nothing about
640 	 * their alignment requirements.
641 	 */
642 	if (!metaismeta(compnp))
643 		return (SP_UNALIGNED);
644 
645 	/*
646 	 * We already know it's a metadevice from the previous test;
647 	 * metagetmiscname() will tell us which metadevice type we
648 	 * have
649 	 */
650 	mname = metagetmiscname(compnp, ep);
651 	if (mname == NULL)
652 		goto out;
653 
654 	/*
655 	 * For a mirror, we want to deal with the stripe that is the
656 	 * primary side.  If it happens to be asymmetrically
657 	 * configured, there is no simple way to fake a universal
658 	 * alignment.  There's a chance that the least common
659 	 * denominator of the set of interlaces from all stripes of
660 	 * all submirrors would do it, but nobody that really cared
661 	 * that much about this issue would create an asymmetric
662 	 * config to start with.
663 	 *
664 	 * If the component underlying the soft partition is a mirror,
665 	 * then at the exit of this loop, compnp will have been
666 	 * updated to describe the first active submirror.
667 	 */
668 	if (strcmp(mname, MD_MIRROR) == 0) {
669 		md_mirror_t	*mp;
670 		int		smi;
671 		md_submirror_t	*smp;
672 
673 		mp = meta_get_mirror(sp, compnp, ep);
674 		if (mp == NULL)
675 			goto out;
676 
677 		for (smi = 0; smi < NMIRROR; smi++) {
678 
679 			smp = &mp->submirrors[smi];
680 			if (smp->state == SMS_UNUSED)
681 				continue;
682 
683 			compnp = smp->submirnamep;
684 			assert(compnp != NULL);
685 
686 			mname = metagetmiscname(compnp, ep);
687 			if (mname == NULL)
688 				goto out;
689 
690 			break;
691 		}
692 
693 		if (smi == NMIRROR)
694 			goto out;
695 	}
696 
697 	/*
698 	 * Handle stripes and submirrors identically; just return the
699 	 * interlace of the first row.
700 	 */
701 	if (strcmp(mname, MD_STRIPE) == 0) {
702 		md_stripe_t	*stp;
703 
704 		stp = meta_get_stripe(sp, compnp, ep);
705 		if (stp == NULL)
706 			goto out;
707 
708 		a = stp->rows.rows_val[0].interlace;
709 		goto out;
710 	}
711 
712 	/*
713 	 * Raid is even more straightforward; the interlace applies to
714 	 * the entire device.
715 	 */
716 	if (strcmp(mname, MD_RAID) == 0) {
717 		md_raid_t	*rp;
718 
719 		rp = meta_get_raid(sp, compnp, ep);
720 		if (rp == NULL)
721 			goto out;
722 
723 		a = rp->interlace;
724 		goto out;
725 	}
726 
727 	/*
728 	 * If we have arrived here with the alignment still not set,
729 	 * then we expect the error to have been set by one of the
730 	 * routines we called.  If neither is the case, something has
731 	 * really gone wrong above.  (Probably the submirror walk
732 	 * failed to produce a valid submirror, but that would be
733 	 * really bad...)
734 	 */
735 out:
736 	meta_sp_debug("meta_sp_get_default_alignment: miscname %s, "
737 	    "alignment %lld\n", (mname == NULL) ? "NULL" : mname, a);
738 
739 	if (getenv(META_SP_DEBUG) && !mdisok(ep)) {
740 		mde_perror(ep, NULL);
741 	}
742 
743 	assert((a > 0) || (!mdisok(ep)));
744 
745 	return (a);
746 }
747 
748 
749 
750 /*
751  * FUNCTION:	meta_check_insp()
752  * INPUT:	sp	- the set name for the device to check
753  *		np	- the name of the device to check
754  *		slblk	- the starting offset of the device to check
755  *		nblks	- the number of blocks in the device to check
756  * OUTPUT:	ep	- return error pointer
757  * RETURNS:	int	-  0 - device contains soft partitions
758  *			  -1 - device does not contain soft partitions
759  * PURPOSE:	determines whether a device contains any soft partitions
760  */
761 /* ARGSUSED */
762 int
763 meta_check_insp(
764 	mdsetname_t	*sp,
765 	mdname_t	*np,
766 	diskaddr_t	slblk,
767 	diskaddr_t	nblks,
768 	md_error_t	*ep
769 )
770 {
771 	mdnamelist_t	*spnlp = NULL;	/* soft partition name list */
772 	int		count;
773 	int		rval;
774 
775 	/* check set pointer */
776 	assert(sp != NULL);
777 
778 	/*
779 	 * Get a list of the soft partitions that currently reside on
780 	 * the component.  We should ALWAYS force reload the cache,
781 	 * because if we're using the md.tab, we must rebuild
782 	 * the list because it won't contain the previous (if any)
783 	 * soft partition.
784 	 */
785 	/* find all soft partitions on the component */
786 	count = meta_sp_get_by_component(sp, np, &spnlp, 1, ep);
787 
788 	if (count == -1) {
789 		rval = -1;
790 	} else if (count > 0) {
791 		rval = mduseerror(ep, MDE_ALREADY, np->dev,
792 		    spnlp->namep->cname, np->cname);
793 	} else {
794 		rval = 0;
795 	}
796 
797 	metafreenamelist(spnlp);
798 	return (rval);
799 }
800 
801 /*
802  * **************************************************************************
803  *                    Extent List Manipulation Functions                    *
804  * **************************************************************************
805  */
806 
807 /*
808  * FUNCTION:	meta_sp_cmp_by_nameseq()
809  * INPUT:	e1	- first node to compare
810  *		e2	- second node to compare
811  * OUTPUT:	none
812  * RETURNS:	int	- =0 - nodes are equal
813  *			  <0 - e1 should go before e2
814  *			  >0 - e1 should go after e2
815  * PURPOSE:	used for sorted list inserts to build a list sorted by
816  *		name first and sequence number second.
817  */
818 static int
819 meta_sp_cmp_by_nameseq(sp_ext_node_t *e1, sp_ext_node_t *e2)
820 {
821 	int rval;
822 
823 	if (e1->ext_namep == NULL)
824 		return (1);
825 	if (e2->ext_namep == NULL)
826 		return (-1);
827 	if ((rval = strcmp(e1->ext_namep->cname, e2->ext_namep->cname)) != 0)
828 		return (rval);
829 
830 	/* the names are equal, compare sequence numbers */
831 	if (e1->ext_seq > e2->ext_seq)
832 		return (1);
833 	if (e1->ext_seq < e2->ext_seq)
834 		return (-1);
835 	/* sequence numbers are also equal */
836 	return (0);
837 }
838 
839 /*
840  * FUNCTION:	meta_sp_cmp_by_offset()
841  * INPUT:	e1	- first node to compare
842  *		e2	- second node to compare
843  * OUTPUT:	none
844  * RETURNS:	int	- =0 - nodes are equal
845  *			  <0 - e1 should go before e2
846  *			  >0 - e1 should go after e2
847  * PURPOSE:	used for sorted list inserts to build a list sorted by offset
848  */
849 static int
850 meta_sp_cmp_by_offset(sp_ext_node_t *e1, sp_ext_node_t *e2)
851 {
852 	if (e1->ext_offset > e2->ext_offset)
853 		return (1);
854 	if (e1->ext_offset < e2->ext_offset)
855 		return (-1);
856 	/* offsets are equal */
857 	return (0);
858 }
859 
860 /*
861  * FUNCTION:	meta_sp_list_insert()
862  * INPUT:	sp	- the set name for the device the node belongs to
863  *		np	- the name of the device the node belongs to
864  *		head	- the head of the list, must be NULL for empty list
865  *		offset	- the physical offset of this extent in sectors
866  *		length	- the length of this extent in sectors
867  *		type	- the type of the extent being inserted
868  *		seq	- the sequence number of the extent being inserted
869  *		flags	- extent flags (eg. whether it needs to be updated)
870  *		compare	- the compare function to use
871  * OUTPUT:	head	- points to the new head if a node was inserted
872  *			  at the beginning
873  * RETURNS:	void
874  * PURPOSE:	inserts an extent node into a sorted doubly linked list.
875  *		The sort order is determined by the compare function.
876  *		Memory is allocated for the node in this function and it
877  *		is up to the caller to free it, possibly using
878  *		meta_sp_list_free().  If a node is inserted at the
879  *		beginning of the list, the head pointer is updated to
880  *		point to the new first node.
881  */
882 static void
883 meta_sp_list_insert(
884 	mdsetname_t	*sp,
885 	mdname_t	*np,
886 	sp_ext_node_t	**head,
887 	sp_ext_offset_t	offset,
888 	sp_ext_length_t	length,
889 	sp_ext_type_t	type,
890 	uint_t		seq,
891 	uint_t		flags,
892 	ext_cmpfunc_t	compare
893 )
894 {
895 	sp_ext_node_t	*newext;
896 	sp_ext_node_t	*curext;
897 
898 	assert(head != NULL);
899 
900 	/* Don't bother adding zero length nodes */
901 	if (length == 0ULL)
902 		return;
903 
904 	/* allocate and fill in new ext_node */
905 	newext = Zalloc(sizeof (sp_ext_node_t));
906 
907 	newext->ext_offset = offset;
908 	newext->ext_length = length;
909 	newext->ext_flags = flags;
910 	newext->ext_type = type;
911 	newext->ext_seq = seq;
912 	newext->ext_setp = sp;
913 	newext->ext_namep = np;
914 
915 	/* first node in the list */
916 	if (*head == NULL) {
917 		newext->ext_next = newext->ext_prev = NULL;
918 		*head = newext;
919 	} else if ((*compare)(*head, newext) >= 0) {
920 		/* the first node has a bigger offset, so insert before it */
921 		assert((*head)->ext_prev == NULL);
922 
923 		newext->ext_prev = NULL;
924 		newext->ext_next = *head;
925 		(*head)->ext_prev = newext;
926 		*head = newext;
927 	} else {
928 		/*
929 		 * find the next node whose offset is greater than
930 		 * the one we want to insert, or the end of the list.
931 		 */
932 		for (curext = *head;
933 		    (curext->ext_next != NULL) &&
934 		    ((*compare)(curext->ext_next, newext) < 0);
935 		    (curext = curext->ext_next))
936 			;
937 
938 		/* link the new node in after the current node */
939 		newext->ext_next = curext->ext_next;
940 		newext->ext_prev = curext;
941 
942 		if (curext->ext_next != NULL)
943 			curext->ext_next->ext_prev = newext;
944 
945 		curext->ext_next = newext;
946 	}
947 }
948 
949 /*
950  * FUNCTION:	meta_sp_list_free()
951  * INPUT:	head	- the head of the list, must be NULL for empty list
952  * OUTPUT:	head	- points to NULL on return
953  * RETURNS:	void
954  * PURPOSE:	walks a double linked extent list and frees each node
955  */
956 static void
957 meta_sp_list_free(sp_ext_node_t **head)
958 {
959 	sp_ext_node_t	*ext;
960 	sp_ext_node_t	*next;
961 
962 	assert(head != NULL);
963 
964 	ext = *head;
965 	while (ext) {
966 		next = ext->ext_next;
967 		Free(ext);
968 		ext = next;
969 	}
970 	*head = NULL;
971 }
972 
973 /*
974  * FUNCTION:	meta_sp_list_remove()
975  * INPUT:	head	- the head of the list, must be NULL for empty list
976  *		ext	- the extent to remove, must be a member of the list
977  * OUTPUT:	head	- points to the new head of the list
978  * RETURNS:	void
979  * PURPOSE:	unlinks the node specified by ext from the list and
980  *		frees it, possibly moving the head pointer forward if
981  *		the head is the node being removed.
982  */
983 static void
984 meta_sp_list_remove(sp_ext_node_t **head, sp_ext_node_t *ext)
985 {
986 	assert(head != NULL);
987 	assert(*head != NULL);
988 
989 	if (*head == ext)
990 		*head = ext->ext_next;
991 
992 	if (ext->ext_prev != NULL)
993 		ext->ext_prev->ext_next = ext->ext_next;
994 	if (ext->ext_next != NULL)
995 		ext->ext_next->ext_prev = ext->ext_prev;
996 	Free(ext);
997 }
998 
999 /*
1000  * FUNCTION:	meta_sp_list_size()
1001  * INPUT:	head	- the head of the list, must be NULL for empty list
1002  *		exttype	- the type of the extents to sum
1003  *		exclude_wm - subtract space for extent headers from total
1004  * OUTPUT:	none
1005  * RETURNS:	sp_ext_length_t	- the sum of all of the lengths
1006  * PURPOSE:	sums the lengths of all extents in the list matching the
1007  *		specified type.  This could be used for computing the
1008  *		amount of free or used space, for example.
1009  */
1010 static sp_ext_length_t
1011 meta_sp_list_size(sp_ext_node_t *head, sp_ext_type_t exttype, int exclude_wm)
1012 {
1013 	sp_ext_node_t	*ext;
1014 	sp_ext_length_t	size = 0LL;
1015 
1016 	for (ext = head; ext != NULL; ext = ext->ext_next)
1017 		if (ext->ext_type == exttype)
1018 			size += ext->ext_length -
1019 			    ((exclude_wm) ? MD_SP_WMSIZE : 0);
1020 
1021 	return (size);
1022 }
1023 
1024 /*
1025  * FUNCTION:	meta_sp_list_find()
1026  * INPUT:	head	- the head of the list, must be NULL for empty list
1027  *		offset	- the offset contained by the node to find
1028  * OUTPUT:	none
1029  * RETURNS:	sp_ext_node_t *	- the node containing the requested offset
1030  *				  or NULL if no such nodes were found.
1031  * PURPOSE:	finds a node in a list containing the requested offset
1032  *		(inclusive).  If multiple nodes contain this offset then
1033  *		only the first will be returned, though typically these
1034  *		lists are managed with non-overlapping nodes.
1035  *
1036  *		*The list MUST be sorted by offset for this function to work.*
1037  */
1038 static sp_ext_node_t *
1039 meta_sp_list_find(
1040 	sp_ext_node_t	*head,
1041 	sp_ext_offset_t	offset
1042 )
1043 {
1044 	sp_ext_node_t	*ext;
1045 
1046 	for (ext = head; ext != NULL; ext = ext->ext_next) {
1047 		/* check if the offset lies within this extent */
1048 		if ((offset >= ext->ext_offset) &&
1049 		    (offset < ext->ext_offset + ext->ext_length)) {
1050 			/*
1051 			 * the requested extent should always be a
1052 			 * subset of an extent in the list.
1053 			 */
1054 			return (ext);
1055 		}
1056 	}
1057 	return (NULL);
1058 }
1059 
1060 /*
1061  * FUNCTION:	meta_sp_list_freefill()
1062  * INPUT:	head	- the head of the list, must be NULL for empty list
1063  *		size	- the size of the volume this extent list is
1064  *			  representing
1065  * OUTPUT:	head	- the new head of the list
1066  * RETURNS:	void
1067  * PURPOSE:	finds gaps in the extent list and fills them with a free
1068  *		node.  If there is a gap at the beginning the head
1069  *		pointer will be changed to point to the new free node.
1070  *		If there is free space at the end, the last free extent
1071  *		will extend all the way out to the size specified.
1072  *
1073  *		*The list MUST be sorted by offset for this function to work.*
1074  */
1075 static void
1076 meta_sp_list_freefill(
1077 	sp_ext_node_t	**head,
1078 	sp_ext_length_t	size
1079 )
1080 {
1081 	sp_ext_node_t	*ext;
1082 	sp_ext_offset_t	curoff = 0LL;
1083 
1084 	for (ext = *head; ext != NULL; ext = ext->ext_next) {
1085 		if (curoff < ext->ext_offset)
1086 			meta_sp_list_insert(NULL, NULL, head,
1087 			    curoff, ext->ext_offset - curoff,
1088 			    EXTTYP_FREE, 0, 0, meta_sp_cmp_by_offset);
1089 		curoff = ext->ext_offset + ext->ext_length;
1090 	}
1091 
1092 	/* pad inverse list out to the end */
1093 	if (curoff < size)
1094 		meta_sp_list_insert(NULL, NULL, head, curoff, size - curoff,
1095 		    EXTTYP_FREE, 0, 0, meta_sp_cmp_by_offset);
1096 
1097 	if (getenv(META_SP_DEBUG)) {
1098 		meta_sp_debug("meta_sp_list_freefill: Extent list with "
1099 		    "holes freefilled:\n");
1100 		meta_sp_list_dump(*head);
1101 	}
1102 }
1103 
1104 /*
1105  * FUNCTION:	meta_sp_list_dump()
1106  * INPUT:	head	- the head of the list, must be NULL for empty list
1107  * OUTPUT:	none
1108  * RETURNS:	void
1109  * PURPOSE:	dumps the entire extent list to stdout for easy debugging
1110  */
1111 static void
1112 meta_sp_list_dump(sp_ext_node_t *head)
1113 {
1114 	sp_ext_node_t	*ext;
1115 
1116 	meta_sp_debug("meta_sp_list_dump: dumping extent list:\n");
1117 	meta_sp_debug("%5s %10s %5s %7s %10s %10s %5s %10s %10s\n", "Name",
1118 	    "Addr", "Seq#", "Type", "Offset", "Length", "Flags", "Prev",
1119 	    "Next");
1120 	for (ext = head; ext != NULL; ext = ext->ext_next) {
1121 		if (ext->ext_namep != NULL)
1122 			meta_sp_debug("%5s", ext->ext_namep->cname);
1123 		else
1124 			meta_sp_debug("%5s", "NONE");
1125 
1126 		meta_sp_debug("%10p %5u ", (void *) ext, ext->ext_seq);
1127 		switch (ext->ext_type) {
1128 		case EXTTYP_ALLOC:
1129 			meta_sp_debug("%7s ", "ALLOC");
1130 			break;
1131 		case EXTTYP_FREE:
1132 			meta_sp_debug("%7s ", "FREE");
1133 			break;
1134 		case EXTTYP_END:
1135 			meta_sp_debug("%7s ", "END");
1136 			break;
1137 		case EXTTYP_RESERVED:
1138 			meta_sp_debug("%7s ", "RESV");
1139 			break;
1140 		default:
1141 			meta_sp_debug("%7s ", "INVLD");
1142 			break;
1143 		}
1144 
1145 		meta_sp_debug("%10llu %10llu %5u %10p %10p\n",
1146 		    ext->ext_offset, ext->ext_length,
1147 		    ext->ext_flags, (void *) ext->ext_prev,
1148 		    (void *) ext->ext_next);
1149 	}
1150 	meta_sp_debug("\n");
1151 }
1152 
1153 /*
1154  * FUNCTION:	meta_sp_list_overlaps()
1155  * INPUT:	head	- the head of the list, must be NULL for empty list
1156  * OUTPUT:	none
1157  * RETURNS:	int	- 1 if extents overlap, 0 if ok
1158  * PURPOSE:	checks a list for overlaps.  The list MUST be sorted by
1159  *		offset for this function to work properly.
1160  */
1161 static int
1162 meta_sp_list_overlaps(sp_ext_node_t *head)
1163 {
1164 	sp_ext_node_t	*ext;
1165 
1166 	for (ext = head; ext->ext_next != NULL; ext = ext->ext_next) {
1167 		if (ext->ext_offset + ext->ext_length >
1168 		    ext->ext_next->ext_offset)
1169 			return (1);
1170 	}
1171 	return (0);
1172 }
1173 
1174 /*
1175  * **************************************************************************
1176  *                        Extent Allocation Functions                       *
1177  * **************************************************************************
1178  */
1179 
1180 /*
1181  * FUNCTION:	meta_sp_alloc_by_ext()
1182  * INPUT:	sp	- the set name for the device the node belongs to
1183  *		np	- the name of the device the node belongs to
1184  *		head	- the head of the list, must be NULL for empty list
1185  *		free_ext	- the free extent being allocated from
1186  *		alloc_offset	- the offset of the allocation
1187  *		alloc_len	- the length of the allocation
1188  *		seq		- the sequence number of the allocation
1189  * OUTPUT:	head	- the new head pointer
1190  * RETURNS:	void
1191  * PURPOSE:	allocates a portion of the free extent free_ext.  The
1192  *		allocated portion starts at alloc_offset and is
1193  *		alloc_length long.  Both (alloc_offset) and (alloc_offset +
1194  *		alloc_length) must be contained within the free extent.
1195  *
1196  *		The free extent is split into as many as 3 pieces - a
1197  *		free extent containing [ free_offset .. alloc_offset ), an
1198  *		allocated extent containing the range [ alloc_offset ..
1199  *		alloc_end ], and another free extent containing the
1200  *		range ( alloc_end .. free_end ].  If either of the two
1201  *		new free extents would be zero length, they are not created.
1202  *
1203  *		Finally, the original free extent is removed.  All newly
1204  *		created extents have the EXTFLG_UPDATE flag set.
1205  */
1206 static void
1207 meta_sp_alloc_by_ext(
1208 	mdsetname_t	*sp,
1209 	mdname_t	*np,
1210 	sp_ext_node_t	**head,
1211 	sp_ext_node_t	*free_ext,
1212 	sp_ext_offset_t	alloc_offset,
1213 	sp_ext_length_t	alloc_length,
1214 	uint_t		seq
1215 )
1216 {
1217 	sp_ext_offset_t	free_offset = free_ext->ext_offset;
1218 	sp_ext_length_t	free_length = free_ext->ext_length;
1219 
1220 	sp_ext_offset_t	alloc_end = alloc_offset + alloc_length;
1221 	sp_ext_offset_t	free_end  = free_offset  + free_length;
1222 
1223 	/* allocated extent must be a subset of the free extent */
1224 	assert(free_offset <= alloc_offset);
1225 	assert(free_end >= alloc_end);
1226 
1227 	meta_sp_list_remove(head, free_ext);
1228 
1229 	if (free_offset < alloc_offset) {
1230 		meta_sp_list_insert(NULL, NULL, head, free_offset,
1231 		    (alloc_offset - free_offset), EXTTYP_FREE, 0,
1232 		    EXTFLG_UPDATE, meta_sp_cmp_by_offset);
1233 	}
1234 
1235 	if (free_end > alloc_end) {
1236 		meta_sp_list_insert(NULL, NULL, head, alloc_end,
1237 		    (free_end - alloc_end), EXTTYP_FREE, 0, EXTFLG_UPDATE,
1238 		    meta_sp_cmp_by_offset);
1239 	}
1240 
1241 	meta_sp_list_insert(sp, np, head, alloc_offset, alloc_length,
1242 	    EXTTYP_ALLOC, seq, EXTFLG_UPDATE, meta_sp_cmp_by_offset);
1243 
1244 	if (getenv(META_SP_DEBUG)) {
1245 		meta_sp_debug("meta_sp_alloc_by_ext: extent list:\n");
1246 		meta_sp_list_dump(*head);
1247 	}
1248 }
1249 
1250 /*
1251  * FUNCTION:	meta_sp_alloc_by_len()
1252  * INPUT:	sp	- the set name for the device the node belongs to
1253  *		np	- the name of the device the node belongs to
1254  *		head	- the head of the list, must be NULL for empty list
1255  *		*lp	- the requested length to allocate
1256  *		last_off	- the last offset already allocated.
1257  *		alignment	- the desired extent alignmeent
1258  * OUTPUT:	head	- the new head pointer
1259  *		*lp	- the length allocated
1260  * RETURNS:	int	- -1 if error, the number of new extents on success
1261  * PURPOSE:	allocates extents from free space to satisfy the requested
1262  *		length.  If requested length is zero, allocates all
1263  *		remaining free space.  This function provides the meat
1264  *		of the extent allocation algorithm.  Allocation is a
1265  *		three tier process:
1266  *
1267  *		1. If last_off is nonzero and there is free space following
1268  *		   that node, then it is extended to allocate as much of that
1269  *		   free space as possible.  This is useful for metattach.
1270  *		2. If a free extent can be found to satisfy the remaining
1271  *		   requested space, then satisfy the rest of the request
1272  *		   from that extent.
1273  *		3. Start allocating space from any remaining free extents until
1274  *		   the remainder of the request is satisified.
1275  *
1276  *              If alignment is non-zero, then every extent modified
1277  *              or newly allocated will be aligned modulo alignment,
1278  *              with a length that is an integer multiple of
1279  *              alignment.
1280  *
1281  *		The EXTFLG_UPDATE flag is set for all nodes (free and
1282  *		allocated) that require updated watermarks.
1283  *
1284  *		This algorithm may have a negative impact on fragmentation
1285  *		in pathological cases and may be improved if it turns out
1286  *		to be a problem.  This may be exacerbated by particularly
1287  *		large alignments.
1288  *
1289  * NOTE:	It's confusing, so it demands an explanation:
1290  *		- len is used to represent requested data space; it
1291  *		  does not include room for a watermark.  On each full
1292  *		  or partial allocation, len will be decremented by
1293  *		  alloc_len (see next paragraph) until it reaches
1294  *		  zero.
1295  *		- alloc_len is used to represent data space allocated
1296  *		  from a particular extent; it does not include space
1297  *		  for a watermark.  In the rare event that a_length
1298  *		  (see next paragraph) is equal to MD_SP_WMSIZE,
1299  *		  alloc_len will be zero and the resulting MD_SP_WMSIZE
1300  *		  fragment of space will be utterly unusable.
1301  *		- a_length is used to represent all space to be
1302  *		  allocated from a particular extent; it DOES include
1303  *		  space for a watermark.
1304  */
1305 static int
1306 meta_sp_alloc_by_len(
1307 	mdsetname_t	*sp,
1308 	mdname_t	*np,
1309 	sp_ext_node_t	**head,
1310 	sp_ext_length_t	*lp,
1311 	sp_ext_offset_t	last_off,
1312 	sp_ext_offset_t	alignment
1313 )
1314 {
1315 	sp_ext_node_t	*free_ext;
1316 	sp_ext_node_t	*alloc_ext;
1317 	uint_t		last_seq = 0;
1318 	uint_t		numexts = 0;
1319 	sp_ext_length_t	freespace;
1320 	sp_ext_length_t	alloc_len;
1321 	sp_ext_length_t	len;
1322 
1323 	/* We're DOA if we can't read *lp */
1324 	assert(lp != NULL);
1325 	len = *lp;
1326 
1327 	/*
1328 	 * Process the nominal case first: we've been given an actual
1329 	 * size argument, rather than the literal "all"
1330 	 */
1331 
1332 	if (len != 0) {
1333 
1334 		/*
1335 		 * Short circuit the check for free space.  This may
1336 		 * tell us we have enough space when we really don't
1337 		 * because each extent loses space to a watermark, but
1338 		 * it will always tell us there isn't enough space
1339 		 * correctly.  Worst case we do some extra work.
1340 		 */
1341 		freespace = meta_sp_list_size(*head, EXTTYP_FREE,
1342 		    INCLUDE_WM);
1343 
1344 		if (freespace < len)
1345 			return (-1);
1346 
1347 		/*
1348 		 * First see if we can extend the last extent for an
1349 		 * attach.
1350 		 */
1351 		if (last_off != 0LL) {
1352 			int align = 0;
1353 
1354 			alloc_ext =
1355 			    meta_sp_list_find(*head, last_off);
1356 			assert(alloc_ext != NULL);
1357 
1358 			/*
1359 			 * The offset test reflects the
1360 			 * inclusion of the watermark in the extent
1361 			 */
1362 			align = (alignment > 0) &&
1363 			    (((alloc_ext->ext_offset + MD_SP_WMSIZE) %
1364 			    alignment) == 0);
1365 
1366 			/*
1367 			 * If we decided not to align here, we should
1368 			 * also reset "alignment" so we don't bother
1369 			 * later, either.
1370 			 */
1371 			if (!align) {
1372 				alignment = 0;
1373 			}
1374 
1375 			last_seq = alloc_ext->ext_seq;
1376 
1377 			free_ext = meta_sp_list_find(*head,
1378 			    alloc_ext->ext_offset +
1379 			    alloc_ext->ext_length);
1380 
1381 			/*
1382 			 * If a free extent follows our last allocated
1383 			 * extent, then remove the last allocated
1384 			 * extent and increase the size of the free
1385 			 * extent to overlap it, then allocate the
1386 			 * total space from the new free extent.
1387 			 */
1388 			if (free_ext != NULL &&
1389 			    free_ext->ext_type == EXTTYP_FREE) {
1390 				assert(free_ext->ext_offset ==
1391 				    alloc_ext->ext_offset +
1392 				    alloc_ext->ext_length);
1393 
1394 				alloc_len =
1395 				    MIN(len, free_ext->ext_length);
1396 
1397 				if (align && (alloc_len < len)) {
1398 					/* No watermark space needed */
1399 					alloc_len -= alloc_len % alignment;
1400 				}
1401 
1402 				if (alloc_len > 0) {
1403 					free_ext->ext_offset -=
1404 					    alloc_ext->ext_length;
1405 					free_ext->ext_length +=
1406 					    alloc_ext->ext_length;
1407 
1408 					meta_sp_alloc_by_ext(sp, np, head,
1409 					    free_ext, free_ext->ext_offset,
1410 					    alloc_ext->ext_length + alloc_len,
1411 					    last_seq);
1412 
1413 					/*
1414 					 * now remove the original allocated
1415 					 * node.  We may have overlapping
1416 					 * extents for a short time before
1417 					 * this node is removed.
1418 					 */
1419 					meta_sp_list_remove(head, alloc_ext);
1420 					len -= alloc_len;
1421 				}
1422 			}
1423 			last_seq++;
1424 		}
1425 
1426 		if (len == 0LL)
1427 			goto out;
1428 
1429 		/*
1430 		 * Next, see if we can find a single allocation for
1431 		 * the remainder.  This may make fragmentation worse
1432 		 * in some cases, but there's no good way to allocate
1433 		 * that doesn't have a highly fragmented corner case.
1434 		 */
1435 		for (free_ext = *head; free_ext != NULL;
1436 		    free_ext = free_ext->ext_next) {
1437 			sp_ext_offset_t	a_offset;
1438 			sp_ext_offset_t	a_length;
1439 
1440 			if (free_ext->ext_type != EXTTYP_FREE)
1441 				continue;
1442 
1443 			/*
1444 			 * The length test should include space for
1445 			 * the watermark
1446 			 */
1447 
1448 			a_offset = free_ext->ext_offset;
1449 			a_length = free_ext->ext_length;
1450 
1451 			if (alignment > 0) {
1452 
1453 				/*
1454 				 * Shortcut for extents that have been
1455 				 * previously added to pad out the
1456 				 * data space
1457 				 */
1458 				if (a_length < alignment) {
1459 					continue;
1460 				}
1461 
1462 				/*
1463 				 * Round up so the data space begins
1464 				 * on a properly aligned boundary.
1465 				 */
1466 				a_offset += alignment -
1467 				    (a_offset % alignment) - MD_SP_WMSIZE;
1468 
1469 				/*
1470 				 * This is only necessary in case the
1471 				 * watermark size is ever greater than
1472 				 * one.  It'll never happen, of
1473 				 * course; we'll get rid of watermarks
1474 				 * before we make 'em bigger.
1475 				 */
1476 				if (a_offset < free_ext->ext_offset) {
1477 					a_offset += alignment;
1478 				}
1479 
1480 				/*
1481 				 * Adjust the length to account for
1482 				 * the space lost above (if any)
1483 				 */
1484 				a_length -=
1485 				    (a_offset - free_ext->ext_offset);
1486 			}
1487 
1488 			if (a_length >= len + MD_SP_WMSIZE) {
1489 				meta_sp_alloc_by_ext(sp, np, head,
1490 				    free_ext, a_offset,
1491 				    len + MD_SP_WMSIZE, last_seq);
1492 
1493 				len = 0LL;
1494 				numexts++;
1495 				break;
1496 			}
1497 		}
1498 
1499 		if (len == 0LL)
1500 			goto out;
1501 
1502 
1503 		/*
1504 		 * If the request could not be satisfied by extending
1505 		 * the last extent or by a single extent, then put
1506 		 * multiple smaller extents together until the request
1507 		 * is satisfied.
1508 		 */
1509 		for (free_ext = *head; (free_ext != NULL) && (len > 0);
1510 		    free_ext = free_ext->ext_next) {
1511 			sp_ext_offset_t a_offset;
1512 			sp_ext_length_t a_length;
1513 
1514 			if (free_ext->ext_type != EXTTYP_FREE)
1515 				continue;
1516 
1517 			a_offset = free_ext->ext_offset;
1518 			a_length = free_ext->ext_length;
1519 
1520 			if (alignment > 0) {
1521 
1522 				/*
1523 				 * Shortcut for extents that have been
1524 				 * previously added to pad out the
1525 				 * data space
1526 				 */
1527 				if (a_length < alignment) {
1528 					continue;
1529 				}
1530 
1531 				/*
1532 				 * Round up so the data space begins
1533 				 * on a properly aligned boundary.
1534 				 */
1535 				a_offset += alignment -
1536 				    (a_offset % alignment) - MD_SP_WMSIZE;
1537 
1538 				/*
1539 				 * This is only necessary in case the
1540 				 * watermark size is ever greater than
1541 				 * one.  It'll never happen, of
1542 				 * course; we'll get rid of watermarks
1543 				 * before we make 'em bigger.
1544 				 */
1545 				if (a_offset < free_ext->ext_offset) {
1546 					a_offset += alignment;
1547 				}
1548 
1549 				/*
1550 				 * Adjust the length to account for
1551 				 * the space lost above (if any)
1552 				 */
1553 				a_length -=
1554 				    (a_offset - free_ext->ext_offset);
1555 
1556 				/*
1557 				 * Adjust the length to be properly
1558 				 * aligned if it is NOT to be the
1559 				 * last extent in the soft partition.
1560 				 */
1561 				if ((a_length - MD_SP_WMSIZE) < len)
1562 					a_length -=
1563 					    (a_length - MD_SP_WMSIZE)
1564 					    % alignment;
1565 			}
1566 
1567 			alloc_len = MIN(len, a_length - MD_SP_WMSIZE);
1568 			if (alloc_len == 0)
1569 				continue;
1570 
1571 			/*
1572 			 * meta_sp_alloc_by_ext() expects the
1573 			 * allocation length to include the watermark
1574 			 * size, which is why we don't simply pass in
1575 			 * alloc_len here.
1576 			 */
1577 			meta_sp_alloc_by_ext(sp, np, head, free_ext,
1578 			    a_offset, MIN(len + MD_SP_WMSIZE, a_length),
1579 			    last_seq);
1580 
1581 			len -= alloc_len;
1582 			numexts++;
1583 			last_seq++;
1584 		}
1585 
1586 
1587 		/*
1588 		 * If there was not enough space we can throw it all
1589 		 * away since no real work has been done yet.
1590 		 */
1591 		if (len != 0) {
1592 			meta_sp_list_free(head);
1593 			return (-1);
1594 		}
1595 	}
1596 
1597 	/*
1598 	 * Otherwise, the literal "all" was specified: allocate all
1599 	 * available free space.  Don't bother with alignment.
1600 	 */
1601 	else {
1602 		/* First, extend the last extent if this is a grow */
1603 		if (last_off != 0LL) {
1604 			alloc_ext =
1605 			    meta_sp_list_find(*head, last_off);
1606 			assert(alloc_ext != NULL);
1607 
1608 			last_seq = alloc_ext->ext_seq;
1609 
1610 			free_ext = meta_sp_list_find(*head,
1611 			    alloc_ext->ext_offset +
1612 			    alloc_ext->ext_length);
1613 
1614 			/*
1615 			 * If a free extent follows our last allocated
1616 			 * extent, then remove the last allocated
1617 			 * extent and increase the size of the free
1618 			 * extent to overlap it, then allocate the
1619 			 * total space from the new free extent.
1620 			 */
1621 			if (free_ext != NULL &&
1622 			    free_ext->ext_type == EXTTYP_FREE) {
1623 				assert(free_ext->ext_offset ==
1624 				    alloc_ext->ext_offset +
1625 				    alloc_ext->ext_length);
1626 
1627 				len = alloc_len =
1628 				    free_ext->ext_length;
1629 
1630 				free_ext->ext_offset -=
1631 				    alloc_ext->ext_length;
1632 				free_ext->ext_length +=
1633 				    alloc_ext->ext_length;
1634 
1635 				meta_sp_alloc_by_ext(sp, np, head,
1636 				    free_ext, free_ext->ext_offset,
1637 				    alloc_ext->ext_length + alloc_len,
1638 				    last_seq);
1639 
1640 				/*
1641 				 * now remove the original allocated
1642 				 * node.  We may have overlapping
1643 				 * extents for a short time before
1644 				 * this node is removed.
1645 				 */
1646 				meta_sp_list_remove(head, alloc_ext);
1647 			}
1648 
1649 			last_seq++;
1650 		}
1651 
1652 		/* Next, grab all remaining free space */
1653 		for (free_ext = *head; free_ext != NULL;
1654 		    free_ext = free_ext->ext_next) {
1655 
1656 			if (free_ext->ext_type == EXTTYP_FREE) {
1657 				alloc_len =
1658 				    free_ext->ext_length - MD_SP_WMSIZE;
1659 				if (alloc_len == 0)
1660 					continue;
1661 
1662 				/*
1663 				 * meta_sp_alloc_by_ext() expects the
1664 				 * allocation length to include the
1665 				 * watermark size, which is why we
1666 				 * don't simply pass in alloc_len
1667 				 * here.
1668 				 */
1669 				meta_sp_alloc_by_ext(sp, np, head,
1670 				    free_ext, free_ext->ext_offset,
1671 				    free_ext->ext_length,
1672 				    last_seq);
1673 
1674 				len += alloc_len;
1675 				numexts++;
1676 				last_seq++;
1677 			}
1678 		}
1679 	}
1680 
1681 out:
1682 	if (getenv(META_SP_DEBUG)) {
1683 		meta_sp_debug("meta_sp_alloc_by_len: Extent list after "
1684 		    "allocation:\n");
1685 		meta_sp_list_dump(*head);
1686 	}
1687 
1688 	if (*lp == 0) {
1689 		*lp = len;
1690 
1691 		/*
1692 		 * Make sure the callers hit a no space error if we
1693 		 * didn't actually find anything.
1694 		 */
1695 		if (len == 0) {
1696 			return (-1);
1697 		}
1698 	}
1699 
1700 	return (numexts);
1701 }
1702 
1703 /*
1704  * FUNCTION:	meta_sp_alloc_by_list()
1705  * INPUT:	sp	- the set name for the device the node belongs to
1706  *		np	- the name of the device the node belongs to
1707  *		head	- the head of the list, must be NULL for empty list
1708  *		oblist	- an extent list containing requested nodes to allocate
1709  * OUTPUT:	head	- the new head pointer
1710  * RETURNS:	int	- -1 if error, the number of new extents on success
1711  * PURPOSE:	allocates extents from free space to satisfy the requested
1712  *		extent list.  This is primarily used for the -o/-b options
1713  *		where the user may specifically request extents to allocate.
1714  *		Each extent in the oblist must be a subset (inclusive) of a
1715  *		free extent and may not overlap each other.  This
1716  *		function sets the EXTFLG_UPDATE flag for each node that
1717  *		requires a watermark update after allocating.
1718  */
1719 static int
1720 meta_sp_alloc_by_list(
1721 	mdsetname_t	*sp,
1722 	mdname_t	*np,
1723 	sp_ext_node_t	**head,
1724 	sp_ext_node_t	*oblist
1725 )
1726 {
1727 	sp_ext_node_t	*ext;
1728 	sp_ext_node_t	*free_ext;
1729 	uint_t		numexts = 0;
1730 
1731 	for (ext = oblist; ext != NULL; ext = ext->ext_next) {
1732 
1733 		free_ext = meta_sp_list_find(*head,
1734 		    ext->ext_offset - MD_SP_WMSIZE);
1735 
1736 		/* Make sure the allocation is within the free extent */
1737 		if ((free_ext == NULL) ||
1738 		    (ext->ext_offset + ext->ext_length >
1739 		    free_ext->ext_offset + free_ext->ext_length) ||
1740 		    (free_ext->ext_type != EXTTYP_FREE))
1741 			return (-1);
1742 
1743 		meta_sp_alloc_by_ext(sp, np, head, free_ext,
1744 		    ext->ext_offset - MD_SP_WMSIZE,
1745 		    ext->ext_length + MD_SP_WMSIZE, ext->ext_seq);
1746 
1747 		numexts++;
1748 	}
1749 
1750 	assert(meta_sp_list_overlaps(*head) == 0);
1751 
1752 	if (getenv(META_SP_DEBUG)) {
1753 		meta_sp_debug("meta_sp_alloc_by_list: Extent list after "
1754 		    "allocation:\n");
1755 		meta_sp_list_dump(*head);
1756 	}
1757 
1758 	return (numexts);
1759 }
1760 
1761 /*
1762  * **************************************************************************
1763  *                     Extent List Population Functions                     *
1764  * **************************************************************************
1765  */
1766 
1767 /*
1768  * FUNCTION:	meta_sp_extlist_from_namelist()
1769  * INPUT:	sp	- the set name for the device the node belongs to
1770  *		spnplp	- the namelist of soft partitions to build a list from
1771  * OUTPUT:	extlist	- the extent list built from the SPs in the namelist
1772  *		ep	- return error pointer
1773  * RETURNS:	int	- -1 if error, 0 on success
1774  * PURPOSE:	builds an extent list representing the soft partitions
1775  *		specified in the namelist.  Each extent in each soft
1776  *		partition is added to the list with the type EXTTYP_ALLOC.
1777  *		The EXTFLG_UPDATE flag is not set on any nodes.  Each
1778  *		extent in the list includes the space occupied by the
1779  *		watermark, which is not included in the unit structures.
1780  */
1781 static int
1782 meta_sp_extlist_from_namelist(
1783 	mdsetname_t	*sp,
1784 	mdnamelist_t	*spnlp,
1785 	sp_ext_node_t	**extlist,
1786 	md_error_t	*ep
1787 )
1788 {
1789 	int		extn;
1790 	md_sp_t		*msp;		/* unit structure of the sp's */
1791 	mdnamelist_t	*namep;
1792 
1793 	assert(sp != NULL);
1794 
1795 	/*
1796 	 * Now go through the soft partitions and add a node to the used
1797 	 * list for each allocated extent.
1798 	 */
1799 	for (namep = spnlp; namep != NULL; namep = namep->next) {
1800 		mdname_t	*curnp = namep->namep;
1801 
1802 		/* get the unit structure */
1803 		if ((msp = meta_get_sp_common(sp, curnp, 0, ep)) == NULL)
1804 			return (-1);
1805 
1806 		for (extn = 0; (extn < msp->ext.ext_len); extn++) {
1807 			md_sp_ext_t	*extp = &msp->ext.ext_val[extn];
1808 
1809 			/*
1810 			 * subtract from offset and add to the length
1811 			 * to account for the watermark, which is not
1812 			 * contained in the extents in the unit structure.
1813 			 */
1814 			meta_sp_list_insert(sp, curnp, extlist,
1815 			    extp->poff - MD_SP_WMSIZE, extp->len + MD_SP_WMSIZE,
1816 			    EXTTYP_ALLOC, extn, 0, meta_sp_cmp_by_offset);
1817 		}
1818 	}
1819 	return (0);
1820 }
1821 
1822 /*
1823  * FUNCTION:	meta_sp_extlist_from_wm()
1824  * INPUT:	sp	- the set name for the device the node belongs to
1825  *		compnp	- the name of the device to scan watermarks on
1826  * OUTPUT:	extlist	- the extent list built from the SPs in the namelist
1827  *		ep	- return error pointer
1828  * RETURNS:	int	- -1 if error, 0 on success
1829  * PURPOSE:	builds an extent list representing the soft partitions
1830  *		specified in the namelist.  Each extent in each soft
1831  *		partition is added to the list with the type EXTTYP_ALLOC.
1832  *		The EXTFLG_UPDATE flag is not set on any nodes.  Each
1833  *		extent in the list includes the space occupied by the
1834  *		watermark, which is not included in the unit structures.
1835  */
1836 static int
1837 meta_sp_extlist_from_wm(
1838 	mdsetname_t	*sp,
1839 	mdname_t	*compnp,
1840 	sp_ext_node_t	**extlist,
1841 	ext_cmpfunc_t	compare,
1842 	md_error_t	*ep
1843 )
1844 {
1845 	mp_watermark_t	wm;
1846 	mdname_t	*np = NULL;
1847 	mdsetname_t	*spsetp = NULL;
1848 	sp_ext_offset_t	cur_off;
1849 	md_set_desc	*sd;
1850 	int		init = 0;
1851 	mdkey_t		key;
1852 	minor_t		mnum;
1853 
1854 	if (!metaislocalset(sp)) {
1855 		if ((sd = metaget_setdesc(sp, ep)) == NULL)
1856 			return (-1);
1857 	}
1858 
1859 	if ((cur_off = meta_sp_get_start(sp, compnp, ep)) == MD_DISKADDR_ERROR)
1860 		return (-1);
1861 
1862 	for (;;) {
1863 		if (meta_sp_read_wm(sp, compnp, &wm, cur_off, ep) != 0) {
1864 			return (-1);
1865 		}
1866 
1867 		/* get the set and name pointers */
1868 		if (strcmp(wm.wm_setname, MD_SP_LOCALSETNAME) != 0) {
1869 			if ((spsetp = metasetname(wm.wm_setname, ep)) == NULL) {
1870 				return (-1);
1871 			}
1872 		}
1873 
1874 		/*
1875 		 * For the MN set, meta_init_make_device needs to
1876 		 * be run on all the nodes so the entries for the
1877 		 * softpart device name and its comp can be created
1878 		 * in the same order in the replica namespace.  If
1879 		 * we have it run on mdmn_do_iocset then the mddbs
1880 		 * will be out of sync between master node and slave
1881 		 * nodes.
1882 		 */
1883 		if (strcmp(wm.wm_mdname, MD_SP_FREEWMNAME) != 0) {
1884 
1885 			if (!metaislocalset(sp) && MD_MNSET_DESC(sd)) {
1886 				md_mn_msg_addmdname_t	*send_params;
1887 				int			result;
1888 				md_mn_result_t		*resp = NULL;
1889 				int			message_size;
1890 
1891 				message_size =  sizeof (*send_params) +
1892 				    strlen(wm.wm_mdname) + 1;
1893 				send_params = Zalloc(message_size);
1894 				send_params->addmdname_setno = sp->setno;
1895 				(void) strcpy(&send_params->addmdname_name[0],
1896 				    wm.wm_mdname);
1897 				result = mdmn_send_message(sp->setno,
1898 				    MD_MN_MSG_ADDMDNAME,
1899 				    MD_MSGF_PANIC_WHEN_INCONSISTENT, 0,
1900 				    (char *)send_params, message_size, &resp,
1901 				    ep);
1902 				Free(send_params);
1903 				if (resp != NULL) {
1904 					if (resp->mmr_exitval != 0) {
1905 						free_result(resp);
1906 						return (-1);
1907 					}
1908 					free_result(resp);
1909 				}
1910 				if (result != 0)
1911 					return (-1);
1912 			} else {
1913 
1914 				if (!is_existing_meta_hsp(sp, wm.wm_mdname)) {
1915 					if ((key = meta_init_make_device(&sp,
1916 					    wm.wm_mdname, ep)) <= 0) {
1917 						return (-1);
1918 					}
1919 					init = 1;
1920 				}
1921 			}
1922 
1923 			np = metaname(&spsetp, wm.wm_mdname, META_DEVICE, ep);
1924 			if (np == NULL) {
1925 				if (init) {
1926 					if (meta_getnmentbykey(sp->setno,
1927 					    MD_SIDEWILD, key, NULL, &mnum,
1928 					    NULL, ep) != NULL) {
1929 						(void) metaioctl(MD_IOCREM_DEV,
1930 						    &mnum, ep, NULL);
1931 					}
1932 					(void) del_self_name(sp, key, ep);
1933 				}
1934 				return (-1);
1935 			}
1936 		}
1937 
1938 		/* insert watermark into extent list */
1939 		meta_sp_list_insert(spsetp, np, extlist, cur_off,
1940 		    wm.wm_length + MD_SP_WMSIZE, wm.wm_type, wm.wm_seq,
1941 		    EXTFLG_UPDATE, compare);
1942 
1943 		/* if we see the end watermark, we're done */
1944 		if (wm.wm_type == EXTTYP_END)
1945 			break;
1946 
1947 		cur_off += wm.wm_length + 1;
1948 
1949 		/* clear out set and name pointers for next iteration */
1950 		np = NULL;
1951 		spsetp = NULL;
1952 	}
1953 
1954 	return (0);
1955 }
1956 
1957 /*
1958  * **************************************************************************
1959  *                        Print (metastat) Functions                        *
1960  * **************************************************************************
1961  */
1962 
1963 /*
1964  * FUNCTION:	meta_sp_short_print()
1965  * INPUT:	msp	- the unit structure to display
1966  *		fp	- the file pointer to send output to
1967  *		options	- print options from the command line processor
1968  * OUTPUT:	ep	- return error pointer
1969  * RETURNS:	int	- -1 if error, 0 on success
1970  * PURPOSE:	display a short report of the soft partition in md.tab
1971  *		form, primarily used for metastat -p.
1972  */
1973 static int
1974 meta_sp_short_print(
1975 	md_sp_t		*msp,
1976 	char		*fname,
1977 	FILE		*fp,
1978 	mdprtopts_t	options,
1979 	md_error_t	*ep
1980 )
1981 {
1982 	int	extn;
1983 
1984 	if (options & PRINT_LARGEDEVICES) {
1985 		if ((msp->common.revision & MD_64BIT_META_DEV) == 0)
1986 			return (0);
1987 	}
1988 
1989 	if (options & PRINT_FN) {
1990 		if ((msp->common.revision & MD_FN_META_DEV) == 0)
1991 			return (0);
1992 	}
1993 
1994 	/* print name and -p */
1995 	if (fprintf(fp, "%s -p", msp->common.namep->cname) == EOF)
1996 		return (mdsyserror(ep, errno, fname));
1997 
1998 	/* print the component */
1999 	/*
2000 	 * Always print the full path name
2001 	 */
2002 	if (fprintf(fp, " %s", msp->compnamep->rname) == EOF)
2003 		return (mdsyserror(ep, errno, fname));
2004 
2005 	/* print out each extent */
2006 	for (extn = 0; (extn < msp->ext.ext_len); extn++) {
2007 		md_sp_ext_t	*extp = &msp->ext.ext_val[extn];
2008 		if (fprintf(fp, " -o %llu -b %llu ", extp->poff,
2009 		    extp->len) == EOF)
2010 			return (mdsyserror(ep, errno, fname));
2011 	}
2012 
2013 	if (fprintf(fp, "\n") == EOF)
2014 		return (mdsyserror(ep, errno, fname));
2015 
2016 	/* success */
2017 	return (0);
2018 }
2019 
2020 /*
2021  * FUNCTION:	meta_sp_status_to_name()
2022  * INPUT:	xsp_status	- the status value to convert to a string
2023  *		tstate		- transient errored device state. If set the
2024  *				  device is Unavailable
2025  * OUTPUT:	none
2026  * RETURNS:	char *	- a pointer to the string representing the status value
2027  * PURPOSE:	return an internationalized string representing the
2028  *		status value for a soft partition.  The strings are
2029  *		strdup'd and must be freed by the caller.
2030  */
2031 static char *
2032 meta_sp_status_to_name(
2033 	xsp_status_t	xsp_status,
2034 	uint_t		tstate
2035 )
2036 {
2037 	char *rval = NULL;
2038 
2039 	/*
2040 	 * Check to see if we have MD_INACCESSIBLE set. This is the only valid
2041 	 * value for an 'Unavailable' return. tstate can be set because of
2042 	 * other multi-node reasons (e.g. ABR being set)
2043 	 */
2044 	if (tstate & MD_INACCESSIBLE) {
2045 		return (Strdup(dgettext(TEXT_DOMAIN, "Unavailable")));
2046 	}
2047 
2048 	switch (xsp_status) {
2049 	case MD_SP_CREATEPEND:
2050 		rval = Strdup(dgettext(TEXT_DOMAIN, "Creating"));
2051 		break;
2052 	case MD_SP_GROWPEND:
2053 		rval = Strdup(dgettext(TEXT_DOMAIN, "Growing"));
2054 		break;
2055 	case MD_SP_DELPEND:
2056 		rval = Strdup(dgettext(TEXT_DOMAIN, "Deleting"));
2057 		break;
2058 	case MD_SP_OK:
2059 		rval = Strdup(dgettext(TEXT_DOMAIN, "Okay"));
2060 		break;
2061 	case MD_SP_ERR:
2062 		rval = Strdup(dgettext(TEXT_DOMAIN, "Errored"));
2063 		break;
2064 	case MD_SP_RECOVER:
2065 		rval = Strdup(dgettext(TEXT_DOMAIN, "Recovering"));
2066 		break;
2067 	}
2068 
2069 	if (rval == NULL)
2070 		rval = Strdup(dgettext(TEXT_DOMAIN, "Invalid"));
2071 
2072 	return (rval);
2073 }
2074 
2075 /*
2076  * FUNCTION:	meta_sp_report()
2077  * INPUT:	sp	- the set name for the unit being displayed
2078  *		msp	- the unit structure to display
2079  *		nlpp	- pass back the large devs
2080  *		fp	- the file pointer to send output to
2081  *		options	- print options from the command line processor
2082  * OUTPUT:	ep	- return error pointer
2083  * RETURNS:	int	- -1 if error, 0 on success
2084  * PURPOSE:	print a full report of the device specified
2085  */
2086 static int
2087 meta_sp_report(
2088 	mdsetname_t	*sp,
2089 	md_sp_t		*msp,
2090 	mdnamelist_t	**nlpp,
2091 	char		*fname,
2092 	FILE		*fp,
2093 	mdprtopts_t	options,
2094 	md_error_t	*ep
2095 )
2096 {
2097 	uint_t		extn;
2098 	char		*status;
2099 	char		*devid = "";
2100 	mdname_t	*didnp = NULL;
2101 	ddi_devid_t	dtp;
2102 	int		len;
2103 	uint_t		tstate = 0;
2104 
2105 	if (options & PRINT_LARGEDEVICES) {
2106 		if ((msp->common.revision & MD_64BIT_META_DEV) == 0) {
2107 			return (0);
2108 		} else {
2109 			if (meta_getdevs(sp, msp->common.namep, nlpp, ep) != 0)
2110 				return (-1);
2111 		}
2112 	}
2113 
2114 	if (options & PRINT_FN) {
2115 		if ((msp->common.revision & MD_FN_META_DEV) == 0) {
2116 			return (0);
2117 		} else {
2118 			if (meta_getdevs(sp, msp->common.namep, nlpp, ep) != 0)
2119 				return (-1);
2120 		}
2121 	}
2122 
2123 	if (options & PRINT_HEADER) {
2124 		if (fprintf(fp, dgettext(TEXT_DOMAIN, "%s: Soft Partition\n"),
2125 		    msp->common.namep->cname) == EOF)
2126 			return (mdsyserror(ep, errno, fname));
2127 	}
2128 
2129 	if (fprintf(fp, dgettext(TEXT_DOMAIN, "    Device: %s\n"),
2130 	    msp->compnamep->cname) == EOF)
2131 		return (mdsyserror(ep, errno, fname));
2132 
2133 	/* Determine if device is available before displaying status */
2134 	if (metaismeta(msp->common.namep)) {
2135 		if (meta_get_tstate(msp->common.namep->dev, &tstate, ep) != 0)
2136 			return (-1);
2137 	}
2138 	status = meta_sp_status_to_name(msp->status, tstate & MD_DEV_ERRORED);
2139 
2140 	/* print out "State" to be consistent with other metadevices */
2141 	if (tstate & MD_ABR_CAP) {
2142 		if (fprintf(fp, dgettext(TEXT_DOMAIN,
2143 		    "    State: %s - Application Based Recovery (ABR)\n"),
2144 		    status) == EOF) {
2145 			Free(status);
2146 			return (mdsyserror(ep, errno, fname));
2147 		}
2148 	} else {
2149 		if (fprintf(fp, dgettext(TEXT_DOMAIN,
2150 		    "    State: %s\n"), status) == EOF) {
2151 			Free(status);
2152 			return (mdsyserror(ep, errno, fname));
2153 		}
2154 	}
2155 	free(status);
2156 
2157 	if (fprintf(fp, dgettext(TEXT_DOMAIN, "    Size: %llu blocks (%s)\n"),
2158 	    msp->common.size,
2159 	    meta_number_to_string(msp->common.size, DEV_BSIZE)) == EOF)
2160 		return (mdsyserror(ep, errno, fname));
2161 
2162 	/* print component details */
2163 	if (! metaismeta(msp->compnamep)) {
2164 		diskaddr_t	start_blk;
2165 		int		has_mddb;
2166 		char		*has_mddb_str;
2167 
2168 		/* print header */
2169 		/*
2170 		 * Building a format string on the fly that will
2171 		 * be used in (f)printf. This allows the length
2172 		 * of the ctd to vary from small to large without
2173 		 * looking horrible.
2174 		 */
2175 		len = strlen(msp->compnamep->cname);
2176 		len = max(len, strlen(dgettext(TEXT_DOMAIN, "Device")));
2177 		len += 2;
2178 		if (fprintf(fp,
2179 		    "\t%-*.*s %-12.12s %-5.5s %s\n",
2180 		    len, len,
2181 		    dgettext(TEXT_DOMAIN, "Device"),
2182 		    dgettext(TEXT_DOMAIN, "Start Block"),
2183 		    dgettext(TEXT_DOMAIN, "Dbase"),
2184 		    dgettext(TEXT_DOMAIN, "Reloc")) == EOF) {
2185 			return (mdsyserror(ep, errno, fname));
2186 		}
2187 
2188 
2189 		/* get info */
2190 		if ((start_blk = meta_sp_get_start(sp, msp->compnamep, ep)) ==
2191 		    MD_DISKADDR_ERROR)
2192 			return (-1);
2193 
2194 		if ((has_mddb = metahasmddb(sp, msp->compnamep, ep)) < 0)
2195 			return (-1);
2196 
2197 		if (has_mddb)
2198 			has_mddb_str = dgettext(TEXT_DOMAIN, "Yes");
2199 		else
2200 			has_mddb_str = dgettext(TEXT_DOMAIN, "No");
2201 
2202 		/* populate the key in the name_p structure */
2203 		didnp = metadevname(&sp, msp->compnamep->dev, ep);
2204 		if (didnp == NULL) {
2205 			return (-1);
2206 		}
2207 
2208 		/* determine if devid does NOT exist */
2209 		if (options & PRINT_DEVID) {
2210 			if ((dtp = meta_getdidbykey(sp->setno,
2211 			    getmyside(sp, ep), didnp->key, ep)) == NULL)
2212 				devid = dgettext(TEXT_DOMAIN, "No ");
2213 			else {
2214 				devid = dgettext(TEXT_DOMAIN, "Yes");
2215 				free(dtp);
2216 			}
2217 		}
2218 
2219 		/* print info */
2220 		/*
2221 		 * This allows the length
2222 		 * of the ctd to vary from small to large without
2223 		 * looking horrible.
2224 		 */
2225 		if (fprintf(fp, "\t%-*s %8lld     %-5.5s %s\n",
2226 		    len, msp->compnamep->cname,
2227 		    start_blk, has_mddb_str, devid) == EOF) {
2228 			return (mdsyserror(ep, errno, fname));
2229 		}
2230 		(void) fprintf(fp, "\n");
2231 	}
2232 
2233 
2234 	/* print the headers */
2235 	if (fprintf(fp, "\t%6.6s %24.24s %24.24s\n",
2236 	    dgettext(TEXT_DOMAIN, "Extent"),
2237 	    dgettext(TEXT_DOMAIN, "Start Block"),
2238 	    dgettext(TEXT_DOMAIN, "Block count")) == EOF)
2239 		return (mdsyserror(ep, errno, fname));
2240 
2241 	/* print out each extent */
2242 	for (extn = 0; (extn < msp->ext.ext_len); extn++) {
2243 		md_sp_ext_t	*extp = &msp->ext.ext_val[extn];
2244 
2245 		/* If PRINT_TIMES option is ever supported, add output here */
2246 		if (fprintf(fp, "\t%6u %24llu %24llu\n",
2247 		    extn, extp->poff, extp->len) == EOF)
2248 			return (mdsyserror(ep, errno, fname));
2249 	}
2250 
2251 	/* separate records with a newline */
2252 	(void) fprintf(fp, "\n");
2253 	return (0);
2254 }
2255 
2256 /*
2257  * FUNCTION:	meta_sp_print()
2258  * INPUT:	sp	- the set name for the unit being displayed
2259  *		np	- the name of the device to print
2260  *		fname	- ??? not used
2261  *		fp	- the file pointer to send output to
2262  *		options	- print options from the command line processor
2263  * OUTPUT:	ep	- return error pointer
2264  * RETURNS:	int	- -1 if error, 0 on success
2265  * PURPOSE:	print a full report of the device specified by metastat.
2266  *		This is the main entry point for printing.
2267  */
2268 int
2269 meta_sp_print(
2270 	mdsetname_t	*sp,
2271 	mdname_t	*np,
2272 	mdnamelist_t	**nlpp,
2273 	char		*fname,
2274 	FILE		*fp,
2275 	mdprtopts_t	options,
2276 	md_error_t	*ep
2277 )
2278 {
2279 	md_sp_t		*msp;
2280 	md_unit_t	*mdp;
2281 	int		rval = 0;
2282 	set_t		setno;
2283 	minor_t		unit;
2284 
2285 	/* should always have the same set */
2286 	assert(sp != NULL);
2287 
2288 	/* print all the soft partitions */
2289 	if (np == NULL) {
2290 		mdnamelist_t	*nlp = NULL;
2291 		mdnamelist_t	*p;
2292 		int		cnt;
2293 
2294 		if ((cnt = meta_get_sp_names(sp, &nlp, options, ep)) < 0)
2295 			return (-1);
2296 		else if (cnt == 0)
2297 			return (0);
2298 
2299 		/* recusively print them out */
2300 		for (p = nlp; (p != NULL); p = p->next) {
2301 			mdname_t	*curnp = p->namep;
2302 
2303 			/*
2304 			 * one problem with the rval of -1 here is that
2305 			 * the error gets "lost" when the next device is
2306 			 * printed, but we want to print them all anyway.
2307 			 */
2308 			rval = meta_sp_print(sp, curnp, nlpp, fname, fp,
2309 			    options, ep);
2310 		}
2311 
2312 		/* clean up, return success */
2313 		metafreenamelist(nlp);
2314 		return (rval);
2315 	}
2316 
2317 	/* get the unit structure */
2318 	if ((msp = meta_get_sp_common(sp, np,
2319 	    ((options & PRINT_FAST) ? 1 : 0), ep)) == NULL)
2320 		return (-1);
2321 
2322 	/* check for parented */
2323 	if ((! (options & PRINT_SUBDEVS)) &&
2324 	    (MD_HAS_PARENT(msp->common.parent))) {
2325 		return (0);
2326 	}
2327 
2328 	/* print appropriate detail */
2329 	if (options & PRINT_SHORT) {
2330 		if (meta_sp_short_print(msp, fname, fp, options, ep) != 0)
2331 			return (-1);
2332 	} else {
2333 		if (meta_sp_report(sp, msp, nlpp, fname, fp, options, ep) != 0)
2334 			return (-1);
2335 	}
2336 
2337 	/*
2338 	 * Print underlying metadevices if they are parented to us and
2339 	 * if the info for the underlying metadevice has not been printed.
2340 	 */
2341 	if (metaismeta(msp->compnamep)) {
2342 		/* get the unit structure for the subdevice */
2343 		if ((mdp = meta_get_mdunit(sp, msp->compnamep, ep)) == NULL)
2344 			return (-1);
2345 
2346 		setno = MD_MIN2SET(MD_SID(mdp));
2347 		unit = MD_MIN2UNIT(MD_SID(mdp));
2348 
2349 		/* If info not already printed, recurse */
2350 		if (sp_parent_printed[setno] == NULL ||
2351 		    !BT_TEST(sp_parent_printed[setno], unit)) {
2352 			if (meta_print_name(sp, msp->compnamep, nlpp, fname, fp,
2353 			    (options | PRINT_HEADER | PRINT_SUBDEVS),
2354 			    NULL, ep) != 0) {
2355 				return (-1);
2356 			}
2357 			if (sp_parent_printed[setno] == NULL)
2358 				sp_parent_printed[setno] =
2359 				    Zalloc(BT_BITOUL(MD_MAXUNITS));
2360 			BT_SET(sp_parent_printed[setno], unit);
2361 		}
2362 	}
2363 	return (0);
2364 }
2365 
2366 /*
2367  * **************************************************************************
2368  *                     Watermark Manipulation Functions                     *
2369  * **************************************************************************
2370  */
2371 
2372 /*
2373  * FUNCTION:	meta_sp_get_start()
2374  * INPUT:	sp	- the operating set
2375  *		np 	- device upon which the sp is being built
2376  * OUTPUT:	ep	- return error pointer
2377  * RETURNS:	daddr_t	- -1 if error, otherwise the start block
2378  * PURPOSE:	Encapsulate the determination of the start block of the
2379  *		device upon which the sp is built or being built.
2380  */
2381 static diskaddr_t
2382 meta_sp_get_start(
2383 	mdsetname_t	*sp,
2384 	mdname_t	*np,
2385 	md_error_t	*ep
2386 )
2387 {
2388 	daddr_t		start_block;
2389 
2390 	if ((start_block = metagetstart(sp, np, ep)) != MD_DISKADDR_ERROR)
2391 		start_block += MD_SP_START;
2392 
2393 	return (start_block);
2394 }
2395 
2396 /*
2397  * FUNCTION:	meta_sp_update_wm_common()
2398  * INPUT:	sp	- the operating set
2399  *		msp	- a pointer to the XDR unit structure
2400  *		extlist	- the extent list specifying watermarks to update
2401  *		iocval	- either MD_IOC_SPUPDATEWM or MD_MN_IOC_SPUPDATEWM
2402  * OUTPUT:	ep	- return error pointer
2403  * RETURNS:	int	- -1 if error, 0 on success
2404  * PURPOSE:	steps backwards through the extent list updating
2405  *		watermarks for all extents with the EXTFLG_UPDATE flag
2406  *		set.  Writing the watermarks guarantees consistency when
2407  *		extents must be broken into pieces since the original
2408  *		watermark will be the last to be updated, and will be
2409  *		changed to point to a new watermark that is already
2410  *		known to be consistent.  If one of the writes fails, the
2411  *		original watermark stays intact and none of the changes
2412  *		are realized.
2413  */
2414 static int
2415 meta_sp_update_wm_common(
2416 	mdsetname_t	*sp,
2417 	md_sp_t		*msp,
2418 	sp_ext_node_t	*extlist,
2419 	int		iocval,
2420 	md_error_t	*ep
2421 )
2422 {
2423 	sp_ext_node_t	*ext;
2424 	sp_ext_node_t	*tail;
2425 	mp_watermark_t	*wmp, *watermarks;
2426 	xsp_offset_t	*osp, *offsets;
2427 	int		update_count = 0;
2428 	int		rval = 0;
2429 	md_unit_t	*mdp;
2430 	md_sp_update_wm_t	update_params;
2431 
2432 	if (getenv(META_SP_DEBUG)) {
2433 		meta_sp_debug("meta_sp_update_wm: Updating watermarks:\n");
2434 		meta_sp_list_dump(extlist);
2435 	}
2436 
2437 	/*
2438 	 * find the last node so we can write the watermarks backwards
2439 	 * and count watermarks to update so we can allocate space
2440 	 */
2441 	for (ext = extlist; ext != NULL; ext = ext->ext_next) {
2442 		if ((ext->ext_flags & EXTFLG_UPDATE) != 0) {
2443 			update_count++;
2444 		}
2445 
2446 		if (ext->ext_next == NULL) {
2447 			tail = ext;
2448 		}
2449 	}
2450 	ext = tail;
2451 
2452 	wmp = watermarks =
2453 	    Zalloc(update_count * sizeof (mp_watermark_t));
2454 	osp = offsets =
2455 	    Zalloc(update_count * sizeof (sp_ext_offset_t));
2456 
2457 	while (ext != NULL) {
2458 		if ((ext->ext_flags & EXTFLG_UPDATE) != 0) {
2459 			/* update watermark */
2460 			wmp->wm_magic = MD_SP_MAGIC;
2461 			wmp->wm_version = MD_SP_VERSION;
2462 			wmp->wm_type = ext->ext_type;
2463 			wmp->wm_seq = ext->ext_seq;
2464 			wmp->wm_length = ext->ext_length - MD_SP_WMSIZE;
2465 
2466 			/* fill in the volume name and set name */
2467 			if (ext->ext_namep != NULL)
2468 				(void) strcpy(wmp->wm_mdname,
2469 				    ext->ext_namep->cname);
2470 			else
2471 				(void) strcpy(wmp->wm_mdname, MD_SP_FREEWMNAME);
2472 			if (ext->ext_setp != NULL &&
2473 			    ext->ext_setp->setno != MD_LOCAL_SET)
2474 				(void) strcpy(wmp->wm_setname,
2475 				    ext->ext_setp->setname);
2476 			else
2477 				(void) strcpy(wmp->wm_setname,
2478 				    MD_SP_LOCALSETNAME);
2479 
2480 			/* Generate the checksum */
2481 			wmp->wm_checksum = 0;
2482 			crcgen((uchar_t *)wmp, (uint_t *)&wmp->wm_checksum,
2483 			    sizeof (*wmp), NULL);
2484 
2485 			/* record the extent offset */
2486 			*osp = ext->ext_offset;
2487 
2488 			/* Advance the placeholders */
2489 			osp++; wmp++;
2490 		}
2491 		ext = ext->ext_prev;
2492 	}
2493 
2494 	mdp = meta_get_mdunit(sp, msp->common.namep, ep);
2495 	if (mdp == NULL) {
2496 		rval = -1;
2497 		goto out;
2498 	}
2499 
2500 	(void) memset(&update_params, 0, sizeof (update_params));
2501 	update_params.mnum = MD_SID(mdp);
2502 	update_params.count = update_count;
2503 	update_params.wmp = (uintptr_t)watermarks;
2504 	update_params.osp = (uintptr_t)offsets;
2505 	MD_SETDRIVERNAME(&update_params, MD_SP,
2506 	    MD_MIN2SET(update_params.mnum));
2507 
2508 	if (metaioctl(iocval, &update_params, &update_params.mde,
2509 	    msp->common.namep->cname) != 0) {
2510 		(void) mdstealerror(ep, &update_params.mde);
2511 		rval = -1;
2512 		goto out;
2513 	}
2514 
2515 out:
2516 	Free(watermarks);
2517 	Free(offsets);
2518 
2519 	return (rval);
2520 }
2521 
2522 static int
2523 meta_sp_update_wm(
2524 	mdsetname_t	*sp,
2525 	md_sp_t		*msp,
2526 	sp_ext_node_t	*extlist,
2527 	md_error_t	*ep
2528 )
2529 {
2530 	return (meta_sp_update_wm_common(sp, msp, extlist, MD_IOC_SPUPDATEWM,
2531 	    ep));
2532 }
2533 
2534 static int
2535 meta_mn_sp_update_wm(
2536 	mdsetname_t	*sp,
2537 	md_sp_t		*msp,
2538 	sp_ext_node_t	*extlist,
2539 	md_error_t	*ep
2540 )
2541 {
2542 	return (meta_sp_update_wm_common(sp, msp, extlist, MD_MN_IOC_SPUPDATEWM,
2543 	    ep));
2544 }
2545 
2546 /*
2547  * FUNCTION:	meta_sp_clear_wm()
2548  * INPUT:	sp	- the operating set
2549  *		msp	- the unit structure for the soft partition to clear
2550  * OUTPUT:	ep	- return error pointer
2551  * RETURNS:	int	- -1 if error, 0 on success
2552  * PURPOSE:	steps through the extents for a soft partition unit and
2553  *		creates an extent list designed to mark all of the
2554  *		watermarks for those extents as free.  The extent list
2555  *		is then passed to meta_sp_update_wm() to actually write
2556  *		the watermarks out.
2557  */
2558 static int
2559 meta_sp_clear_wm(
2560 	mdsetname_t	*sp,
2561 	md_sp_t		*msp,
2562 	md_error_t	*ep
2563 )
2564 {
2565 	sp_ext_node_t	*extlist = NULL;
2566 	int		numexts = msp->ext.ext_len;
2567 	uint_t		i;
2568 	int		rval = 0;
2569 
2570 	/* for each watermark must set the flag to SP_FREE */
2571 	for (i = 0; i < numexts; i++) {
2572 		md_sp_ext_t	*extp = &msp->ext.ext_val[i];
2573 
2574 		meta_sp_list_insert(NULL, NULL, &extlist,
2575 		    extp->poff - MD_SP_WMSIZE, extp->len + MD_SP_WMSIZE,
2576 		    EXTTYP_FREE, 0, EXTFLG_UPDATE, meta_sp_cmp_by_offset);
2577 	}
2578 
2579 	/* update watermarks */
2580 	rval = meta_sp_update_wm(sp, msp, extlist, ep);
2581 
2582 	meta_sp_list_free(&extlist);
2583 	return (rval);
2584 }
2585 
2586 /*
2587  * FUNCTION:	meta_sp_read_wm()
2588  * INPUT:	sp	- setname for component
2589  *		compnp	- mdname_t for component
2590  *		offset	- the offset of the watermark to read (sectors)
2591  * OUTPUT:	wm	- the watermark structure to read into
2592  *		ep	- return error pointer
2593  * RETURNS:	int	- -1 if error, 0 on success
2594  * PURPOSE:	seeks out to the requested offset and reads a watermark.
2595  *		It then verifies that the magic number is correct and
2596  *		that the checksum is valid, returning an error if either
2597  *		is wrong.
2598  */
2599 static int
2600 meta_sp_read_wm(
2601 	mdsetname_t	*sp,
2602 	mdname_t	*compnp,
2603 	mp_watermark_t	*wm,
2604 	sp_ext_offset_t	offset,
2605 	md_error_t	*ep
2606 )
2607 {
2608 	md_sp_read_wm_t	read_params;
2609 
2610 	/*
2611 	 * make sure block offset does not overflow 2^64 bytes and it's a
2612 	 * multiple of the block size.
2613 	 */
2614 	assert(offset <= (1LL << (64 - DEV_BSHIFT)));
2615 	/* LINTED */
2616 	assert((sizeof (*wm) % DEV_BSIZE) == 0);
2617 
2618 	(void) memset(wm, 0, sizeof (*wm));
2619 
2620 	(void) memset(&read_params, 0, sizeof (read_params));
2621 	read_params.rdev = compnp->dev;
2622 	read_params.wmp = (uintptr_t)wm;
2623 	read_params.offset = offset;
2624 	MD_SETDRIVERNAME(&read_params, MD_SP, sp->setno);
2625 
2626 	if (metaioctl(MD_IOC_SPREADWM, &read_params,
2627 	    &read_params.mde, compnp->cname) != 0) {
2628 
2629 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
2630 		    "Extent header read failed, block %llu.\n"), offset);
2631 		return (mdstealerror(ep, &read_params.mde));
2632 	}
2633 
2634 	/* make sure magic number is correct */
2635 	if (wm->wm_magic != MD_SP_MAGIC) {
2636 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
2637 		    "found incorrect magic number %x, expected %x.\n"),
2638 		    wm->wm_magic, MD_SP_MAGIC);
2639 		/*
2640 		 * Pass NULL for the device name as we don't have
2641 		 * valid watermark contents.
2642 		 */
2643 		return (mdmderror(ep, MDE_SP_BADWMMAGIC, 0, NULL));
2644 	}
2645 
2646 	if (crcchk((uchar_t *)wm, (uint_t *)&wm->wm_checksum,
2647 	    sizeof (*wm), NULL)) {
2648 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
2649 		    "found incorrect checksum %x.\n"),
2650 		    wm->wm_checksum);
2651 		return (mdmderror(ep, MDE_SP_BADWMCRC, 0, wm->wm_mdname));
2652 	}
2653 
2654 	return (0);
2655 }
2656 
2657 /*
2658  * **************************************************************************
2659  *                  Query Functions
2660  * **************************************************************************
2661  */
2662 
2663 /*
2664  * IMPORTANT NOTE: This is a static function that assumes that
2665  *		   its input parameters have been checked and
2666  *		   have valid values that lie within acceptable
2667  *		   ranges.
2668  *
2669  * FUNCTION:	meta_sp_enough_space()
2670  * INPUT:	desired_number_of_sps - the number of soft partitions desired;
2671  *					must be > 0
2672  *		desired_sp_size - the desired soft partition size in blocks;
2673  *				  must be > 0
2674  *		extent_listpp - a reference to a reference to an extent
2675  *				list that lists the extents on a device;
2676  *				must be a reference to a reference to a
2677  *				valid extent list
2678  *		alignment - the desired data space alignment for the sp's
2679  * OUTPUT:	boolean_t return value
2680  * RETURNS:	boolean_t - B_TRUE if there's enough space in the extent
2681  *			    list to create the desired soft partitions,
2682  *			    B_FALSE if there's not enough space
2683  * PURPOSE:	determines whether there's enough free space in an extent
2684  *		list to allow creation of a set of soft partitions
2685  */
2686 static boolean_t
2687 meta_sp_enough_space(
2688 	int		desired_number_of_sps,
2689 	blkcnt_t	desired_sp_size,
2690 	sp_ext_node_t	**extent_listpp,
2691 	sp_ext_length_t	alignment
2692 )
2693 {
2694 	boolean_t		enough_space;
2695 	int			number_of_sps;
2696 	int			number_of_extents_used;
2697 	sp_ext_length_t		desired_ext_length = desired_sp_size;
2698 
2699 	enough_space = B_TRUE;
2700 	number_of_sps = 0;
2701 	while ((enough_space == B_TRUE) &&
2702 	    (number_of_sps < desired_number_of_sps)) {
2703 		/*
2704 		 * Use the extent allocation algorithm implemented by
2705 		 * meta_sp_alloc_by_len() to test whether the free
2706 		 * extents in the extent list referenced by *extent_listpp
2707 		 * contain enough space to accomodate a soft partition
2708 		 * of size desired_ext_length.
2709 		 *
2710 		 * Repeat the test <desired_number_of_sps> times
2711 		 * or until it fails, whichever comes first,
2712 		 * each time allocating the extents required to
2713 		 * create the soft partition without actually
2714 		 * creating the soft partition.
2715 		 */
2716 		number_of_extents_used = meta_sp_alloc_by_len(
2717 		    TEST_SETNAMEP, TEST_SOFT_PARTITION_NAMEP,
2718 		    extent_listpp, &desired_ext_length,
2719 		    NO_OFFSET, alignment);
2720 		if (number_of_extents_used == -1) {
2721 			enough_space = B_FALSE;
2722 		} else {
2723 			number_of_sps++;
2724 		}
2725 	}
2726 	return (enough_space);
2727 }
2728 
2729 /*
2730  * IMPORTANT NOTE: This is a static function that calls other functions
2731  *		   that check its mdsetnamep and device_mdnamep
2732  *		   input parameters, but expects extent_listpp to
2733  *		   be a initialized to a valid address to which
2734  *		   it can write a reference to the extent list that
2735  *		   it creates.
2736  *
2737  * FUNCTION:	meta_sp_get_extent_list()
2738  * INPUT:	mdsetnamep - a reference to the mdsetname_t structure
2739  *			     for the set containing the device for
2740  *			     which the extents are to be listed
2741  *		device_mdnamep - a reference to the mdname_t structure
2742  *				 for the device for which the extents
2743  *				 are to be listed
2744  * OUTPUT:	*extent_listpp - a reference to the extent list for
2745  *				 the device; NULL if the function fails
2746  *		*ep - the libmeta error encountered, if any
2747  * RETURNS:	boolean_t - B_TRUE if the function call was successful,
2748  *			    B_FALSE if not
2749  * PURPOSE:	gets the extent list for a device
2750  */
2751 static boolean_t
2752 meta_sp_get_extent_list(
2753 	mdsetname_t	*mdsetnamep,
2754 	mdname_t	*device_mdnamep,
2755 	sp_ext_node_t	**extent_listpp,
2756 	md_error_t	*ep
2757 )
2758 {
2759 	diskaddr_t		device_size_in_blocks;
2760 	mdnamelist_t		*sp_name_listp;
2761 	diskaddr_t		start_block_address_in_blocks;
2762 
2763 	*extent_listpp = NULL;
2764 	sp_name_listp = NULL;
2765 
2766 	start_block_address_in_blocks = meta_sp_get_start(mdsetnamep,
2767 	    device_mdnamep, ep);
2768 	if (start_block_address_in_blocks == MD_DISKADDR_ERROR) {
2769 		if (getenv(META_SP_DEBUG)) {
2770 			mde_perror(ep,
2771 			    "meta_sp_get_extent_list:meta_sp_get_start");
2772 		}
2773 		return (B_FALSE);
2774 	}
2775 
2776 	device_size_in_blocks = metagetsize(device_mdnamep, ep);
2777 	if (device_size_in_blocks == MD_DISKADDR_ERROR) {
2778 		if (getenv(META_SP_DEBUG)) {
2779 			mde_perror(ep,
2780 			    "meta_sp_get_extent_list:metagetsize");
2781 		}
2782 		return (B_FALSE);
2783 	}
2784 
2785 	/*
2786 	 * Sanity check: the start block will have skipped an integer
2787 	 * number of cylinders, C.  C will usually be zero.  If (C > 0),
2788 	 * and the disk slice happens to only be C cylinders in total
2789 	 * size, we'll fail this check.
2790 	 */
2791 	if (device_size_in_blocks <=
2792 	    (start_block_address_in_blocks + MD_SP_WMSIZE)) {
2793 		(void) mdmderror(ep, MDE_SP_NOSPACE, 0, device_mdnamep->cname);
2794 		return (B_FALSE);
2795 	}
2796 
2797 	/*
2798 	 * After this point, we will have allocated resources, so any
2799 	 * failure returns must be through the supplied "fail" label
2800 	 * to properly deallocate things.
2801 	 */
2802 
2803 	/*
2804 	 * Create an empty extent list that starts one watermark past
2805 	 * the start block of the device and ends one watermark before
2806 	 * the end of the device.
2807 	 */
2808 	meta_sp_list_insert(TEST_SETNAMEP, TEST_SOFT_PARTITION_NAMEP,
2809 	    extent_listpp, NO_OFFSET,
2810 	    (sp_ext_length_t)start_block_address_in_blocks,
2811 	    EXTTYP_RESERVED, NO_SEQUENCE_NUMBER, NO_FLAGS,
2812 	    meta_sp_cmp_by_offset);
2813 	meta_sp_list_insert(TEST_SETNAMEP, TEST_SOFT_PARTITION_NAMEP,
2814 	    extent_listpp, (sp_ext_offset_t)(device_size_in_blocks -
2815 	    MD_SP_WMSIZE), MD_SP_WMSIZE, EXTTYP_END, NO_SEQUENCE_NUMBER,
2816 	    NO_FLAGS, meta_sp_cmp_by_offset);
2817 
2818 	/*
2819 	 * Get the list of soft partitions that are already on the
2820 	 * device.
2821 	 */
2822 	if (meta_sp_get_by_component(mdsetnamep, device_mdnamep,
2823 	    &sp_name_listp, FORCE_RELOAD_CACHE, ep) < 1) {
2824 		if (getenv(META_SP_DEBUG)) {
2825 			mde_perror(ep,
2826 			    "meta_sp_get_extent_list:meta_sp_get_by_component");
2827 		}
2828 		goto fail;
2829 	}
2830 
2831 	if (sp_name_listp != NULL) {
2832 		/*
2833 		 * If there are soft partitions on the device, add the
2834 		 * extents used in them to the extent list.
2835 		 */
2836 		if (meta_sp_extlist_from_namelist(mdsetnamep, sp_name_listp,
2837 		    extent_listpp, ep) == -1) {
2838 			if (getenv(META_SP_DEBUG)) {
2839 				mde_perror(ep, "meta_sp_get_extent_list:"
2840 				    "meta_sp_extlist_from_namelist");
2841 			}
2842 			goto fail;
2843 		}
2844 		metafreenamelist(sp_name_listp);
2845 	}
2846 
2847 	/*
2848 	 * Add free extents to the extent list to represent
2849 	 * the remaining regions of free space on the
2850 	 * device.
2851 	 */
2852 	meta_sp_list_freefill(extent_listpp, device_size_in_blocks);
2853 	return (B_TRUE);
2854 
2855 fail:
2856 	if (sp_name_listp != NULL) {
2857 		metafreenamelist(sp_name_listp);
2858 	}
2859 
2860 	if (*extent_listpp != NULL) {
2861 		/*
2862 		 * meta_sp_list_free sets *extent_listpp to NULL.
2863 		 */
2864 		meta_sp_list_free(extent_listpp);
2865 	}
2866 	return (B_FALSE);
2867 }
2868 
2869 /*
2870  * IMPORTANT NOTE: This is a static function that calls other functions
2871  *		   that check its mdsetnamep and mddrivenamep
2872  *		   input parameters, but expects extent_listpp to
2873  *		   be a initialized to a valid address to which
2874  *		   it can write a reference to the extent list that
2875  *		   it creates.
2876  *
2877  * FUNCTION:	meta_sp_get_extent_list_for_drive()
2878  * INPUT:	mdsetnamep - a reference to the mdsetname_t structure
2879  *			     for the set containing the drive for
2880  *			     which the extents are to be listed
2881  *		mddrivenamep   - a reference to the mddrivename_t structure
2882  *				 for the drive for which the extents
2883  *				 are to be listed
2884  * OUTPUT:	*extent_listpp - a reference to the extent list for
2885  *				 the drive; NULL if the function fails
2886  * RETURNS:	boolean_t - B_TRUE if the function call was successful,
2887  *			    B_FALSE if not
2888  * PURPOSE:	gets the extent list for a drive when the entire drive
2889  *		is to be soft partitioned
2890  */
2891 static boolean_t
2892 meta_sp_get_extent_list_for_drive(
2893 	mdsetname_t	*mdsetnamep,
2894 	mddrivename_t	*mddrivenamep,
2895 	sp_ext_node_t	**extent_listpp
2896 )
2897 {
2898 	boolean_t		can_use;
2899 	diskaddr_t		free_space;
2900 	md_error_t		mderror;
2901 	mdvtoc_t		proposed_vtoc;
2902 	int			repartition_options;
2903 	int			return_value;
2904 	md_sp_t			test_sp_struct;
2905 
2906 	can_use = B_TRUE;
2907 	*extent_listpp = NULL;
2908 	mderror = mdnullerror;
2909 	test_sp_struct.compnamep = metaslicename(mddrivenamep, MD_SLICE0,
2910 	    &mderror);
2911 	if (test_sp_struct.compnamep == NULL) {
2912 		can_use = B_FALSE;
2913 	}
2914 
2915 	if (can_use == B_TRUE) {
2916 		mderror = mdnullerror;
2917 		repartition_options = 0;
2918 		return_value = meta_check_sp(mdsetnamep, &test_sp_struct,
2919 		    MDCMD_USE_WHOLE_DISK, &repartition_options, &mderror);
2920 		if (return_value != 0) {
2921 			can_use = B_FALSE;
2922 		}
2923 	}
2924 
2925 	if (can_use == B_TRUE) {
2926 		mderror = mdnullerror;
2927 		repartition_options = repartition_options |
2928 		    (MD_REPART_FORCE | MD_REPART_DONT_LABEL);
2929 		return_value = meta_repartition_drive(mdsetnamep, mddrivenamep,
2930 		    repartition_options, &proposed_vtoc, &mderror);
2931 		if (return_value != 0) {
2932 			can_use = B_FALSE;
2933 		}
2934 	}
2935 
2936 	if (can_use == B_TRUE) {
2937 		free_space = proposed_vtoc.parts[MD_SLICE0].size;
2938 		if (free_space <= (MD_SP_START + MD_SP_WMSIZE)) {
2939 			can_use = B_FALSE;
2940 		}
2941 	}
2942 
2943 	if (can_use == B_TRUE) {
2944 		/*
2945 		 * Create an extent list that starts with
2946 		 * a reserved extent that ends at the start
2947 		 * of the usable space on slice zero of the
2948 		 * proposed VTOC, ends with an extent that
2949 		 * reserves space for a watermark at the end
2950 		 * of slice zero, and contains a single free
2951 		 * extent that occupies the rest of the space
2952 		 * on the slice.
2953 		 *
2954 		 * NOTE:
2955 		 *
2956 		 * Don't use metagetstart() or metagetsize() to
2957 		 * find the usable space.  They query the mdname_t
2958 		 * structure that represents an actual device to
2959 		 * determine the amount of space on the device that
2960 		 * contains metadata and the total amount of space
2961 		 * on the device.  Since this function creates a
2962 		 * proposed extent list that doesn't reflect the
2963 		 * state of an actual device, there's no mdname_t
2964 		 * structure to be queried.
2965 		 *
2966 		 * When a drive is reformatted to prepare for
2967 		 * soft partitioning, all of slice seven is
2968 		 * reserved for metadata, all of slice zero is
2969 		 * available for soft partitioning, and all other
2970 		 * slices on the drive are empty.  The proposed
2971 		 * extent list for the drive therefore contains
2972 		 * only three extents: a reserved extent that ends
2973 		 * at the start of the usable space on slice zero,
2974 		 * a single free extent that occupies all the usable
2975 		 * space on slice zero, and an ending extent that
2976 		 * reserves space for a watermark at the end of
2977 		 * slice zero.
2978 		 */
2979 		meta_sp_list_insert(TEST_SETNAMEP, TEST_SOFT_PARTITION_NAMEP,
2980 		    extent_listpp, NO_OFFSET, (sp_ext_length_t)(MD_SP_START),
2981 		    EXTTYP_RESERVED, NO_SEQUENCE_NUMBER, NO_FLAGS,
2982 		    meta_sp_cmp_by_offset);
2983 		meta_sp_list_insert(TEST_SETNAMEP, TEST_SOFT_PARTITION_NAMEP,
2984 		    extent_listpp, (sp_ext_offset_t)(free_space - MD_SP_WMSIZE),
2985 		    MD_SP_WMSIZE, EXTTYP_END, NO_SEQUENCE_NUMBER, NO_FLAGS,
2986 		    meta_sp_cmp_by_offset);
2987 		meta_sp_list_freefill(extent_listpp, free_space);
2988 	}
2989 	return (can_use);
2990 }
2991 
2992 /*
2993  * FUNCTION:	meta_sp_can_create_sps()
2994  * INPUT:	mdsetnamep - a reference to the mdsetname_t structure
2995  *			     for the set containing the device for
2996  *			     which the extents are to be listed
2997  *		mdnamep - a reference to the mdname_t of the device
2998  *			  on which the soft parititions are to be created
2999  *		number_of_sps - the desired number of soft partitions
3000  *		sp_size - the desired soft partition size
3001  * OUTPUT:	boolean_t return value
3002  * RETURNS:	boolean_t - B_TRUE if the soft partitionns can be created,
3003  *			    B_FALSE if not
3004  * PURPOSE:	determines whether a set of soft partitions can be created
3005  *		on a device
3006  */
3007 boolean_t
3008 meta_sp_can_create_sps(
3009 	mdsetname_t	*mdsetnamep,
3010 	mdname_t	*mdnamep,
3011 	int		number_of_sps,
3012 	blkcnt_t	sp_size
3013 )
3014 {
3015 	sp_ext_node_t	*extent_listp;
3016 	boolean_t	succeeded;
3017 	md_error_t	mde;
3018 
3019 	if ((number_of_sps > 0) && (sp_size > 0)) {
3020 		succeeded = meta_sp_get_extent_list(mdsetnamep, mdnamep,
3021 		    &extent_listp, &mde);
3022 	} else {
3023 		succeeded = B_FALSE;
3024 	}
3025 
3026 	/*
3027 	 * We don't really care about an error return from the
3028 	 * alignment call; that will just result in passing zero,
3029 	 * which will be interpreted as no alignment.
3030 	 */
3031 
3032 	if (succeeded == B_TRUE) {
3033 		succeeded = meta_sp_enough_space(number_of_sps,
3034 		    sp_size, &extent_listp,
3035 		    meta_sp_get_default_alignment(mdsetnamep, mdnamep, &mde));
3036 		meta_sp_list_free(&extent_listp);
3037 	}
3038 	return (succeeded);
3039 }
3040 
3041 /*
3042  * FUNCTION:	meta_sp_can_create_sps_on_drive()
3043  * INPUT:	mdsetnamep - a reference to the mdsetname_t structure
3044  *			     for the set containing the drive for
3045  *			     which the extents are to be listed
3046  *		mddrivenamep - a reference to the mddrivename_t of the drive
3047  *			       on which the soft parititions are to be created
3048  *		number_of_sps - the desired number of soft partitions
3049  *		sp_size - the desired soft partition size
3050  * OUTPUT:	boolean_t return value
3051  * RETURNS:	boolean_t - B_TRUE if the soft partitionns can be created,
3052  *			    B_FALSE if not
3053  * PURPOSE:	determines whether a set of soft partitions can be created
3054  *		on a drive if the entire drive is soft partitioned
3055  */
3056 boolean_t
3057 meta_sp_can_create_sps_on_drive(
3058 	mdsetname_t	*mdsetnamep,
3059 	mddrivename_t	*mddrivenamep,
3060 	int		number_of_sps,
3061 	blkcnt_t	sp_size
3062 )
3063 {
3064 	sp_ext_node_t	*extent_listp;
3065 	boolean_t	succeeded;
3066 
3067 	if ((number_of_sps > 0) && (sp_size > 0)) {
3068 		succeeded = meta_sp_get_extent_list_for_drive(mdsetnamep,
3069 		    mddrivenamep, &extent_listp);
3070 	} else {
3071 		succeeded = B_FALSE;
3072 	}
3073 
3074 	/*
3075 	 * We don't care about alignment on the space call because
3076 	 * we're specifically dealing with a drive, which will have no
3077 	 * inherent alignment.
3078 	 */
3079 
3080 	if (succeeded == B_TRUE) {
3081 		succeeded = meta_sp_enough_space(number_of_sps, sp_size,
3082 		    &extent_listp, SP_UNALIGNED);
3083 		meta_sp_list_free(&extent_listp);
3084 	}
3085 	return (succeeded);
3086 }
3087 
3088 /*
3089  * FUNCTION:	meta_sp_get_free_space()
3090  * INPUT:	mdsetnamep - a reference to the mdsetname_t structure
3091  *			     for the set containing the device for
3092  *			     which the free space is to be returned
3093  *		mdnamep - a reference to the mdname_t of the device
3094  *			  for which the free space is to be returned
3095  * OUTPUT:	blkcnt_t return value
3096  * RETURNS:	blkcnt_t - the number of blocks of free space on the device
3097  * PURPOSE:	returns the number of blocks of free space on a device
3098  */
3099 blkcnt_t
3100 meta_sp_get_free_space(
3101 	mdsetname_t	*mdsetnamep,
3102 	mdname_t	*mdnamep
3103 )
3104 {
3105 	sp_ext_node_t		*extent_listp;
3106 	sp_ext_length_t		free_blocks;
3107 	boolean_t		succeeded;
3108 	md_error_t		mde;
3109 
3110 	extent_listp = NULL;
3111 	free_blocks = 0;
3112 	succeeded = meta_sp_get_extent_list(mdsetnamep, mdnamep,
3113 	    &extent_listp, &mde);
3114 	if (succeeded == B_TRUE) {
3115 		free_blocks = meta_sp_list_size(extent_listp,
3116 		    EXTTYP_FREE, INCLUDE_WM);
3117 		meta_sp_list_free(&extent_listp);
3118 		if (free_blocks > (10 * MD_SP_WMSIZE)) {
3119 			/*
3120 			 * Subtract a safety margin for watermarks when
3121 			 * computing the number of blocks available for
3122 			 * use.  The actual number of watermarks can't
3123 			 * be calculated without knowing the exact numbers
3124 			 * and sizes of both the free extents and the soft
3125 			 * partitions to be created.  The calculation is
3126 			 * highly complex and error-prone even if those
3127 			 * quantities are known.  The approximate value
3128 			 * 10 * MD_SP_WMSIZE is within a few blocks of the
3129 			 * correct value in all practical cases.
3130 			 */
3131 			free_blocks = free_blocks - (10 * MD_SP_WMSIZE);
3132 		} else {
3133 			free_blocks = 0;
3134 		}
3135 	} else {
3136 		mdclrerror(&mde);
3137 	}
3138 
3139 	return (free_blocks);
3140 }
3141 
3142 /*
3143  * FUNCTION:	meta_sp_get_free_space_on_drive()
3144  * INPUT:	mdsetnamep - a reference to the mdsetname_t structure
3145  *			     for the set containing the drive for
3146  *			     which the free space is to be returned
3147  *		mddrivenamep - a reference to the mddrivename_t of the drive
3148  *			       for which the free space is to be returned
3149  * OUTPUT:	blkcnt_t return value
3150  * RETURNS:	blkcnt_t - the number of blocks of free space on the drive
3151  * PURPOSE:	returns the number of blocks of space usable for soft
3152  *		partitions on an entire drive, if the entire drive is
3153  *		soft partitioned
3154  */
3155 blkcnt_t
3156 meta_sp_get_free_space_on_drive(
3157 	mdsetname_t	*mdsetnamep,
3158 	mddrivename_t	*mddrivenamep
3159 )
3160 {
3161 	sp_ext_node_t		*extent_listp;
3162 	sp_ext_length_t		free_blocks;
3163 	boolean_t		succeeded;
3164 
3165 	extent_listp = NULL;
3166 	free_blocks = 0;
3167 	succeeded = meta_sp_get_extent_list_for_drive(mdsetnamep,
3168 	    mddrivenamep, &extent_listp);
3169 	if (succeeded == B_TRUE) {
3170 		free_blocks = meta_sp_list_size(extent_listp,
3171 		    EXTTYP_FREE, INCLUDE_WM);
3172 		meta_sp_list_free(&extent_listp);
3173 		if (free_blocks > (10 * MD_SP_WMSIZE)) {
3174 			/*
3175 			 * Subtract a safety margin for watermarks when
3176 			 * computing the number of blocks available for
3177 			 * use.  The actual number of watermarks can't
3178 			 * be calculated without knowing the exact numbers
3179 			 * and sizes of both the free extents and the soft
3180 			 * partitions to be created.  The calculation is
3181 			 * highly complex and error-prone even if those
3182 			 * quantities are known.  The approximate value
3183 			 * 10 * MD_SP_WMSIZE is within a few blocks of the
3184 			 * correct value in all practical cases.
3185 			 */
3186 			free_blocks = free_blocks - (10 * MD_SP_WMSIZE);
3187 		} else {
3188 			free_blocks = 0;
3189 		}
3190 	}
3191 	return (free_blocks);
3192 }
3193 
3194 /*
3195  * FUNCTION:	meta_sp_get_number_of_possible_sps()
3196  * INPUT:	mdsetnamep - a reference to the mdsetname_t structure
3197  *			     for the set containing the device for
3198  *			     which the number of possible soft partitions
3199  *			     is to be returned
3200  *		mdnamep - a reference to the mdname_t of the device
3201  *			  for which the number of possible soft partitions
3202  *			  is to be returned
3203  * OUTPUT:	int return value
3204  * RETURNS:	int - the number of soft partitions of the desired size
3205  *		      that can be created on the device
3206  * PURPOSE:	returns the number of soft partitions of a given size
3207  *		that can be created on a device
3208  */
3209 int
3210 meta_sp_get_number_of_possible_sps(
3211 	mdsetname_t	*mdsetnamep,
3212 	mdname_t	*mdnamep,
3213 	blkcnt_t	sp_size
3214 )
3215 {
3216 	sp_ext_node_t	*extent_listp;
3217 	int		number_of_possible_sps;
3218 	boolean_t	succeeded;
3219 	md_error_t	mde;
3220 	sp_ext_length_t	alignment;
3221 
3222 	extent_listp = NULL;
3223 	number_of_possible_sps = 0;
3224 	if (sp_size > 0) {
3225 		if ((succeeded = meta_sp_get_extent_list(mdsetnamep,
3226 		    mdnamep, &extent_listp, &mde)) == B_FALSE)
3227 			mdclrerror(&mde);
3228 	} else {
3229 		succeeded = B_FALSE;
3230 	}
3231 
3232 	if (succeeded == B_TRUE) {
3233 		alignment = meta_sp_get_default_alignment(mdsetnamep,
3234 		    mdnamep, &mde);
3235 	}
3236 
3237 	while (succeeded == B_TRUE) {
3238 		/*
3239 		 * Keep allocating space from the extent list
3240 		 * for soft partitions of the desired size until
3241 		 * there's not enough free space left in the list
3242 		 * for another soft partiition of that size.
3243 		 * Add one to the number of possible soft partitions
3244 		 * for each soft partition for which there is
3245 		 * enough free space left.
3246 		 */
3247 		succeeded = meta_sp_enough_space(ONE_SOFT_PARTITION,
3248 		    sp_size, &extent_listp, alignment);
3249 		if (succeeded == B_TRUE) {
3250 			number_of_possible_sps++;
3251 		}
3252 	}
3253 	if (extent_listp != NULL) {
3254 		meta_sp_list_free(&extent_listp);
3255 	}
3256 	return (number_of_possible_sps);
3257 }
3258 
3259 /*
3260  * FUNCTION:	meta_sp_get_number_of_possible_sps_on_drive()
3261  * INPUT:	mdsetnamep - a reference to the mdsetname_t structure
3262  *			     for the set containing the drive for
3263  *			     which the number of possible soft partitions
3264  *			     is to be returned
3265  *		mddrivenamep - a reference to the mddrivename_t of the drive
3266  *			       for which the number of possible soft partitions
3267  *			       is to be returned
3268  *		sp_size - the size in blocks of the proposed soft partitions
3269  * OUTPUT:	int return value
3270  * RETURNS:	int - the number of soft partitions of the desired size
3271  *		      that can be created on the drive
3272  * PURPOSE:	returns the number of soft partitions of a given size
3273  *		that can be created on a drive, if the entire drive is
3274  *		soft partitioned
3275  */
3276 int
3277 meta_sp_get_number_of_possible_sps_on_drive(
3278 	mdsetname_t	*mdsetnamep,
3279 	mddrivename_t	*mddrivenamep,
3280 	blkcnt_t	sp_size
3281 )
3282 {
3283 	sp_ext_node_t	*extent_listp;
3284 	int		number_of_possible_sps;
3285 	boolean_t	succeeded;
3286 
3287 	extent_listp = NULL;
3288 	number_of_possible_sps = 0;
3289 	if (sp_size > 0) {
3290 		succeeded = meta_sp_get_extent_list_for_drive(mdsetnamep,
3291 		    mddrivenamep, &extent_listp);
3292 	} else {
3293 		succeeded = B_FALSE;
3294 	}
3295 	while (succeeded == B_TRUE) {
3296 		/*
3297 		 * Keep allocating space from the extent list
3298 		 * for soft partitions of the desired size until
3299 		 * there's not enough free space left in the list
3300 		 * for another soft partition of that size.
3301 		 * Add one to the number of possible soft partitions
3302 		 * for each soft partition for which there is
3303 		 * enough free space left.
3304 		 *
3305 		 * Since it's a drive, not a metadevice, make no
3306 		 * assumptions about alignment.
3307 		 */
3308 		succeeded = meta_sp_enough_space(ONE_SOFT_PARTITION,
3309 		    sp_size, &extent_listp, SP_UNALIGNED);
3310 		if (succeeded == B_TRUE) {
3311 			number_of_possible_sps++;
3312 		}
3313 	}
3314 	if (extent_listp != NULL) {
3315 		meta_sp_list_free(&extent_listp);
3316 	}
3317 	return (number_of_possible_sps);
3318 }
3319 
3320 /*
3321  * FUNCTION:	meta_sp_get_possible_sp_size()
3322  * INPUT:	mdsetnamep - a reference to the mdsetname_t structure
3323  *			     for the set containing the device for
3324  *			     which the possible soft partition size
3325  *			     is to be returned
3326  *		mdnamep - a reference to the mdname_t of the device
3327  *			  for which the possible soft partition size
3328  *			  is to be returned
3329  *		number_of_sps - the desired number of soft partitions
3330  * OUTPUT:	blkcnt_t return value
3331  * RETURNS:	blkcnt_t - the possible soft partition size in blocks
3332  * PURPOSE:	returns the maximum possible size of each of a given number of
3333  *		soft partitions of equal size that can be created on a device
3334  */
3335 blkcnt_t
3336 meta_sp_get_possible_sp_size(
3337 	mdsetname_t	*mdsetnamep,
3338 	mdname_t	*mdnamep,
3339 	int		number_of_sps
3340 )
3341 {
3342 	blkcnt_t	free_blocks;
3343 	blkcnt_t	sp_size;
3344 	boolean_t	succeeded;
3345 
3346 	sp_size = 0;
3347 	if (number_of_sps > 0) {
3348 		free_blocks = meta_sp_get_free_space(mdsetnamep, mdnamep);
3349 		sp_size = free_blocks / number_of_sps;
3350 		succeeded = meta_sp_can_create_sps(mdsetnamep, mdnamep,
3351 		    number_of_sps, sp_size);
3352 		while ((succeeded == B_FALSE) && (sp_size > 0)) {
3353 			/*
3354 			 * To compensate for space that may have been
3355 			 * occupied by watermarks, reduce sp_size by a
3356 			 * number of blocks equal to the number of soft
3357 			 * partitions desired, and test again to see
3358 			 * whether the desired number of soft partitions
3359 			 * can be created.
3360 			 */
3361 			sp_size = sp_size - ((blkcnt_t)number_of_sps);
3362 			succeeded = meta_sp_can_create_sps(mdsetnamep, mdnamep,
3363 			    number_of_sps, sp_size);
3364 		}
3365 		if (sp_size < 0) {
3366 			sp_size = 0;
3367 		}
3368 	}
3369 	return (sp_size);
3370 }
3371 
3372 /*
3373  * FUNCTION:	meta_sp_get_possible_sp_size_on_drive()
3374  * INPUT:	mdsetnamep - a reference to the mdsetname_t structure
3375  *			     for the set containing the drive for
3376  *			     which the possible soft partition size
3377  *			     is to be returned
3378  *		mddrivenamep - a reference to the mddrivename_t of the drive
3379  *			       for which the possible soft partition size
3380  *			       is to be returned
3381  *		number_of_sps - the desired number of soft partitions
3382  * OUTPUT:	blkcnt_t return value
3383  * RETURNS:	blkcnt_t - the possible soft partition size in blocks
3384  * PURPOSE:	returns the maximum possible size of each of a given number of
3385  *		soft partitions of equal size that can be created on a drive
3386  *              if the entire drive is soft partitioned
3387  */
3388 blkcnt_t
3389 meta_sp_get_possible_sp_size_on_drive(
3390 	mdsetname_t	*mdsetnamep,
3391 	mddrivename_t	*mddrivenamep,
3392 	int		number_of_sps
3393 )
3394 {
3395 	blkcnt_t	free_blocks;
3396 	blkcnt_t	sp_size;
3397 	boolean_t	succeeded;
3398 
3399 	sp_size = 0;
3400 	if (number_of_sps > 0) {
3401 		free_blocks = meta_sp_get_free_space_on_drive(mdsetnamep,
3402 		    mddrivenamep);
3403 		sp_size = free_blocks / number_of_sps;
3404 		succeeded = meta_sp_can_create_sps_on_drive(mdsetnamep,
3405 		    mddrivenamep, number_of_sps, sp_size);
3406 		while ((succeeded == B_FALSE) && (sp_size > 0)) {
3407 			/*
3408 			 * To compensate for space that may have been
3409 			 * occupied by watermarks, reduce sp_size by a
3410 			 * number of blocks equal to the number of soft
3411 			 * partitions desired, and test again to see
3412 			 * whether the desired number of soft partitions
3413 			 * can be created.
3414 			 */
3415 			sp_size = sp_size - ((blkcnt_t)number_of_sps);
3416 			succeeded = meta_sp_can_create_sps_on_drive(mdsetnamep,
3417 			    mddrivenamep, number_of_sps, sp_size);
3418 		}
3419 		if (sp_size < 0) {
3420 			sp_size = 0;
3421 		}
3422 	}
3423 	return (sp_size);
3424 }
3425 
3426 /*
3427  * **************************************************************************
3428  *                  Unit Structure Manipulation Functions                   *
3429  * **************************************************************************
3430  */
3431 
3432 /*
3433  * FUNCTION:	meta_sp_fillextarray()
3434  * INPUT:	mp	- the unit structure to fill
3435  *		extlist	- the list of extents to fill with
3436  * OUTPUT:	none
3437  * RETURNS:	void
3438  * PURPOSE:	fills in the unit structure extent list with the extents
3439  *		specified by extlist.  Only extents in extlist with the
3440  *		EXTFLG_UPDATE flag are changed in the unit structure,
3441  *		and the index into the unit structure is the sequence
3442  *		number in the extent list.  After all of the nodes have
3443  *		been updated the virtual offsets in the unit structure
3444  *		are updated to reflect the new lengths.
3445  */
3446 static void
3447 meta_sp_fillextarray(
3448 	mp_unit_t	*mp,
3449 	sp_ext_node_t	*extlist
3450 )
3451 {
3452 	int	i;
3453 	sp_ext_node_t	*ext;
3454 	sp_ext_offset_t	curvoff = 0LL;
3455 
3456 	assert(mp != NULL);
3457 
3458 	/* go through the allocation list and fill in our unit structure */
3459 	for (ext = extlist; ext != NULL; ext = ext->ext_next) {
3460 		if ((ext->ext_type == EXTTYP_ALLOC) &&
3461 		    (ext->ext_flags & EXTFLG_UPDATE) != 0) {
3462 			mp->un_ext[ext->ext_seq].un_poff =
3463 			    ext->ext_offset + MD_SP_WMSIZE;
3464 			mp->un_ext[ext->ext_seq].un_len =
3465 			    ext->ext_length - MD_SP_WMSIZE;
3466 		}
3467 	}
3468 
3469 	for (i = 0; i < mp->un_numexts; i++) {
3470 		assert(mp->un_ext[i].un_poff != 0);
3471 		assert(mp->un_ext[i].un_len  != 0);
3472 		mp->un_ext[i].un_voff = curvoff;
3473 		curvoff += mp->un_ext[i].un_len;
3474 	}
3475 }
3476 
3477 /*
3478  * FUNCTION:	meta_sp_createunit()
3479  * INPUT:	np	- the name of the device to create a unit structure for
3480  *		compnp	- the name of the device the soft partition is on
3481  *		extlist	- the extent list to populate the new unit with
3482  *		numexts	- the number of extents in the extent list
3483  *		len	- the total size of the soft partition (sectors)
3484  *		status	- the initial status of the unit structure
3485  * OUTPUT:	ep	- return error pointer
3486  * RETURNS:	mp_unit_t * - the new unit structure.
3487  * PURPOSE:	allocates and fills in a new soft partition unit
3488  *		structure to be passed to the soft partitioning driver
3489  *		for creation.
3490  */
3491 static mp_unit_t *
3492 meta_sp_createunit(
3493 	mdname_t	*np,
3494 	mdname_t	*compnp,
3495 	sp_ext_node_t	*extlist,
3496 	int		numexts,
3497 	sp_ext_length_t	len,
3498 	sp_status_t	status,
3499 	md_error_t	*ep
3500 )
3501 {
3502 	mp_unit_t	*mp;
3503 	uint_t		ms_size;
3504 
3505 	ms_size = (sizeof (*mp) - sizeof (mp->un_ext[0])) +
3506 	    (numexts * sizeof (mp->un_ext[0]));
3507 
3508 	mp = Zalloc(ms_size);
3509 
3510 	/* fill in fields in common unit structure */
3511 	mp->c.un_type = MD_METASP;
3512 	mp->c.un_size = ms_size;
3513 	MD_SID(mp) = meta_getminor(np->dev);
3514 	mp->c.un_total_blocks = len;
3515 	mp->c.un_actual_tb = len;
3516 
3517 	/* set up geometry */
3518 	(void) meta_sp_setgeom(np, compnp, mp, ep);
3519 
3520 	/* if we're building on metadevice we can't parent */
3521 	if (metaismeta(compnp))
3522 		MD_CAPAB(mp) = MD_CANT_PARENT;
3523 	else
3524 		MD_CAPAB(mp) = MD_CAN_PARENT;
3525 
3526 	/* fill soft partition-specific fields */
3527 	mp->un_dev = compnp->dev;
3528 	mp->un_key = compnp->key;
3529 
3530 	/* mdname_t start_blk field is not 64-bit! */
3531 	mp->un_start_blk = (sp_ext_offset_t)compnp->start_blk;
3532 	mp->un_status = status;
3533 	mp->un_numexts = numexts;
3534 	mp->un_length = len;
3535 
3536 	/* fill in the extent array */
3537 	meta_sp_fillextarray(mp, extlist);
3538 
3539 	return (mp);
3540 }
3541 
3542 /*
3543  * FUNCTION:	meta_sp_updateunit()
3544  * INPUT:	np       - name structure for the metadevice being updated
3545  *		old_un	 - the original unit structure that is being updated
3546  *		extlist	 - the extent list to populate the new unit with
3547  *		grow_len - the amount by which the partition is being grown
3548  *		numexts	 - the number of extents in the extent list
3549  *		ep       - return error pointer
3550  * OUTPUT:	none
3551  * RETURNS:	mp_unit_t * - the updated unit structure
3552  * PURPOSE:	allocates and fills in a new soft partition unit structure to
3553  *		be passed to the soft partitioning driver for creation.  The
3554  *		old unit structure is first copied in, and then the updated
3555  *		extents are changed in the new unit structure.  This is
3556  *		typically used when the size of an existing unit is changed.
3557  */
3558 static mp_unit_t *
3559 meta_sp_updateunit(
3560 	mdname_t	*np,
3561 	mp_unit_t	*old_un,
3562 	sp_ext_node_t	*extlist,
3563 	sp_ext_length_t	grow_len,
3564 	int		numexts,
3565 	md_error_t	*ep
3566 )
3567 {
3568 	mp_unit_t	*new_un;
3569 	sp_ext_length_t	new_len;
3570 	uint_t		new_size;
3571 
3572 	assert(old_un != NULL);
3573 	assert(extlist != NULL);
3574 
3575 	/* allocate new unit structure and copy in old unit */
3576 	new_size = (sizeof (*old_un) - sizeof (old_un->un_ext[0])) +
3577 	    ((old_un->un_numexts + numexts) * sizeof (old_un->un_ext[0]));
3578 	new_len = old_un->un_length + grow_len;
3579 	new_un = Zalloc(new_size);
3580 	bcopy(old_un, new_un, old_un->c.un_size);
3581 
3582 	/* update size and geometry information */
3583 	new_un->c.un_size = new_size;
3584 	new_un->un_length = new_len;
3585 	new_un->c.un_total_blocks = new_len;
3586 	new_un->c.un_actual_tb = new_len;
3587 	if (meta_adjust_geom((md_unit_t *)new_un, np,
3588 	    old_un->c.un_wr_reinstruct, old_un->c.un_rd_reinstruct,
3589 	    0, ep) != 0) {
3590 		Free(new_un);
3591 		return (NULL);
3592 	}
3593 
3594 	/* update extent information */
3595 	new_un->un_numexts += numexts;
3596 
3597 	meta_sp_fillextarray(new_un, extlist);
3598 
3599 	return (new_un);
3600 }
3601 
3602 /*
3603  * FUNCTION:	meta_get_sp()
3604  * INPUT:	sp	- the set name for the device to get
3605  *		np	- the name of the device to get
3606  * OUTPUT:	ep	- return error pointer
3607  * RETURNS:	md_sp_t * - the XDR unit structure for the soft partition
3608  * PURPOSE:	interface to the rest of libmeta for fetching a unit structure
3609  *		for the named device.  Just a wrapper for meta_get_sp_common().
3610  */
3611 md_sp_t *
3612 meta_get_sp(
3613 	mdsetname_t	*sp,
3614 	mdname_t	*np,
3615 	md_error_t	*ep
3616 )
3617 {
3618 	return (meta_get_sp_common(sp, np, 0, ep));
3619 }
3620 
3621 /*
3622  * FUNCTION:	meta_get_sp_common()
3623  * INPUT:	sp	- the set name for the device to get
3624  *		np	- the name of the device to get
3625  *		fast	- whether to use the cache or not (NOT IMPLEMENTED!)
3626  * OUTPUT:	ep	- return error pointer
3627  * RETURNS:	md_sp_t * - the XDR unit structure for the soft partition,
3628  *			    NULL if np is not a soft partition
3629  * PURPOSE:	common routine for fetching a soft partition unit structure
3630  */
3631 md_sp_t *
3632 meta_get_sp_common(
3633 	mdsetname_t	*sp,
3634 	mdname_t	*np,
3635 	int		fast,
3636 	md_error_t	*ep
3637 )
3638 {
3639 	mddrivename_t	*dnp = np->drivenamep;
3640 	char		*miscname;
3641 	mp_unit_t	*mp;
3642 	md_sp_t		*msp;
3643 	int		i;
3644 
3645 	/* must have set */
3646 	assert(sp != NULL);
3647 
3648 	/* short circuit */
3649 	if (dnp->unitp != NULL) {
3650 		if (dnp->unitp->type != MD_METASP)
3651 			return (NULL);
3652 		return ((md_sp_t *)dnp->unitp);
3653 	}
3654 	/* get miscname and unit */
3655 	if ((miscname = metagetmiscname(np, ep)) == NULL)
3656 		return (NULL);
3657 
3658 	if (strcmp(miscname, MD_SP) != 0) {
3659 		(void) mdmderror(ep, MDE_NOT_SP, 0, np->cname);
3660 		return (NULL);
3661 	}
3662 
3663 	if ((mp = (mp_unit_t *)meta_get_mdunit(sp, np, ep)) == NULL)
3664 		return (NULL);
3665 
3666 	assert(mp->c.un_type == MD_METASP);
3667 
3668 	/* allocate soft partition */
3669 	msp = Zalloc(sizeof (*msp));
3670 
3671 	/* get the common information */
3672 	msp->common.namep = np;
3673 	msp->common.type = mp->c.un_type;
3674 	msp->common.state = mp->c.un_status;
3675 	msp->common.capabilities = mp->c.un_capabilities;
3676 	msp->common.parent = mp->c.un_parent;
3677 	msp->common.size = mp->c.un_total_blocks;
3678 	msp->common.user_flags = mp->c.un_user_flags;
3679 	msp->common.revision = mp->c.un_revision;
3680 
3681 	/* get soft partition information */
3682 	if ((msp->compnamep = metakeyname(&sp, mp->un_key, fast, ep)) == NULL)
3683 		goto out;
3684 
3685 	/*
3686 	 * Fill in the key and the start block.  Note that the start
3687 	 * block in the unit structure is 64 bits but the name pointer
3688 	 * only supports 32 bits.
3689 	 */
3690 	msp->compnamep->key = mp->un_key;
3691 	msp->compnamep->start_blk = mp->un_start_blk;
3692 
3693 	/* fill in status field */
3694 	msp->status = mp->un_status;
3695 
3696 	/* allocate the extents */
3697 	msp->ext.ext_val = Zalloc(mp->un_numexts * sizeof (*msp->ext.ext_val));
3698 	msp->ext.ext_len = mp->un_numexts;
3699 
3700 	/* do the extents for this soft partition */
3701 	for (i = 0; i < mp->un_numexts; i++) {
3702 		struct mp_ext	*mde = &mp->un_ext[i];
3703 		md_sp_ext_t	*extp = &msp->ext.ext_val[i];
3704 
3705 		extp->voff = mde->un_voff;
3706 		extp->poff = mde->un_poff;
3707 		extp->len = mde->un_len;
3708 	}
3709 
3710 	/* cleanup, return success */
3711 	Free(mp);
3712 	dnp->unitp = (md_common_t *)msp;
3713 	return (msp);
3714 
3715 out:
3716 	/* clean up and return error */
3717 	Free(mp);
3718 	Free(msp);
3719 	return (NULL);
3720 }
3721 
3722 
3723 /*
3724  * FUNCTION:	meta_init_sp()
3725  * INPUT:	spp	- the set name for the new device
3726  *		argc	- the remaining argument count for the metainit cmdline
3727  *		argv	- the remainder of the unparsed command line
3728  *		options	- global options parsed by metainit
3729  * OUTPUT:	ep	- return error pointer
3730  * RETURNS:	int	- -1 failure, 0 success
3731  * PURPOSE:	provides the command line parsing and name management overhead
3732  *		for creating a new soft partition.  Ultimately this calls
3733  *		meta_create_sp() which does the real work of allocating space
3734  *		for the new soft partition.
3735  */
3736 int
3737 meta_init_sp(
3738 	mdsetname_t	**spp,
3739 	int		argc,
3740 	char		*argv[],
3741 	mdcmdopts_t	options,
3742 	md_error_t	*ep
3743 )
3744 {
3745 	char		*compname = NULL;
3746 	mdname_t	*spcompnp = NULL;	/* name of component volume */
3747 	char		*devname = argv[0];	/* unit name */
3748 	mdname_t	*np = NULL;		/* name of soft partition */
3749 	md_sp_t		*msp = NULL;
3750 	int		c;
3751 	int		old_optind;
3752 	sp_ext_length_t	len = 0LL;
3753 	int		rval = -1;
3754 	uint_t		seq;
3755 	int		oflag;
3756 	int		failed;
3757 	mddrivename_t	*dnp = NULL;
3758 	sp_ext_length_t	alignment = 0LL;
3759 	sp_ext_node_t	*extlist = NULL;
3760 
3761 	assert(argc > 0);
3762 
3763 	/* expect sp name, -p, optional -e, compname, and size parameters */
3764 	/* grab soft partition name */
3765 	if ((np = metaname(spp, devname, META_DEVICE, ep)) == NULL)
3766 		goto out;
3767 
3768 	/* see if it exists already */
3769 	if (metagetmiscname(np, ep) != NULL) {
3770 		(void) mdmderror(ep, MDE_UNIT_ALREADY_SETUP,
3771 		    meta_getminor(np->dev), devname);
3772 		goto out;
3773 	} else if (! mdismderror(ep, MDE_UNIT_NOT_SETUP)) {
3774 		goto out;
3775 	} else {
3776 		mdclrerror(ep);
3777 	}
3778 	--argc, ++argv;
3779 
3780 	if (argc == 0)
3781 		goto syntax;
3782 
3783 	/* grab -p */
3784 	if (strcmp(argv[0], "-p") != 0)
3785 		goto syntax;
3786 	--argc, ++argv;
3787 
3788 	if (argc == 0)
3789 		goto syntax;
3790 
3791 	/* see if -e is there */
3792 	if (strcmp(argv[0], "-e") == 0) {
3793 		/* use the whole disk */
3794 		options |= MDCMD_USE_WHOLE_DISK;
3795 		--argc, ++argv;
3796 	}
3797 
3798 	if (argc == 0)
3799 		goto syntax;
3800 
3801 	/* get component name */
3802 	compname = Strdup(argv[0]);
3803 
3804 	if (options & MDCMD_USE_WHOLE_DISK) {
3805 		if ((dnp = metadrivename(spp, compname, ep)) == NULL) {
3806 			goto out;
3807 		}
3808 		if ((spcompnp = metaslicename(dnp, 0, ep)) == NULL) {
3809 			goto out;
3810 		}
3811 	} else if ((spcompnp = metaname(spp, compname, UNKNOWN, ep)) == NULL) {
3812 		goto out;
3813 	}
3814 	assert(*spp != NULL);
3815 
3816 	if (!(options & MDCMD_NOLOCK)) {
3817 		/* grab set lock */
3818 		if (meta_lock(*spp, TRUE, ep))
3819 			goto out;
3820 
3821 		if (meta_check_ownership(*spp, ep) != 0)
3822 			goto out;
3823 	}
3824 
3825 	/* allocate the soft partition */
3826 	msp = Zalloc(sizeof (*msp));
3827 
3828 	/* setup common */
3829 	msp->common.namep = np;
3830 	msp->common.type = MD_METASP;
3831 
3832 	compname = spcompnp->cname;
3833 
3834 	assert(spcompnp->rname != NULL);
3835 	--argc, ++argv;
3836 
3837 	if (argc == 0) {
3838 		goto syntax;
3839 	}
3840 
3841 	if (*argv[0] == '-') {
3842 		/*
3843 		 * parse any other command line options, this includes
3844 		 * the recovery options -o and -b. The special thing
3845 		 * with these options is that the len needs to be
3846 		 * kept track of otherwise when the geometry of the
3847 		 * "device" is built it will create an invalid geometry
3848 		 */
3849 		old_optind = optind = 0;
3850 		opterr = 0;
3851 		oflag = 0;
3852 		seq = 0;
3853 		failed = 0;
3854 		while ((c = getopt(argc, argv, "A:o:b:")) != -1) {
3855 			sp_ext_offset_t	offset;
3856 			sp_ext_length_t	length;
3857 			longlong_t	tmp_size;
3858 
3859 			switch (c) {
3860 			case 'A':	/* data alignment */
3861 				if (meta_sp_parsesizestring(optarg,
3862 				    &alignment) == -1) {
3863 					failed = 1;
3864 				}
3865 				break;
3866 			case 'o':	/* offset in the partition */
3867 				if (oflag == 1) {
3868 					failed = 1;
3869 				} else {
3870 					tmp_size = atoll(optarg);
3871 					if (tmp_size <= 0) {
3872 						failed = 1;
3873 					} else {
3874 						oflag = 1;
3875 						options |= MDCMD_DIRECT;
3876 
3877 						offset = tmp_size;
3878 					}
3879 				}
3880 
3881 				break;
3882 			case 'b':	/* number of blocks */
3883 				if (oflag == 0) {
3884 					failed = 1;
3885 				} else {
3886 					tmp_size = atoll(optarg);
3887 					if (tmp_size <= 0) {
3888 						failed = 1;
3889 					} else {
3890 						oflag = 0;
3891 
3892 						length = tmp_size;
3893 
3894 						/* we have a pair of values */
3895 						meta_sp_list_insert(*spp, np,
3896 						    &extlist, offset, length,
3897 						    EXTTYP_ALLOC, seq++,
3898 						    EXTFLG_UPDATE,
3899 						    meta_sp_cmp_by_offset);
3900 						len += length;
3901 					}
3902 				}
3903 
3904 				break;
3905 			default:
3906 				argc -= old_optind;
3907 				argv += old_optind;
3908 				goto options;
3909 			}
3910 
3911 			if (failed) {
3912 				argc -= old_optind;
3913 				argv += old_optind;
3914 				goto syntax;
3915 			}
3916 
3917 			old_optind = optind;
3918 		}
3919 		argc -= optind;
3920 		argv += optind;
3921 
3922 		/*
3923 		 * Must have matching pairs of -o and -b flags
3924 		 */
3925 		if (oflag != 0)
3926 			goto syntax;
3927 
3928 		/*
3929 		 * Can't specify both layout (indicated indirectly by
3930 		 * len being set by thye -o/-b cases above) AND
3931 		 * alignment
3932 		 */
3933 		if ((len > 0LL) && (alignment > 0LL))
3934 			goto syntax;
3935 
3936 		/*
3937 		 * sanity check the allocation list
3938 		 */
3939 		if ((extlist != NULL) && meta_sp_list_overlaps(extlist))
3940 			goto syntax;
3941 	}
3942 
3943 	if (len == 0LL) {
3944 		if (argc == 0)
3945 			goto syntax;
3946 		if (meta_sp_parsesize(argv[0], &len) == -1)
3947 			goto syntax;
3948 		--argc, ++argv;
3949 	}
3950 
3951 	msp->ext.ext_val = Zalloc(sizeof (*msp->ext.ext_val));
3952 	msp->ext.ext_val->len = len;
3953 	msp->compnamep = spcompnp;
3954 
3955 	/* we should be at the end */
3956 	if (argc != 0)
3957 		goto syntax;
3958 
3959 	/* create soft partition */
3960 	if (meta_create_sp(*spp, msp, extlist, options, alignment, ep) != 0)
3961 		goto out;
3962 	rval = 0;
3963 
3964 	/* let em know */
3965 	if (options & MDCMD_PRINT) {
3966 		(void) printf(dgettext(TEXT_DOMAIN,
3967 		    "%s: Soft Partition is setup\n"),
3968 		    devname);
3969 		(void) fflush(stdout);
3970 	}
3971 	goto out;
3972 
3973 syntax:
3974 	/* syntax error */
3975 	rval = meta_cook_syntax(ep, MDE_SYNTAX, compname, argc, argv);
3976 	goto out;
3977 
3978 options:
3979 	/* options error */
3980 	rval = meta_cook_syntax(ep, MDE_OPTION, compname, argc, argv);
3981 	goto out;
3982 
3983 out:
3984 	if (msp != NULL) {
3985 		if (msp->ext.ext_val != NULL) {
3986 			Free(msp->ext.ext_val);
3987 		}
3988 		Free(msp);
3989 	}
3990 
3991 	return (rval);
3992 }
3993 
3994 /*
3995  * FUNCTION:	meta_free_sp()
3996  * INPUT:	msp	- the soft partition unit to free
3997  * OUTPUT:	none
3998  * RETURNS:	void
3999  * PURPOSE:	provides an interface from the rest of libmeta for freeing a
4000  *		soft partition unit
4001  */
4002 void
4003 meta_free_sp(md_sp_t *msp)
4004 {
4005 	Free(msp);
4006 }
4007 
4008 /*
4009  * FUNCTION:	meta_sp_issp()
4010  * INPUT:	sp	- the set name to check
4011  *		np	- the name to check
4012  * OUTPUT:	ep	- return error pointer
4013  * RETURNS:	int	- 0 means sp,np is a soft partition
4014  *			  1 means sp,np is not a soft partition
4015  * PURPOSE:	determines whether the given device is a soft partition
4016  *		device.  This is called by other metadevice check routines.
4017  */
4018 int
4019 meta_sp_issp(
4020 	mdsetname_t	*sp,
4021 	mdname_t	*np,
4022 	md_error_t	*ep
4023 )
4024 {
4025 	if (meta_get_sp_common(sp, np, 0, ep) == NULL)
4026 		return (1);
4027 
4028 	return (0);
4029 }
4030 
4031 /*
4032  * FUNCTION:	meta_check_sp()
4033  * INPUT:	sp	- the set name to check
4034  *		msp	- the unit structure to check
4035  *		options	- creation options
4036  * OUTPUT:	repart_options - options to be passed to
4037  *				meta_repartition_drive()
4038  *		ep	- return error pointer
4039  * RETURNS:	int	-  0 ok to create on this component
4040  *			  -1 error or not ok to create on this component
4041  * PURPOSE:	Checks to determine whether the rules for creation of
4042  *		soft partitions allow creation of a soft partition on
4043  *		the device described by the mdname_t structure referred
4044  *		to by msp->compnamep.
4045  *
4046  *		NOTE: Does NOT check to determine whether the extents
4047  *		      described in the md_sp_t structure referred to by
4048  *		      msp will fit on the device described by the mdname_t
4049  *		      structure located at msp->compnamep.
4050  */
4051 static int
4052 meta_check_sp(
4053 	mdsetname_t	*sp,
4054 	md_sp_t		*msp,
4055 	mdcmdopts_t	options,
4056 	int		*repart_options,
4057 	md_error_t	*ep
4058 )
4059 {
4060 	md_common_t	*mdp;
4061 	mdname_t	*compnp = msp->compnamep;
4062 	uint_t		slice;
4063 	mddrivename_t	*dnp;
4064 	mdname_t	*slicenp;
4065 	mdvtoc_t	*vtocp;
4066 
4067 	/* make sure it is in the set */
4068 	if (meta_check_inset(sp, compnp, ep) != 0)
4069 		return (-1);
4070 
4071 	if ((options & MDCMD_USE_WHOLE_DISK) != 0) {
4072 		uint_t	rep_slice;
4073 
4074 		/*
4075 		 * check to make sure we can partition this drive.
4076 		 * we cannot continue if any of the following are
4077 		 * true:
4078 		 * The drive is a metadevice.
4079 		 * The drive contains a mounted slice.
4080 		 * The drive contains a slice being swapped to.
4081 		 * The drive contains slices which are part of other
4082 		 * metadevices.
4083 		 * The drive contains a metadb.
4084 		 */
4085 		if (metaismeta(compnp))
4086 			return (mddeverror(ep, MDE_IS_META, compnp->dev,
4087 			    compnp->cname));
4088 
4089 		assert(compnp->drivenamep != NULL);
4090 
4091 		/*
4092 		 * ensure that we have slice 0 since the disk will be
4093 		 * repartitioned in the USE_WHOLE_DISK case.  this check
4094 		 * is redundant unless the user incorrectly specifies a
4095 		 * a fully qualified drive AND slice name (i.e.,
4096 		 * /dev/dsk/cXtXdXsX), which will be incorrectly
4097 		 * recognized as a drive name by the metaname code.
4098 		 */
4099 
4100 		if ((vtocp = metagetvtoc(compnp, FALSE, &slice, ep)) == NULL)
4101 			return (-1);
4102 		if (slice != MD_SLICE0)
4103 			return (mderror(ep, MDE_NOT_DRIVENAME, compnp->cname));
4104 
4105 		dnp = compnp->drivenamep;
4106 		if (meta_replicaslice(dnp, &rep_slice, ep) != 0)
4107 			return (-1);
4108 
4109 		for (slice = 0; slice < vtocp->nparts; slice++) {
4110 
4111 			/* only check if the slice really exists */
4112 			if (vtocp->parts[slice].size == 0)
4113 				continue;
4114 
4115 			slicenp = metaslicename(dnp, slice, ep);
4116 			if (slicenp == NULL)
4117 				return (-1);
4118 
4119 			/* check to ensure that it is not already in use */
4120 			if (meta_check_inuse(sp,
4121 			    slicenp, MDCHK_INUSE, ep) != 0) {
4122 				return (-1);
4123 			}
4124 
4125 			/*
4126 			 * Up to this point, tests are applied to all
4127 			 * slices uniformly.
4128 			 */
4129 
4130 			if (slice == rep_slice) {
4131 				/*
4132 				 * Tests inside the body of this
4133 				 * conditional are applied only to
4134 				 * slice seven.
4135 				 */
4136 				if (meta_check_inmeta(sp, slicenp,
4137 				    options | MDCHK_ALLOW_MDDB |
4138 				    MDCHK_ALLOW_REPSLICE, 0, -1, ep) != 0)
4139 					return (-1);
4140 
4141 				/*
4142 				 * For slice seven, a metadb is NOT an
4143 				 * automatic failure. It merely means
4144 				 * that we're not allowed to muck
4145 				 * about with the partitioning of that
4146 				 * slice.  We indicate this by masking
4147 				 * in the MD_REPART_LEAVE_REP flag.
4148 				 */
4149 				if (metahasmddb(sp, slicenp, ep)) {
4150 					assert(repart_options !=
4151 					    NULL);
4152 					*repart_options |=
4153 					    MD_REPART_LEAVE_REP;
4154 				}
4155 
4156 				/*
4157 				 * Skip the remaining tests for slice
4158 				 * seven
4159 				 */
4160 				continue;
4161 			}
4162 
4163 			/*
4164 			 * Tests below this point will be applied to
4165 			 * all slices EXCEPT for the replica slice.
4166 			 */
4167 
4168 
4169 			/* check if component is in a metadevice */
4170 			if (meta_check_inmeta(sp, slicenp, options, 0,
4171 			    -1, ep) != 0)
4172 				return (-1);
4173 
4174 			/* check to see if component has a metadb */
4175 			if (metahasmddb(sp, slicenp, ep))
4176 				return (mddeverror(ep, MDE_HAS_MDDB,
4177 				    slicenp->dev, slicenp->cname));
4178 		}
4179 		/*
4180 		 * This should be all of the testing necessary when
4181 		 * the MDCMD_USE_WHOLE_DISK flag is set; the rest of
4182 		 * meta_check_sp() is oriented towards component
4183 		 * arguments instead of disks.
4184 		 */
4185 		goto meta_check_sp_ok;
4186 
4187 	}
4188 
4189 	/* check to ensure that it is not already in use */
4190 	if (meta_check_inuse(sp, compnp, MDCHK_INUSE, ep) != 0) {
4191 		return (-1);
4192 	}
4193 
4194 	if (!metaismeta(compnp)) {	/* handle non-metadevices */
4195 
4196 		/*
4197 		 * The component can have one or more soft partitions on it
4198 		 * already, but can't be part of any other type of metadevice,
4199 		 * so if it is used for a metadevice, but the metadevice
4200 		 * isn't a soft partition, return failure.
4201 		 */
4202 
4203 		if (meta_check_inmeta(sp, compnp, options, 0, -1, ep) != 0 &&
4204 		    meta_check_insp(sp, compnp, 0, -1, ep) == 0) {
4205 			return (-1);
4206 		}
4207 	} else {			/* handle metadevices */
4208 		/* get underlying unit & check capabilities */
4209 		if ((mdp = meta_get_unit(sp, compnp, ep)) == NULL)
4210 			return (-1);
4211 
4212 		if ((! (mdp->capabilities & MD_CAN_PARENT)) ||
4213 		    (! (mdp->capabilities & MD_CAN_SP)))
4214 			return (mdmderror(ep, MDE_INVAL_UNIT,
4215 			    meta_getminor(compnp->dev), compnp->cname));
4216 	}
4217 
4218 meta_check_sp_ok:
4219 	mdclrerror(ep);
4220 	return (0);
4221 }
4222 
4223 /*
4224  * FUNCTION:	meta_create_sp()
4225  * INPUT:	sp	- the set name to create in
4226  *		msp	- the unit structure to create
4227  *		oblist	- an optional list of requested extents (-o/-b options)
4228  *		options	- creation options
4229  *		alignment - data alignment
4230  * OUTPUT:	ep	- return error pointer
4231  * RETURNS:	int	-  0 success, -1 error
4232  * PURPOSE:	does most of the work for creating a soft partition.  If
4233  *		metainit -p -e was used, first partition the drive.  Then
4234  *		create an extent list based on the existing soft partitions
4235  *		and assume all space not used by them is free.  Storage for
4236  *		the new soft partition is allocated from the free extents
4237  *		based on the length specified on the command line or the
4238  *		oblist passed in.  The unit structure is then committed and
4239  *		the watermarks are updated.  Finally, the status is changed to
4240  *		Okay and the process is complete.
4241  */
4242 static int
4243 meta_create_sp(
4244 	mdsetname_t	*sp,
4245 	md_sp_t		*msp,
4246 	sp_ext_node_t	*oblist,
4247 	mdcmdopts_t	options,
4248 	sp_ext_length_t	alignment,
4249 	md_error_t	*ep
4250 )
4251 {
4252 	mdname_t	*np = msp->common.namep;
4253 	mdname_t	*compnp = msp->compnamep;
4254 	mp_unit_t	*mp = NULL;
4255 	mdnamelist_t	*keynlp = NULL, *spnlp = NULL;
4256 	md_set_params_t	set_params;
4257 	int		rval = -1;
4258 	diskaddr_t	comp_size;
4259 	diskaddr_t	sp_start;
4260 	sp_ext_node_t	*extlist = NULL;
4261 	int		numexts = 0;	/* number of extents */
4262 	int		count = 0;
4263 	int		committed = 0;
4264 	int		repart_options = MD_REPART_FORCE;
4265 	int		create_flag = MD_CRO_32BIT;
4266 	int		mn_set_master = 0;
4267 
4268 	md_set_desc	*sd;
4269 	md_set_mmown_params_t	*ownpar = NULL;
4270 	int		comp_is_mirror = 0;
4271 
4272 	/* validate soft partition */
4273 	if (meta_check_sp(sp, msp, options, &repart_options, ep) != 0)
4274 		return (-1);
4275 
4276 	if ((options & MDCMD_USE_WHOLE_DISK) != 0) {
4277 		if ((options & MDCMD_DOIT) != 0) {
4278 			if (meta_repartition_drive(sp,
4279 			    compnp->drivenamep,
4280 			    repart_options,
4281 			    NULL, /* Don't return the VTOC */
4282 			    ep) != 0)
4283 
4284 				return (-1);
4285 		} else {
4286 			/*
4287 			 * If -n and -e are both specified, it doesn't make
4288 			 * sense to continue without actually partitioning
4289 			 * the drive.
4290 			 */
4291 			return (0);
4292 		}
4293 	}
4294 
4295 	/* populate the start_blk field of the component name */
4296 	if ((sp_start = meta_sp_get_start(sp, compnp, ep)) ==
4297 	    MD_DISKADDR_ERROR) {
4298 		rval = -1;
4299 		goto out;
4300 	}
4301 
4302 	if (options & MDCMD_DOIT) {
4303 		/* store name in namespace */
4304 		if (add_key_name(sp, compnp, &keynlp, ep) != 0) {
4305 			rval = -1;
4306 			goto out;
4307 		}
4308 	}
4309 
4310 	/*
4311 	 * Get a list of the soft partitions that currently reside on
4312 	 * the component.  We should ALWAYS force reload the cache,
4313 	 * because if this is a single creation, there will not BE a
4314 	 * cached list, and if we're using the md.tab, we must rebuild
4315 	 * the list because it won't contain the previous (if any)
4316 	 * soft partition.
4317 	 */
4318 	count = meta_sp_get_by_component(sp, compnp, &spnlp, 1, ep);
4319 	if (count < 0) {
4320 		/* error occured */
4321 		rval = -1;
4322 		goto out;
4323 	}
4324 
4325 	/*
4326 	 * get the size of the underlying device.  if the size is smaller
4327 	 * than or equal to the watermark size, we know there isn't
4328 	 * enough space.
4329 	 */
4330 	if ((comp_size = metagetsize(compnp, ep)) == MD_DISKADDR_ERROR) {
4331 		rval = -1;
4332 		goto out;
4333 	} else if (comp_size <= MD_SP_WMSIZE) {
4334 		(void) mdmderror(ep, MDE_SP_NOSPACE, 0, compnp->cname);
4335 		rval = -1;
4336 		goto out;
4337 	}
4338 	/*
4339 	 * seed extlist with reserved space at the beginning of the volume and
4340 	 * enough space for the end watermark.  The end watermark always gets
4341 	 * updated, but if the underlying device changes size it may not be
4342 	 * pointed to until the extent before it is updated.  Since the
4343 	 * end of the reserved space is where the first watermark starts,
4344 	 * the reserved extent should never be marked for updating.
4345 	 */
4346 
4347 	meta_sp_list_insert(NULL, NULL, &extlist,
4348 	    0ULL, sp_start, EXTTYP_RESERVED, 0, 0, meta_sp_cmp_by_offset);
4349 	meta_sp_list_insert(NULL, NULL, &extlist,
4350 	    (sp_ext_offset_t)(comp_size - MD_SP_WMSIZE), MD_SP_WMSIZE,
4351 	    EXTTYP_END, 0, EXTFLG_UPDATE, meta_sp_cmp_by_offset);
4352 
4353 	if (meta_sp_extlist_from_namelist(sp, spnlp, &extlist, ep) == -1) {
4354 		rval = -1;
4355 		goto out;
4356 	}
4357 
4358 	metafreenamelist(spnlp);
4359 
4360 	if (getenv(META_SP_DEBUG)) {
4361 		meta_sp_debug("meta_create_sp: list of used extents:\n");
4362 		meta_sp_list_dump(extlist);
4363 	}
4364 
4365 	meta_sp_list_freefill(&extlist, metagetsize(compnp, ep));
4366 
4367 	/* get extent list from -o/-b options or from free space */
4368 	if (options & MDCMD_DIRECT) {
4369 		if (getenv(META_SP_DEBUG)) {
4370 			meta_sp_debug("meta_create_sp: Dumping -o/-b list:\n");
4371 			meta_sp_list_dump(oblist);
4372 		}
4373 
4374 		numexts = meta_sp_alloc_by_list(sp, np, &extlist, oblist);
4375 		if (numexts == -1) {
4376 			(void) mdmderror(ep, MDE_SP_OVERLAP, 0, np->cname);
4377 			rval = -1;
4378 			goto out;
4379 		}
4380 	} else {
4381 		numexts = meta_sp_alloc_by_len(sp, np, &extlist,
4382 		    &msp->ext.ext_val->len, 0LL, (alignment > 0) ? alignment :
4383 		    meta_sp_get_default_alignment(sp, compnp, ep));
4384 		if (numexts == -1) {
4385 			(void) mdmderror(ep, MDE_SP_NOSPACE, 0, np->cname);
4386 			rval = -1;
4387 			goto out;
4388 		}
4389 	}
4390 
4391 	assert(extlist != NULL);
4392 
4393 	/* create soft partition */
4394 	mp = meta_sp_createunit(msp->common.namep, msp->compnamep,
4395 	    extlist, numexts, msp->ext.ext_val->len, MD_SP_CREATEPEND, ep);
4396 
4397 	create_flag = meta_check_devicesize(mp->c.un_total_blocks);
4398 
4399 	/* if we're not doing anything (metainit -n), return success */
4400 	if (! (options & MDCMD_DOIT)) {
4401 		rval = 0;	/* success */
4402 		goto out;
4403 	}
4404 
4405 	(void) memset(&set_params, 0, sizeof (set_params));
4406 
4407 	if (create_flag == MD_CRO_64BIT) {
4408 		mp->c.un_revision |= MD_64BIT_META_DEV;
4409 		set_params.options = MD_CRO_64BIT;
4410 	} else {
4411 		mp->c.un_revision &= ~MD_64BIT_META_DEV;
4412 		set_params.options = MD_CRO_32BIT;
4413 	}
4414 
4415 	if (getenv(META_SP_DEBUG)) {
4416 		meta_sp_debug("meta_create_sp: printing unit structure\n");
4417 		meta_sp_printunit(mp);
4418 	}
4419 
4420 	/*
4421 	 * Check to see if we're trying to create a partition on a mirror. If so
4422 	 * we may have to enforce an ownership change before writing the
4423 	 * watermark out.
4424 	 */
4425 	if (metaismeta(compnp)) {
4426 		char *miscname;
4427 
4428 		miscname = metagetmiscname(compnp, ep);
4429 		if (miscname != NULL)
4430 			comp_is_mirror = (strcmp(miscname, MD_MIRROR) == 0);
4431 		else
4432 			comp_is_mirror = 0;
4433 	} else {
4434 		comp_is_mirror = 0;
4435 	}
4436 
4437 	/*
4438 	 * For a multi-node environment we have to ensure that the master
4439 	 * node owns an underlying mirror before we issue the MD_IOCSET ioctl.
4440 	 * If the master does not own the device we will deadlock as the
4441 	 * implicit write of the watermarks (in sp_ioctl.c) will cause an
4442 	 * ownership change that will block as the MD_IOCSET is still in
4443 	 * progress. To close this window we force an owner change to occur
4444 	 * before issuing the MD_IOCSET. We cannot simply open the device and
4445 	 * write to it as this will only work for the first soft-partition
4446 	 * creation.
4447 	 */
4448 
4449 	if (comp_is_mirror && !metaislocalset(sp)) {
4450 
4451 		if ((sd = metaget_setdesc(sp, ep)) == NULL) {
4452 			rval = -1;
4453 			goto out;
4454 		}
4455 		if (MD_MNSET_DESC(sd) && sd->sd_mn_am_i_master) {
4456 			mn_set_master = 1;
4457 		}
4458 	}
4459 
4460 	set_params.mnum = MD_SID(mp);
4461 	set_params.size = mp->c.un_size;
4462 	set_params.mdp = (uintptr_t)mp;
4463 	MD_SETDRIVERNAME(&set_params, MD_SP, MD_MIN2SET(set_params.mnum));
4464 
4465 	/* first phase of commit. */
4466 	if (metaioctl(MD_IOCSET, &set_params, &set_params.mde,
4467 	    np->cname) != 0) {
4468 		(void) mdstealerror(ep, &set_params.mde);
4469 		rval = -1;
4470 		goto out;
4471 	}
4472 
4473 	/* we've successfully committed the record */
4474 	committed = 1;
4475 
4476 	/* write watermarks */
4477 	/*
4478 	 * Special-case for Multi-node sets. As we now have a distributed DRL
4479 	 * update mechanism, we _will_ hit the ioctl-within-ioctl deadlock case
4480 	 * unless we use a 'special' MN-capable ioctl to stage the watermark
4481 	 * update. This only affects the master-node in an MN set.
4482 	 */
4483 	if (mn_set_master) {
4484 		if (meta_mn_sp_update_wm(sp, msp, extlist, ep) < 0) {
4485 			rval = -1;
4486 			goto out;
4487 		}
4488 	} else {
4489 		if (meta_sp_update_wm(sp, msp, extlist, ep) < 0) {
4490 			rval = -1;
4491 			goto out;
4492 		}
4493 	}
4494 
4495 	/* second phase of commit, set status to MD_SP_OK */
4496 	if (meta_sp_setstatus(sp, &(MD_SID(mp)), 1, MD_SP_OK, ep) < 0) {
4497 		rval = -1;
4498 		goto out;
4499 	}
4500 	rval = 0;
4501 out:
4502 	Free(mp);
4503 	if (ownpar)
4504 		Free(ownpar);
4505 
4506 	if (extlist != NULL)
4507 		meta_sp_list_free(&extlist);
4508 
4509 	if (rval != 0 && keynlp != NULL && committed != 1)
4510 		(void) del_key_names(sp, keynlp, NULL);
4511 
4512 	metafreenamelist(keynlp);
4513 
4514 	return (rval);
4515 }
4516 
4517 /*
4518  * **************************************************************************
4519  *                      Reset (metaclear) Functions                         *
4520  * **************************************************************************
4521  */
4522 
4523 /*
4524  * FUNCTION:	meta_sp_reset_common()
4525  * INPUT:	sp	- the set name of the device to reset
4526  *		np	- the name of the device to reset
4527  *		msp	- the unit structure to reset
4528  *		options	- metaclear options
4529  * OUTPUT:	ep	- return error pointer
4530  * RETURNS:	int	-  0 success, -1 error
4531  * PURPOSE:	"resets", or more accurately deletes, the soft partition
4532  *		specified.  First the state is set to "deleting" and then the
4533  *		watermarks are all cleared out.  Once the watermarks have been
4534  *		updated, the unit structure is deleted from the metadb.
4535  */
4536 static int
4537 meta_sp_reset_common(
4538 	mdsetname_t	*sp,
4539 	mdname_t	*np,
4540 	md_sp_t		*msp,
4541 	md_sp_reset_t	reset_params,
4542 	mdcmdopts_t	options,
4543 	md_error_t	*ep
4544 )
4545 {
4546 	char	*miscname;
4547 	int	rval = -1;
4548 	int	is_open = 0;
4549 
4550 	/* make sure that nobody owns us */
4551 	if (MD_HAS_PARENT(msp->common.parent))
4552 		return (mdmderror(ep, MDE_IN_USE, meta_getminor(np->dev),
4553 		    np->cname));
4554 
4555 	/* make sure that the soft partition isn't open */
4556 	if ((is_open = meta_isopen(sp, np, ep, options)) < 0)
4557 		return (-1);
4558 	else if (is_open)
4559 		return (mdmderror(ep, MDE_IS_OPEN, meta_getminor(np->dev),
4560 		    np->cname));
4561 
4562 	/* get miscname */
4563 	if ((miscname = metagetmiscname(np, ep)) == NULL)
4564 		return (-1);
4565 
4566 	/* fill in reset params */
4567 	MD_SETDRIVERNAME(&reset_params, miscname, sp->setno);
4568 	reset_params.mnum = meta_getminor(np->dev);
4569 	reset_params.force = (options & MDCMD_FORCE) ? 1 : 0;
4570 
4571 	/*
4572 	 * clear soft partition - phase one.
4573 	 * place the soft partition into the "delete pending" state.
4574 	 */
4575 	if (meta_sp_setstatus(sp, &reset_params.mnum, 1, MD_SP_DELPEND, ep) < 0)
4576 		return (-1);
4577 
4578 	/*
4579 	 * Now clear the watermarks.  If the force flag is specified,
4580 	 * ignore any errors writing the watermarks and delete the unit
4581 	 * structure anyway.  An error may leave the on-disk format in a
4582 	 * corrupt state.  If force is not specified and we fail here,
4583 	 * the soft partition will remain in the "delete pending" state.
4584 	 */
4585 	if ((meta_sp_clear_wm(sp, msp, ep) < 0) &&
4586 	    ((options & MDCMD_FORCE) == 0))
4587 		goto out;
4588 
4589 	/*
4590 	 * clear soft partition - phase two.
4591 	 * the driver removes the soft partition from the metadb and
4592 	 * zeros out incore version.
4593 	 */
4594 	if (metaioctl(MD_IOCRESET, &reset_params,
4595 	    &reset_params.mde, np->cname) != 0) {
4596 		(void) mdstealerror(ep, &reset_params.mde);
4597 		goto out;
4598 	}
4599 
4600 	/*
4601 	 * Wait for the /dev to be cleaned up. Ignore the return
4602 	 * value since there's not much we can do.
4603 	 */
4604 	(void) meta_update_devtree(meta_getminor(np->dev));
4605 
4606 	rval = 0;	/* success */
4607 
4608 	if (options & MDCMD_PRINT) {
4609 		(void) printf(dgettext(TEXT_DOMAIN,
4610 		    "%s: Soft Partition is cleared\n"),
4611 		    np->cname);
4612 		(void) fflush(stdout);
4613 	}
4614 
4615 	/*
4616 	 * if told to recurse and on a metadevice, then attempt to
4617 	 * clear the subdevices.  Indicate failure if the clear fails.
4618 	 */
4619 	if ((options & MDCMD_RECURSE) &&
4620 	    (metaismeta(msp->compnamep)) &&
4621 	    (meta_reset_by_name(sp, msp->compnamep, options, ep) != 0))
4622 		rval = -1;
4623 
4624 out:
4625 	meta_invalidate_name(np);
4626 	return (rval);
4627 }
4628 
4629 /*
4630  * FUNCTION:	meta_sp_reset()
4631  * INPUT:	sp	- the set name of the device to reset
4632  *		np	- the name of the device to reset
4633  *		options	- metaclear options
4634  * OUTPUT:	ep	- return error pointer
4635  * RETURNS:	int	-  0 success, -1 error
4636  * PURPOSE:	provides the entry point to the rest of libmeta for deleting a
4637  *		soft partition.  If np is NULL, then soft partitions are
4638  *		all deleted at the current level and then recursively deleted.
4639  *		Otherwise, if a name is specified either directly or as a
4640  *		result of a recursive operation, it deletes only that name.
4641  *		Since something sitting under a soft partition may be parented
4642  *		to it, we have to reparent that other device to another soft
4643  *		partition on the same component if we're deleting the one it's
4644  *		parented to.
4645  */
4646 int
4647 meta_sp_reset(
4648 	mdsetname_t	*sp,
4649 	mdname_t	*np,
4650 	mdcmdopts_t	options,
4651 	md_error_t	*ep
4652 )
4653 {
4654 	md_sp_t		*msp;
4655 	int		rval = -1;
4656 	mdnamelist_t	*spnlp = NULL, *nlp = NULL;
4657 	md_sp_reset_t	reset_params;
4658 	int		num_sp;
4659 
4660 	assert(sp != NULL);
4661 
4662 	/* reset/delete all soft paritions */
4663 	if (np == NULL) {
4664 		/*
4665 		 * meta_reset_all sets MDCMD_RECURSE, but this behavior
4666 		 * is incorrect for soft partitions.  We want to clear
4667 		 * all soft partitions at a particular level in the
4668 		 * metadevice stack before moving to the next level.
4669 		 * Thus, we clear MDCMD_RECURSE from the options.
4670 		 */
4671 		options &= ~MDCMD_RECURSE;
4672 
4673 		/* for each soft partition */
4674 		rval = 0;
4675 		if (meta_get_sp_names(sp, &spnlp, 0, ep) < 0)
4676 			rval = -1;
4677 
4678 		for (nlp = spnlp; (nlp != NULL); nlp = nlp->next) {
4679 			np = nlp->namep;
4680 			if ((msp = meta_get_sp(sp, np, ep)) == NULL) {
4681 				rval = -1;
4682 				break;
4683 			}
4684 			/*
4685 			 * meta_reset_all calls us twice to get soft
4686 			 * partitions at the top and bottom of the stack.
4687 			 * thus, if we have a parent, we'll get deleted
4688 			 * on the next call.
4689 			 */
4690 			if (MD_HAS_PARENT(msp->common.parent))
4691 				continue;
4692 			/*
4693 			 * If this is a multi-node set, we send a series
4694 			 * of individual metaclear commands.
4695 			 */
4696 			if (meta_is_mn_set(sp, ep)) {
4697 				if (meta_mn_send_metaclear_command(sp,
4698 				    np->cname, options, 0, ep) != 0) {
4699 					rval = -1;
4700 					break;
4701 				}
4702 			} else {
4703 				if (meta_sp_reset(sp, np, options, ep) != 0) {
4704 					rval = -1;
4705 					break;
4706 				}
4707 			}
4708 		}
4709 		/* cleanup return status */
4710 		metafreenamelist(spnlp);
4711 		return (rval);
4712 	}
4713 
4714 	/* check the name */
4715 	if (metachkmeta(np, ep) != 0)
4716 		return (-1);
4717 
4718 	/* get the unit structure */
4719 	if ((msp = meta_get_sp(sp, np, ep)) == NULL)
4720 		return (-1);
4721 
4722 	/* clear out reset parameters */
4723 	(void) memset(&reset_params, 0, sizeof (reset_params));
4724 
4725 	/* if our child is a metadevice, we need to deparent/reparent it */
4726 	if (metaismeta(msp->compnamep)) {
4727 		/* get sp's on this component */
4728 		if ((num_sp = meta_sp_get_by_component(sp, msp->compnamep,
4729 		    &spnlp, 1, ep)) <= 0)
4730 			/* no sp's on this device.  error! */
4731 			return (-1);
4732 		else if (num_sp == 1)
4733 			/* last sp on this device, so we deparent */
4734 			reset_params.new_parent = MD_NO_PARENT;
4735 		else {
4736 			/* have to reparent this metadevice */
4737 			for (nlp = spnlp; nlp != NULL; nlp = nlp->next) {
4738 				if (meta_getminor(nlp->namep->dev) ==
4739 				    meta_getminor(np->dev))
4740 					continue;
4741 				/*
4742 				 * this isn't the softpart we are deleting,
4743 				 * so use this device as the new parent.
4744 				 */
4745 				reset_params.new_parent =
4746 				    meta_getminor(nlp->namep->dev);
4747 				break;
4748 			}
4749 		}
4750 		metafreenamelist(spnlp);
4751 	}
4752 
4753 	if (meta_sp_reset_common(sp, np, msp, reset_params, options, ep) != 0)
4754 		return (-1);
4755 
4756 	return (0);
4757 }
4758 
4759 /*
4760  * FUNCTION:	meta_sp_reset_component()
4761  * INPUT:	sp	- the set name of the device to reset
4762  *		name	- the string name of the device to reset
4763  *		options	- metaclear options
4764  * OUTPUT:	ep	- return error pointer
4765  * RETURNS:	int	-  0 success, -1 error
4766  * PURPOSE:	provides the ability to delete all soft partitions on a
4767  *		specified device (metaclear -p).  It first gets all of the
4768  *		soft partitions on the component and then deletes each one
4769  *		individually.
4770  */
4771 int
4772 meta_sp_reset_component(
4773 	mdsetname_t	*sp,
4774 	char		*name,
4775 	mdcmdopts_t	options,
4776 	md_error_t	*ep
4777 )
4778 {
4779 	mdname_t	*compnp, *np;
4780 	mdnamelist_t	*spnlp = NULL;
4781 	mdnamelist_t	*nlp = NULL;
4782 	md_sp_t		*msp;
4783 	int		count;
4784 	md_sp_reset_t	reset_params;
4785 
4786 	if ((compnp = metaname(&sp, name, UNKNOWN, ep)) == NULL)
4787 		return (-1);
4788 
4789 	/* If we're starting out with no soft partitions, it's an error */
4790 	count = meta_sp_get_by_component(sp, compnp, &spnlp, 1, ep);
4791 	if (count == 0)
4792 		return (mdmderror(ep, MDE_SP_NOSP, 0, compnp->cname));
4793 	else if (count < 0)
4794 		return (-1);
4795 
4796 	/*
4797 	 * clear all soft partitions on this component.
4798 	 * NOTE: we reparent underlying metadevices as we go so that
4799 	 * things stay sane.  Also, if we encounter an error, we stop
4800 	 * and go no further in case recovery might be needed.
4801 	 */
4802 	for (nlp = spnlp; nlp != NULL; nlp = nlp->next) {
4803 		/* clear out reset parameters */
4804 		(void) memset(&reset_params, 0, sizeof (reset_params));
4805 
4806 		/* check the name */
4807 		np = nlp->namep;
4808 
4809 		if (metachkmeta(np, ep) != 0) {
4810 			metafreenamelist(spnlp);
4811 			return (-1);
4812 		}
4813 
4814 		/* get the unit structure */
4815 		if ((msp = meta_get_sp(sp, np, ep)) == NULL) {
4816 			metafreenamelist(spnlp);
4817 			return (-1);
4818 		}
4819 
4820 		/* have to deparent/reparent metadevices */
4821 		if (metaismeta(compnp)) {
4822 			if (nlp->next == NULL)
4823 				reset_params.new_parent = MD_NO_PARENT;
4824 			else
4825 				reset_params.new_parent =
4826 				    meta_getminor(spnlp->next->namep->dev);
4827 		}
4828 
4829 		/* clear soft partition */
4830 		if (meta_sp_reset_common(sp, np, msp, reset_params,
4831 		    options, ep) < 0) {
4832 			metafreenamelist(spnlp);
4833 			return (-1);
4834 		}
4835 	}
4836 	metafreenamelist(spnlp);
4837 	return (0);
4838 }
4839 
4840 /*
4841  * **************************************************************************
4842  *                      Grow (metattach) Functions                          *
4843  * **************************************************************************
4844  */
4845 
4846 /*
4847  * FUNCTION:	meta_sp_attach()
4848  * INPUT:	sp	- the set name of the device to attach to
4849  *		np	- the name of the device to attach to
4850  *		addsize	- the unparsed string holding the amount of space to add
4851  *		options	- metattach options
4852  *		alignment - data alignment
4853  * OUTPUT:	ep	- return error pointer
4854  * RETURNS:	int	-  0 success, -1 error
4855  * PURPOSE:	grows a soft partition by reading in the existing unit
4856  *		structure and setting its state to Growing, allocating more
4857  *		space (similar to meta_create_sp()), updating the watermarks,
4858  *		and then writing out the new unit structure in the Okay state.
4859  */
4860 int
4861 meta_sp_attach(
4862 	mdsetname_t	*sp,
4863 	mdname_t	*np,
4864 	char		*addsize,
4865 	mdcmdopts_t	options,
4866 	sp_ext_length_t	alignment,
4867 	md_error_t	*ep
4868 )
4869 {
4870 	md_grow_params_t	grow_params;
4871 	sp_ext_length_t		grow_len;	/* amount to grow */
4872 	mp_unit_t		*mp, *new_un;
4873 	mdname_t		*compnp = NULL;
4874 
4875 	sp_ext_node_t		*extlist = NULL;
4876 	int			numexts;
4877 	mdnamelist_t		*spnlp = NULL;
4878 	int			count;
4879 	md_sp_t			*msp;
4880 	daddr_t			start_block;
4881 
4882 	/* should have the same set */
4883 	assert(sp != NULL);
4884 	assert(sp->setno == MD_MIN2SET(meta_getminor(np->dev)));
4885 
4886 	/* check name */
4887 	if (metachkmeta(np, ep) != 0)
4888 		return (-1);
4889 
4890 	if (meta_sp_parsesize(addsize, &grow_len) == -1) {
4891 		return (mdmderror(ep, MDE_SP_BAD_LENGTH, 0, np->cname));
4892 	}
4893 
4894 	if ((mp = (mp_unit_t *)meta_get_mdunit(sp, np, ep)) == NULL)
4895 		return (-1);
4896 
4897 	/* make sure we don't have a parent */
4898 	if (MD_HAS_PARENT(mp->c.un_parent)) {
4899 		Free(mp);
4900 		return (mdmderror(ep, MDE_INVAL_UNIT, 0, np->cname));
4901 	}
4902 
4903 	if (getenv(META_SP_DEBUG)) {
4904 		meta_sp_debug("meta_sp_attach: Unit structure before new "
4905 		    "space:\n");
4906 		meta_sp_printunit(mp);
4907 	}
4908 
4909 	/*
4910 	 * NOTE: the fast option to metakeyname is 0 as opposed to 1
4911 	 * If this was not the case we would suffer the following
4912 	 * assertion failure:
4913 	 * Assertion failed: type1 != MDT_FAST_META && type1 != MDT_FAST_COMP
4914 	 * file meta_check.x, line 315
4915 	 * I guess this is because we have not "seen" this drive before
4916 	 * and hence hit the failure - this is of course the attach routine
4917 	 */
4918 	if ((compnp = metakeyname(&sp, mp->un_key, 0, ep)) == NULL) {
4919 		Free(mp);
4920 		return (-1);
4921 	}
4922 
4923 	/* metakeyname does not fill in the key. */
4924 	compnp->key = mp->un_key;
4925 
4926 	/* work out the space on the component that we are dealing with */
4927 	count = meta_sp_get_by_component(sp, compnp, &spnlp, 0, ep);
4928 
4929 	/*
4930 	 * see if the component has been soft partitioned yet, or if an
4931 	 * error occurred.
4932 	 */
4933 	if (count == 0) {
4934 		Free(mp);
4935 		return (mdmderror(ep, MDE_NOT_SP, 0, np->cname));
4936 	} else if (count < 0) {
4937 		Free(mp);
4938 		return (-1);
4939 	}
4940 
4941 	/*
4942 	 * seed extlist with reserved space at the beginning of the volume and
4943 	 * enough space for the end watermark.  The end watermark always gets
4944 	 * updated, but if the underlying device changes size it may not be
4945 	 * pointed to until the extent before it is updated.  Since the
4946 	 * end of the reserved space is where the first watermark starts,
4947 	 * the reserved extent should never be marked for updating.
4948 	 */
4949 	if ((start_block = meta_sp_get_start(sp, compnp, ep)) ==
4950 	    MD_DISKADDR_ERROR) {
4951 		Free(mp);
4952 		return (-1);
4953 	}
4954 
4955 	meta_sp_list_insert(NULL, NULL, &extlist, 0ULL, start_block,
4956 	    EXTTYP_RESERVED, 0, 0, meta_sp_cmp_by_offset);
4957 	meta_sp_list_insert(NULL, NULL, &extlist,
4958 	    metagetsize(compnp, ep) - MD_SP_WMSIZE, MD_SP_WMSIZE,
4959 	    EXTTYP_END, 0, EXTFLG_UPDATE, meta_sp_cmp_by_offset);
4960 
4961 	if (meta_sp_extlist_from_namelist(sp, spnlp, &extlist, ep) == -1) {
4962 		Free(mp);
4963 		return (-1);
4964 	}
4965 
4966 	metafreenamelist(spnlp);
4967 
4968 	if (getenv(META_SP_DEBUG)) {
4969 		meta_sp_debug("meta_sp_attach: list of used extents:\n");
4970 		meta_sp_list_dump(extlist);
4971 	}
4972 
4973 	meta_sp_list_freefill(&extlist, metagetsize(compnp, ep));
4974 
4975 	assert(mp->un_numexts >= 1);
4976 	numexts = meta_sp_alloc_by_len(sp, np, &extlist, &grow_len,
4977 	    mp->un_ext[mp->un_numexts - 1].un_poff,
4978 	    (alignment > 0) ? alignment :
4979 	    meta_sp_get_default_alignment(sp, compnp, ep));
4980 
4981 	if (numexts == -1) {
4982 		Free(mp);
4983 		return (mdmderror(ep, MDE_SP_NOSPACE, 0, np->cname));
4984 	}
4985 
4986 	/* allocate new unit structure and copy in old unit */
4987 	if ((new_un = meta_sp_updateunit(np, mp, extlist,
4988 	    grow_len, numexts, ep)) == NULL) {
4989 		Free(mp);
4990 		return (-1);
4991 	}
4992 	Free(mp);
4993 
4994 	/* If running in dryrun mode (-n option), we're done here */
4995 	if ((options & MDCMD_DOIT) == 0) {
4996 		if (options & MDCMD_PRINT) {
4997 			(void) printf(dgettext(TEXT_DOMAIN,
4998 			    "%s: Soft Partition would grow\n"),
4999 			    np->cname);
5000 			(void) fflush(stdout);
5001 		}
5002 		return (0);
5003 	}
5004 
5005 	if (getenv(META_SP_DEBUG)) {
5006 		meta_sp_debug("meta_sp_attach: updated unit structure:\n");
5007 		meta_sp_printunit(new_un);
5008 	}
5009 
5010 	assert(new_un != NULL);
5011 
5012 	(void) memset(&grow_params, 0, sizeof (grow_params));
5013 	if (new_un->c.un_total_blocks > MD_MAX_BLKS_FOR_SMALL_DEVS) {
5014 		grow_params.options = MD_CRO_64BIT;
5015 		new_un->c.un_revision |= MD_64BIT_META_DEV;
5016 	} else {
5017 		grow_params.options = MD_CRO_32BIT;
5018 		new_un->c.un_revision &= ~MD_64BIT_META_DEV;
5019 	}
5020 	grow_params.mnum = MD_SID(new_un);
5021 	grow_params.size = new_un->c.un_size;
5022 	grow_params.mdp = (uintptr_t)new_un;
5023 	MD_SETDRIVERNAME(&grow_params, MD_SP, MD_MIN2SET(grow_params.mnum));
5024 
5025 	if (metaioctl(MD_IOCGROW, &grow_params, &grow_params.mde,
5026 	    np->cname) != 0) {
5027 		(void) mdstealerror(ep, &grow_params.mde);
5028 		return (-1);
5029 	}
5030 
5031 	/* update all watermarks */
5032 
5033 	if ((msp = meta_get_sp(sp, np, ep)) == NULL)
5034 		return (-1);
5035 	if (meta_sp_update_wm(sp, msp, extlist, ep) < 0)
5036 		return (-1);
5037 
5038 
5039 	/* second phase of commit, set status to MD_SP_OK */
5040 	if (meta_sp_setstatus(sp, &(MD_SID(new_un)), 1, MD_SP_OK, ep) < 0)
5041 		return (-1);
5042 
5043 	meta_invalidate_name(np);
5044 
5045 	if (options & MDCMD_PRINT) {
5046 		(void) printf(dgettext(TEXT_DOMAIN,
5047 		    "%s: Soft Partition has been grown\n"),
5048 		    np->cname);
5049 		(void) fflush(stdout);
5050 	}
5051 
5052 	return (0);
5053 }
5054 
5055 /*
5056  * **************************************************************************
5057  *                    Recovery (metarecover) Functions                      *
5058  * **************************************************************************
5059  */
5060 
5061 /*
5062  * FUNCTION:	meta_recover_sp()
5063  * INPUT:	sp	- the name of the set we are recovering on
5064  *		compnp	- name pointer for device we are recovering on
5065  *		argc	- argument count
5066  *		argv	- left over arguments not parsed by metarecover command
5067  *		options	- metarecover options
5068  * OUTPUT:	ep	- return error pointer
5069  * RETURNS:	int	- 0 - success, -1 - error
5070  * PURPOSE:	parse soft partitioning-specific metarecover options and
5071  *		dispatch to the appropriate function to handle recovery.
5072  */
5073 int
5074 meta_recover_sp(
5075 	mdsetname_t	*sp,
5076 	mdname_t	*compnp,
5077 	int		argc,
5078 	char		*argv[],
5079 	mdcmdopts_t	options,
5080 	md_error_t	*ep
5081 )
5082 {
5083 	md_set_desc	*sd;
5084 
5085 	if (argc > 1) {
5086 		(void) meta_cook_syntax(ep, MDE_SYNTAX, compnp->cname,
5087 		    argc, argv);
5088 		return (-1);
5089 	}
5090 
5091 	/*
5092 	 * For a MN set, this operation must be performed on the master
5093 	 * as it is responsible for maintaining the watermarks
5094 	 */
5095 	if (!metaislocalset(sp)) {
5096 		if ((sd = metaget_setdesc(sp, ep)) == NULL)
5097 			return (-1);
5098 		if (MD_MNSET_DESC(sd) && !sd->sd_mn_am_i_master) {
5099 			(void) mddserror(ep, MDE_DS_MASTER_ONLY, sp->setno,
5100 			    sd->sd_mn_master_nodenm, NULL, NULL);
5101 			return (-1);
5102 		}
5103 	}
5104 	if (argc == 0) {
5105 		/*
5106 		 * if no additional arguments are passed, metarecover should
5107 		 * validate both on-disk and metadb structures as well as
5108 		 * checking that both are consistent with each other
5109 		 */
5110 		if (meta_sp_validate_wm(sp, compnp, options, ep) < 0)
5111 			return (-1);
5112 		if (meta_sp_validate_unit(sp, compnp, options, ep) < 0)
5113 			return (-1);
5114 		if (meta_sp_validate_wm_and_unit(sp, compnp, options, ep) < 0)
5115 			return (-1);
5116 	} else if (strcmp(argv[0], "-d") == 0) {
5117 		/*
5118 		 * Ensure that there is no existing valid record for this
5119 		 * soft-partition. If there is we have nothing to do.
5120 		 */
5121 		if (meta_sp_validate_unit(sp, compnp, options, ep) == 0)
5122 			return (-1);
5123 		/* validate and recover from on-disk structures */
5124 		if (meta_sp_validate_wm(sp, compnp, options, ep) < 0)
5125 			return (-1);
5126 		if (meta_sp_recover_from_wm(sp, compnp, options, ep) < 0)
5127 			return (-1);
5128 	} else if (strcmp(argv[0], "-m") == 0) {
5129 		/* validate and recover from metadb structures */
5130 		if (meta_sp_validate_unit(sp, compnp, options, ep) < 0)
5131 			return (-1);
5132 		if (meta_sp_recover_from_unit(sp, compnp, options, ep) < 0)
5133 			return (-1);
5134 	} else {
5135 		/* syntax error */
5136 		(void) meta_cook_syntax(ep, MDE_SYNTAX, compnp->cname,
5137 		    argc, argv);
5138 		return (-1);
5139 	}
5140 
5141 	return (0);
5142 }
5143 
5144 /*
5145  * FUNCTION:	meta_sp_display_exthdr()
5146  * INPUT:	none
5147  * OUTPUT:	none
5148  * RETURNS:	void
5149  * PURPOSE:	print header line for sp_ext_node_t information.  to be used
5150  *		in conjunction with meta_sp_display_ext().
5151  */
5152 static void
5153 meta_sp_display_exthdr(void)
5154 {
5155 	(void) printf("%20s %5s %7s %20s %20s\n",
5156 	    dgettext(TEXT_DOMAIN, "Name"),
5157 	    dgettext(TEXT_DOMAIN, "Seq#"),
5158 	    dgettext(TEXT_DOMAIN, "Type"),
5159 	    dgettext(TEXT_DOMAIN, "Offset"),
5160 	    dgettext(TEXT_DOMAIN, "Length"));
5161 }
5162 
5163 
5164 /*
5165  * FUNCTION:	meta_sp_display_ext()
5166  * INPUT:	ext	- extent to display
5167  * OUTPUT:	none
5168  * RETURNS:	void
5169  * PURPOSE:	print selected fields from sp_ext_node_t.
5170  */
5171 static void
5172 meta_sp_display_ext(sp_ext_node_t *ext)
5173 {
5174 	/* print extent information */
5175 	if (ext->ext_namep != NULL)
5176 		(void) printf("%20s ", ext->ext_namep->cname);
5177 	else
5178 		(void) printf("%20s ", "NONE");
5179 
5180 	(void) printf("%5u ", ext->ext_seq);
5181 
5182 	switch (ext->ext_type) {
5183 	case EXTTYP_ALLOC:
5184 		(void) printf("%7s ", "ALLOC");
5185 		break;
5186 	case EXTTYP_FREE:
5187 		(void) printf("%7s ", "FREE");
5188 		break;
5189 	case EXTTYP_RESERVED:
5190 		(void) printf("%7s ", "RESV");
5191 		break;
5192 	case EXTTYP_END:
5193 		(void) printf("%7s ", "END");
5194 		break;
5195 	default:
5196 		(void) printf("%7s ", "INVLD");
5197 		break;
5198 	}
5199 
5200 	(void) printf("%20llu %20llu\n", ext->ext_offset, ext->ext_length);
5201 }
5202 
5203 
5204 /*
5205  * FUNCTION:	meta_sp_checkseq()
5206  * INPUT:	extlist	- list of extents to be checked
5207  * OUTPUT:	none
5208  * RETURNS:	int	- 0 - success, -1 - error
5209  * PURPOSE:	check soft partition sequence numbers.  this function assumes
5210  *		that a list of extents representing 1 or more soft partitions
5211  *		is passed in sorted in sequence number order.  within a
5212  *		single soft partition, there may not be any missing or
5213  *		duplicate sequence numbers.
5214  */
5215 static int
5216 meta_sp_checkseq(sp_ext_node_t *extlist)
5217 {
5218 	sp_ext_node_t *ext;
5219 
5220 	assert(extlist != NULL);
5221 
5222 	for (ext = extlist;
5223 	    ext->ext_next != NULL && ext->ext_next->ext_type == EXTTYP_ALLOC;
5224 	    ext = ext->ext_next) {
5225 		if (ext->ext_next->ext_namep != NULL &&
5226 		    strcmp(ext->ext_next->ext_namep->cname,
5227 		    ext->ext_namep->cname) != 0)
5228 				continue;
5229 
5230 		if (ext->ext_next->ext_seq != ext->ext_seq + 1) {
5231 			(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5232 			    "%s: sequence numbers are "
5233 			    "incorrect: %d should be %d\n"),
5234 			    ext->ext_next->ext_namep->cname,
5235 			    ext->ext_next->ext_seq, ext->ext_seq + 1);
5236 			return (-1);
5237 		}
5238 	}
5239 	return (0);
5240 }
5241 
5242 
5243 /*
5244  * FUNCTION:	meta_sp_resolve_name_conflict()
5245  * INPUT:	sp	- name of set we're are recovering in.
5246  *		old_np	- name pointer of soft partition we found on disk.
5247  * OUTPUT:	new_np	- name pointer for new soft partition name.
5248  *		ep	- error pointer returned.
5249  * RETURNS:	int	- 0 - name not replace, 1 - name replaced, -1 - error
5250  * PURPOSE:	Check to see if the name of one of the soft partitions we found
5251  *		on disk already exists in the metadb.  If so, prompt for a new
5252  *		name.  In addition, we keep a static array of names that
5253  *		will be recovered from this device since these names don't
5254  *		exist in the configuration at this point but cannot be
5255  *		recovered more than once.
5256  */
5257 static int
5258 meta_sp_resolve_name_conflict(
5259 	mdsetname_t	*sp,
5260 	mdname_t	*old_np,
5261 	mdname_t	**new_np,
5262 	md_error_t	*ep
5263 )
5264 {
5265 	char		yesno[255];
5266 	char		*yes;
5267 	char		newname[MD_SP_MAX_DEVNAME_PLUS_1];
5268 	int		nunits;
5269 	static int	*used_names = NULL;
5270 
5271 	assert(old_np != NULL);
5272 
5273 	if (used_names == NULL) {
5274 		if ((nunits = meta_get_nunits(ep)) < 0)
5275 			return (-1);
5276 		used_names = Zalloc(nunits * sizeof (int));
5277 	}
5278 
5279 	/* see if it exists already */
5280 	if (used_names[MD_MIN2UNIT(meta_getminor(old_np->dev))] == 0 &&
5281 	    metagetmiscname(old_np, ep) == NULL) {
5282 		if (! mdismderror(ep, MDE_UNIT_NOT_SETUP))
5283 			return (-1);
5284 		else {
5285 			used_names[MD_MIN2UNIT(meta_getminor(old_np->dev))] = 1;
5286 			mdclrerror(ep);
5287 			return (0);
5288 		}
5289 	}
5290 
5291 	/* name exists, ask the user for a new one */
5292 	(void) printf(dgettext(TEXT_DOMAIN,
5293 	    "WARNING: A soft partition named %s was found in the extent\n"
5294 	    "headers, but this name already exists in the metadb "
5295 	    "configuration.\n"
5296 	    "In order to continue recovery you must supply\n"
5297 	    "a new name for this soft partition.\n"), old_np->cname);
5298 	(void) printf(dgettext(TEXT_DOMAIN,
5299 	    "Would you like to continue and supply a new name? (yes/no) "));
5300 
5301 	(void) fflush(stdout);
5302 	if ((fgets(yesno, sizeof (yesno), stdin) == NULL) ||
5303 	    (strlen(yesno) == 1))
5304 		(void) snprintf(yesno, sizeof (yesno), "%s\n",
5305 		    dgettext(TEXT_DOMAIN, "no"));
5306 	yes = dgettext(TEXT_DOMAIN, "yes");
5307 	if (strncasecmp(yesno, yes, strlen(yesno) - 1) != 0) {
5308 		return (-1);
5309 	}
5310 
5311 	(void) fflush(stdin);
5312 
5313 	/* get the new name */
5314 	for (;;) {
5315 		(void) printf(dgettext(TEXT_DOMAIN, "Please enter a new name "
5316 		    "for this soft partition (dXXXX) "));
5317 		(void) fflush(stdout);
5318 		if (fgets(newname, MD_SP_MAX_DEVNAME_PLUS_1, stdin) == NULL)
5319 			(void) strcpy(newname, "");
5320 
5321 		/* remove newline character */
5322 		if (newname[strlen(newname) - 1] == '\n')
5323 			newname[strlen(newname) - 1] = '\0';
5324 
5325 		if (!(is_metaname(newname)) ||
5326 		    (meta_init_make_device(&sp, newname, ep) <= 0)) {
5327 			(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5328 			    "Invalid metadevice name\n"));
5329 			(void) fflush(stderr);
5330 			continue;
5331 		}
5332 
5333 		if ((*new_np = metaname(&sp, newname,
5334 		    META_DEVICE, ep)) == NULL) {
5335 			(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5336 			    "Invalid metadevice name\n"));
5337 			(void) fflush(stderr);
5338 			continue;
5339 		}
5340 
5341 		assert(MD_MIN2UNIT(meta_getminor((*new_np)->dev)) < nunits);
5342 		/* make sure the name isn't already being used */
5343 		if (used_names[MD_MIN2UNIT(meta_getminor((*new_np)->dev))] ||
5344 		    metagetmiscname(*new_np, ep) != NULL) {
5345 			(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5346 			    "That name already exists\n"));
5347 			continue;
5348 		} else if (! mdismderror(ep, MDE_UNIT_NOT_SETUP))
5349 			return (-1);
5350 
5351 		break;
5352 	}
5353 
5354 	/* got a new name, place in used array and return */
5355 	used_names[MD_MIN2UNIT(meta_getminor((*new_np)->dev))] = 1;
5356 	mdclrerror(ep);
5357 	return (1);
5358 }
5359 
5360 /*
5361  * FUNCTION:	meta_sp_validate_wm()
5362  * INPUT:	sp	- set name we are recovering in
5363  *		compnp	- name pointer for device we are recovering from
5364  *		options	- metarecover options
5365  * OUTPUT:	ep	- error pointer returned
5366  * RETURNS:	int	- 0 - success, -1 - error
5367  * PURPOSE:	validate and display watermark configuration.  walk the
5368  *		on-disk watermark structures and validate the information
5369  *		found within.  since a watermark configuration is
5370  *		"self-defining", the act of traversing the watermarks
5371  *		is part of the validation process.
5372  */
5373 static int
5374 meta_sp_validate_wm(
5375 	mdsetname_t	*sp,
5376 	mdname_t	*compnp,
5377 	mdcmdopts_t	options,
5378 	md_error_t	*ep
5379 )
5380 {
5381 	sp_ext_node_t	*extlist = NULL;
5382 	sp_ext_node_t	*ext;
5383 	int		num_sps = 0;
5384 	int		rval;
5385 
5386 	if ((options & MDCMD_VERBOSE) != 0)
5387 		(void) printf(dgettext(TEXT_DOMAIN,
5388 		    "Verifying on-disk structures on %s.\n"),
5389 		    compnp->cname);
5390 
5391 	/*
5392 	 * for each watermark, build an ext_node, place on list.
5393 	 */
5394 	rval = meta_sp_extlist_from_wm(sp, compnp, &extlist,
5395 	    meta_sp_cmp_by_nameseq, ep);
5396 
5397 	if ((options & MDCMD_VERBOSE) != 0) {
5398 		/* print out what we found */
5399 		if (extlist == NULL)
5400 			(void) printf(dgettext(TEXT_DOMAIN,
5401 			    "No extent headers found on %s.\n"),
5402 			    compnp->cname);
5403 		else {
5404 			(void) printf(dgettext(TEXT_DOMAIN,
5405 			    "The following extent headers were found on %s.\n"),
5406 			    compnp->cname);
5407 			meta_sp_display_exthdr();
5408 		}
5409 		for (ext = extlist; ext != NULL; ext = ext->ext_next)
5410 			meta_sp_display_ext(ext);
5411 	}
5412 
5413 	if (rval < 0) {
5414 		(void) printf(dgettext(TEXT_DOMAIN,
5415 		    "%s: On-disk structures invalid or "
5416 		    "no soft partitions found.\n"),
5417 		    compnp->cname);
5418 		return (-1);
5419 	}
5420 
5421 	assert(extlist != NULL);
5422 
5423 	/* count number of soft partitions */
5424 	for (ext = extlist;
5425 	    ext != NULL && ext->ext_type == EXTTYP_ALLOC;
5426 	    ext = ext->ext_next) {
5427 		if (ext->ext_next != NULL &&
5428 		    ext->ext_next->ext_namep != NULL &&
5429 		    strcmp(ext->ext_next->ext_namep->cname,
5430 		    ext->ext_namep->cname) == 0)
5431 				continue;
5432 		num_sps++;
5433 	}
5434 
5435 	if ((options & MDCMD_VERBOSE) != 0)
5436 		(void) printf(dgettext(TEXT_DOMAIN,
5437 		    "Found %d soft partition(s) on %s.\n"), num_sps,
5438 		    compnp->cname);
5439 
5440 	if (num_sps == 0) {
5441 		(void) printf(dgettext(TEXT_DOMAIN,
5442 		    "%s: No soft partitions.\n"), compnp->cname);
5443 		return (mdmderror(ep, MDE_RECOVER_FAILED, 0, compnp->cname));
5444 	}
5445 
5446 	/* check sequence numbers */
5447 	if ((options & MDCMD_VERBOSE) != 0)
5448 		(void) printf(dgettext(TEXT_DOMAIN,
5449 		    "Checking sequence numbers.\n"));
5450 
5451 	if (meta_sp_checkseq(extlist) != 0)
5452 		return (mdmderror(ep, MDE_RECOVER_FAILED, 0, compnp->cname));
5453 
5454 	return (0);
5455 }
5456 
5457 /*
5458  * FUNCTION:	meta_sp_validate_unit()
5459  * INPUT:	sp	- name of set we are recovering in
5460  *		compnp	- name of component we are recovering from
5461  *		options	- metarecover options
5462  * OUTPUT:	ep	- error pointer returned
5463  * RETURNS:	int	- 0 - success, -1 - error
5464  * PURPOSE:	validate and display metadb configuration.  begin by getting
5465  *		all soft partitions built on the specified component.  get
5466  *		the unit structure for each one and validate the fields within.
5467  */
5468 static int
5469 meta_sp_validate_unit(
5470 	mdsetname_t	*sp,
5471 	mdname_t	*compnp,
5472 	mdcmdopts_t	options,
5473 	md_error_t	*ep
5474 )
5475 {
5476 	md_sp_t		*msp;
5477 	mdnamelist_t	*spnlp = NULL;
5478 	mdnamelist_t	*namep = NULL;
5479 	int		count;
5480 	uint_t		extn;
5481 	sp_ext_length_t	size;
5482 
5483 	if ((options & MDCMD_VERBOSE) != 0)
5484 		(void) printf(dgettext(TEXT_DOMAIN,
5485 		    "%s: Validating soft partition metadb entries.\n"),
5486 		    compnp->cname);
5487 
5488 	if ((size = metagetsize(compnp, ep)) == MD_DISKADDR_ERROR)
5489 		return (-1);
5490 
5491 	/* get all soft partitions on component */
5492 	count = meta_sp_get_by_component(sp, compnp, &spnlp, 0, ep);
5493 
5494 	if (count == 0) {
5495 		(void) printf(dgettext(TEXT_DOMAIN,
5496 		    "%s: No soft partitions.\n"), compnp->cname);
5497 		return (mdmderror(ep, MDE_RECOVER_FAILED, 0, compnp->cname));
5498 	} else if (count < 0) {
5499 		return (-1);
5500 	}
5501 
5502 	/* Now go through the soft partitions and check each one */
5503 	for (namep = spnlp; namep != NULL; namep = namep->next) {
5504 		mdname_t	*curnp = namep->namep;
5505 		sp_ext_offset_t	curvoff;
5506 
5507 		/* get the unit structure */
5508 		if ((msp = meta_get_sp_common(sp, curnp, 0, ep)) == NULL)
5509 			return (-1);
5510 
5511 		/* verify generic unit structure parameters */
5512 		if ((options & MDCMD_VERBOSE) != 0)
5513 			(void) printf(dgettext(TEXT_DOMAIN,
5514 			    "\nVerifying device %s.\n"),
5515 			    curnp->cname);
5516 
5517 		/*
5518 		 * MD_SP_LAST is an invalid state and is always the
5519 		 * highest numbered.
5520 		 */
5521 		if (msp->status >= MD_SP_LAST) {
5522 			(void) printf(dgettext(TEXT_DOMAIN,
5523 			    "%s: status value %u is out of range.\n"),
5524 			    curnp->cname, msp->status);
5525 			return (mdmderror(ep, MDE_RECOVER_FAILED,
5526 			    0, curnp->cname));
5527 		} else if ((options & MDCMD_VERBOSE) != 0) {
5528 			uint_t	tstate = 0;
5529 
5530 			if (metaismeta(msp->compnamep)) {
5531 				if (meta_get_tstate(msp->common.namep->dev,
5532 				    &tstate, ep) != 0)
5533 					return (-1);
5534 			}
5535 			(void) printf(dgettext(TEXT_DOMAIN,
5536 			    "%s: Status \"%s\" is valid.\n"),
5537 			    curnp->cname, meta_sp_status_to_name(msp->status,
5538 			    tstate & MD_DEV_ERRORED));
5539 		}
5540 
5541 		/* Now verify each extent */
5542 		if ((options & MDCMD_VERBOSE) != 0)
5543 			(void) printf("%14s %21s %21s %21s\n",
5544 			    dgettext(TEXT_DOMAIN, "Extent Number"),
5545 			    dgettext(TEXT_DOMAIN, "Virtual Offset"),
5546 			    dgettext(TEXT_DOMAIN, "Physical Offset"),
5547 			    dgettext(TEXT_DOMAIN, "Length"));
5548 
5549 		curvoff = 0ULL;
5550 		for (extn = 0; extn < msp->ext.ext_len; extn++) {
5551 			md_sp_ext_t	*extp = &msp->ext.ext_val[extn];
5552 
5553 			if ((options & MDCMD_VERBOSE) != 0)
5554 				(void) printf("%14u %21llu %21llu %21llu\n",
5555 				    extn, extp->voff, extp->poff, extp->len);
5556 
5557 			if (extp->voff != curvoff) {
5558 				(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5559 				    "%s: virtual offset for extent %u "
5560 				    "is inconsistent, expected %llu, "
5561 				    "got %llu.\n"), curnp->cname, extn,
5562 				    curvoff, extp->voff);
5563 				return (mdmderror(ep, MDE_RECOVER_FAILED,
5564 				    0, compnp->cname));
5565 			}
5566 
5567 			/* make sure extent does not drop off the end */
5568 			if ((extp->poff + extp->len) == size) {
5569 				(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5570 				    "%s: extent %u at offset %llu, "
5571 				    "length %llu exceeds the size of the "
5572 				    "device, %llu.\n"), curnp->cname,
5573 				    extn, extp->poff, extp->len, size);
5574 				return (mdmderror(ep, MDE_RECOVER_FAILED,
5575 				    0, compnp->cname));
5576 			}
5577 
5578 			curvoff += extp->len;
5579 		}
5580 	}
5581 	if (options & MDCMD_PRINT) {
5582 		(void) printf(dgettext(TEXT_DOMAIN,
5583 		    "%s: Soft Partition metadb configuration is valid\n"),
5584 		    compnp->cname);
5585 	}
5586 	return (0);
5587 }
5588 
5589 /*
5590  * FUNCTION:	meta_sp_validate_wm_and_unit()
5591  * INPUT:	sp	- name of set we are recovering in
5592  *		compnp	- name of device we are recovering from
5593  *		options	- metarecover options
5594  * OUTPUT:	ep	- error pointer returned
5595  * RETURNS:	int	- 0 - success, -1 error
5596  * PURPOSE:	cross-validate and display watermarks and metadb records.
5597  *		get both the unit structures for the soft partitions built
5598  *		on the specified component and the watermarks found on that
5599  *		component and check to make sure they are consistent with
5600  *		each other.
5601  */
5602 static int
5603 meta_sp_validate_wm_and_unit(
5604 	mdsetname_t	*sp,
5605 	mdname_t	*np,
5606 	mdcmdopts_t	options,
5607 	md_error_t	*ep
5608 )
5609 {
5610 	sp_ext_node_t	*wmlist = NULL;
5611 	sp_ext_node_t	*unitlist = NULL;
5612 	sp_ext_node_t	*unitext;
5613 	sp_ext_node_t	*wmext;
5614 	sp_ext_offset_t	tmpunitoff;
5615 	mdnamelist_t	*spnlp = NULL;
5616 	int		count;
5617 	int		rval = 0;
5618 	int		verbose = (options & MDCMD_VERBOSE);
5619 
5620 	/* get unit structure list */
5621 	count = meta_sp_get_by_component(sp, np, &spnlp, 0, ep);
5622 	if (count <= 0)
5623 		return (-1);
5624 
5625 	meta_sp_list_insert(NULL, NULL, &unitlist,
5626 	    metagetsize(np, ep) - MD_SP_WMSIZE, MD_SP_WMSIZE,
5627 	    EXTTYP_END, 0, EXTFLG_UPDATE, meta_sp_cmp_by_offset);
5628 
5629 	if (meta_sp_extlist_from_namelist(sp, spnlp, &unitlist, ep) == -1) {
5630 		metafreenamelist(spnlp);
5631 		return (-1);
5632 	}
5633 
5634 	metafreenamelist(spnlp);
5635 
5636 	meta_sp_list_freefill(&unitlist, metagetsize(np, ep));
5637 
5638 	if (meta_sp_extlist_from_wm(sp, np, &wmlist,
5639 	    meta_sp_cmp_by_offset, ep) < 0) {
5640 		meta_sp_list_free(&unitlist);
5641 		return (-1);
5642 	}
5643 
5644 	if (getenv(META_SP_DEBUG)) {
5645 		meta_sp_debug("meta_sp_validate_wm_and_unit: unit list:\n");
5646 		meta_sp_list_dump(unitlist);
5647 		meta_sp_debug("meta_sp_validate_wm_and_unit: wm list:\n");
5648 		meta_sp_list_dump(wmlist);
5649 	}
5650 
5651 	/*
5652 	 * step through both lists and compare allocated nodes.  Free
5653 	 * nodes and end watermarks may differ between the two but
5654 	 * that's generally ok, and if they're wrong will typically
5655 	 * cause misplaced allocated extents.
5656 	 */
5657 	if (verbose)
5658 		(void) printf(dgettext(TEXT_DOMAIN, "\n%s: Verifying metadb "
5659 		    "allocations match extent headers.\n"), np->cname);
5660 
5661 	unitext = unitlist;
5662 	wmext = wmlist;
5663 	while ((wmext != NULL) && (unitext != NULL)) {
5664 		/* find next allocated extents in each list */
5665 		while (wmext != NULL && wmext->ext_type != EXTTYP_ALLOC)
5666 			wmext = wmext->ext_next;
5667 
5668 		while (unitext != NULL && unitext->ext_type != EXTTYP_ALLOC)
5669 			unitext = unitext->ext_next;
5670 
5671 		if (wmext == NULL || unitext == NULL)
5672 			break;
5673 
5674 		if (verbose) {
5675 			(void) printf(dgettext(TEXT_DOMAIN,
5676 			    "Metadb extent:\n"));
5677 			meta_sp_display_exthdr();
5678 			meta_sp_display_ext(unitext);
5679 			(void) printf(dgettext(TEXT_DOMAIN,
5680 			    "Extent header extent:\n"));
5681 			meta_sp_display_exthdr();
5682 			meta_sp_display_ext(wmext);
5683 			(void) printf("\n");
5684 		}
5685 
5686 		if (meta_sp_validate_exts(np, wmext, unitext, ep) < 0)
5687 			rval = -1;
5688 
5689 		/*
5690 		 * if the offsets aren't equal, only increment the
5691 		 * lowest one in hopes of getting the lists back in sync.
5692 		 */
5693 		tmpunitoff = unitext->ext_offset;
5694 		if (unitext->ext_offset <= wmext->ext_offset)
5695 			unitext = unitext->ext_next;
5696 		if (wmext->ext_offset <= tmpunitoff)
5697 			wmext = wmext->ext_next;
5698 	}
5699 
5700 	/*
5701 	 * if both lists aren't at the end then there are extra
5702 	 * allocated nodes in one of them.
5703 	 */
5704 	if (wmext != NULL) {
5705 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5706 		    "%s: extent headers contain allocations not in "
5707 		    "the metadb\n\n"), np->cname);
5708 		rval = -1;
5709 	}
5710 
5711 	if (unitext != NULL) {
5712 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5713 		    "%s: metadb contains allocations not in the extent "
5714 		    "headers\n\n"), np->cname);
5715 		rval = -1;
5716 	}
5717 
5718 	if (options & MDCMD_PRINT) {
5719 		if (rval == 0) {
5720 			(void) printf(dgettext(TEXT_DOMAIN,
5721 			    "%s: Soft Partition metadb matches extent "
5722 			    "header configuration\n"), np->cname);
5723 		} else {
5724 			(void) printf(dgettext(TEXT_DOMAIN,
5725 			    "%s: Soft Partition metadb does not match extent "
5726 			    "header configuration\n"), np->cname);
5727 		}
5728 	}
5729 
5730 	return (rval);
5731 }
5732 
5733 /*
5734  * FUNCTION:	meta_sp_validate_exts()
5735  * INPUT:	compnp	- name pointer for device we are recovering from
5736  *		wmext	- extent node representing watermark
5737  *		unitext	- extent node from unit structure
5738  * OUTPUT:	ep	- return error pointer
5739  * RETURNS:	int	- 0 - succes, mdmderror return code - error
5740  * PURPOSE:	Takes two extent nodes and checks them against each other.
5741  *		offset, length, sequence number, set, and name are compared.
5742  */
5743 static int
5744 meta_sp_validate_exts(
5745 	mdname_t	*compnp,
5746 	sp_ext_node_t	*wmext,
5747 	sp_ext_node_t	*unitext,
5748 	md_error_t	*ep
5749 )
5750 {
5751 	if (wmext->ext_offset != unitext->ext_offset) {
5752 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5753 		    "%s: unit structure and extent header offsets differ.\n"),
5754 		    compnp->cname);
5755 		return (mdmderror(ep, MDE_RECOVER_FAILED, 0, compnp->cname));
5756 	}
5757 
5758 	if (wmext->ext_length != unitext->ext_length) {
5759 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5760 		    "%s: unit structure and extent header lengths differ.\n"),
5761 		    compnp->cname);
5762 		return (mdmderror(ep, MDE_RECOVER_FAILED, 0, compnp->cname));
5763 	}
5764 
5765 	if (wmext->ext_seq != unitext->ext_seq) {
5766 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5767 		    "%s: unit structure and extent header sequence numbers "
5768 		    "differ.\n"), compnp->cname);
5769 		return (mdmderror(ep, MDE_RECOVER_FAILED, 0, compnp->cname));
5770 	}
5771 
5772 	if (wmext->ext_type != unitext->ext_type) {
5773 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5774 		    "%s: unit structure and extent header types differ.\n"),
5775 		    compnp->cname);
5776 		return (mdmderror(ep, MDE_RECOVER_FAILED, 0, compnp->cname));
5777 	}
5778 
5779 	/*
5780 	 * If one has a set pointer and the other doesn't, error.
5781 	 * If both extents have setnames, then make sure they match
5782 	 * If both are NULL, it's ok, they match.
5783 	 */
5784 	if ((unitext->ext_setp == NULL) ^ (wmext->ext_setp == NULL)) {
5785 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5786 		    "%s: unit structure and extent header set values "
5787 		    "differ.\n"), compnp->cname);
5788 		return (mdmderror(ep, MDE_RECOVER_FAILED, 0, compnp->cname));
5789 	}
5790 
5791 	if (unitext->ext_setp != NULL) {
5792 		if (strcmp(unitext->ext_setp->setname,
5793 		    wmext->ext_setp->setname) != 0) {
5794 			(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5795 			    "%s: unit structure and extent header set names "
5796 			    "differ.\n"), compnp->cname);
5797 			return (mdmderror(ep, MDE_RECOVER_FAILED,
5798 			    0, compnp->cname));
5799 		}
5800 	}
5801 
5802 	/*
5803 	 * If one has a name pointer and the other doesn't, error.
5804 	 * If both extents have names, then make sure they match
5805 	 * If both are NULL, it's ok, they match.
5806 	 */
5807 	if ((unitext->ext_namep == NULL) ^ (wmext->ext_namep == NULL)) {
5808 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5809 		    "%s: unit structure and extent header name values "
5810 		    "differ.\n"), compnp->cname);
5811 		return (mdmderror(ep, MDE_RECOVER_FAILED, 0, compnp->cname));
5812 	}
5813 
5814 	if (unitext->ext_namep != NULL) {
5815 		if (strcmp(wmext->ext_namep->cname,
5816 		    unitext->ext_namep->cname) != 0) {
5817 			(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5818 			    "%s: unit structure and extent header names "
5819 			    "differ.\n"), compnp->cname);
5820 			return (mdmderror(ep, MDE_RECOVER_FAILED,
5821 			    0, compnp->cname));
5822 		}
5823 	}
5824 
5825 	return (0);
5826 }
5827 
5828 /*
5829  * FUNCTION:	update_sp_status()
5830  * INPUT:	sp	- name of set we are recovering in
5831  *		minors	- pointer to an array of soft partition minor numbers
5832  *		num_sps	- number of minor numbers in array
5833  *		status	- new status to be applied to all soft parts in array
5834  *		mn_set	- set if current set is a multi-node set
5835  * OUTPUT:	ep	- return error pointer
5836  * RETURNS:	int	- 0 - success, -1 - error
5837  * PURPOSE:	update  status of soft partitions to new status. minors is an
5838  *		array of minor numbers to apply the new status to.
5839  *		If mn_set is set, a message is sent to all nodes in the
5840  *		cluster to update the status locally.
5841  */
5842 static int
5843 update_sp_status(
5844 	mdsetname_t	*sp,
5845 	minor_t		*minors,
5846 	int		num_sps,
5847 	sp_status_t	status,
5848 	bool_t		mn_set,
5849 	md_error_t	*ep
5850 )
5851 {
5852 	int	i;
5853 	int	err = 0;
5854 
5855 	if (mn_set) {
5856 		md_mn_msg_sp_setstat_t	sp_setstat_params;
5857 		int			result;
5858 		md_mn_result_t		*resp = NULL;
5859 
5860 		for (i = 0; i < num_sps; i++) {
5861 			sp_setstat_params.sp_setstat_mnum = minors[i];
5862 			sp_setstat_params.sp_setstat_status = status;
5863 
5864 			result = mdmn_send_message(sp->setno,
5865 			    MD_MN_MSG_SP_SETSTAT, MD_MSGF_DEFAULT_FLAGS, 0,
5866 			    (char *)&sp_setstat_params,
5867 			    sizeof (sp_setstat_params),
5868 			    &resp, ep);
5869 			if (resp != NULL) {
5870 				if (resp->mmr_exitval != 0)
5871 					err = -1;
5872 				free_result(resp);
5873 			}
5874 			if (result != 0) {
5875 				err = -1;
5876 			}
5877 		}
5878 	} else {
5879 		if (meta_sp_setstatus(sp, minors, num_sps, status, ep) < 0)
5880 			err = -1;
5881 	}
5882 	if (err < 0) {
5883 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5884 		    "Error updating status on recovered soft "
5885 		    "partitions.\n"));
5886 	}
5887 	return (err);
5888 }
5889 
5890 /*
5891  * FUNCTION:	meta_sp_recover_from_wm()
5892  * INPUT:	sp	- name of set we are recovering in
5893  *		compnp	- name pointer for component we are recovering from
5894  *		options	- metarecover options
5895  * OUTPUT:	ep	- return error pointer
5896  * RETURNS:	int	- 0 - success, -1 - error
5897  * PURPOSE:	update metadb records to match watermarks.  begin by getting
5898  *		an extlist representing all soft partitions on the component.
5899  *		then build a unit structure for each soft partition.
5900  *		notify user of changes, then commit each soft partition to
5901  *		the metadb one at a time in the "recovering" state.  update
5902  *		any watermarks that may need it	(to reflect possible name
5903  *		changes), and, finally, set the status of all recovered
5904  *		partitions to the "OK" state at once.
5905  */
5906 static int
5907 meta_sp_recover_from_wm(
5908 	mdsetname_t	*sp,
5909 	mdname_t	*compnp,
5910 	mdcmdopts_t	options,
5911 	md_error_t	*ep
5912 )
5913 {
5914 	sp_ext_node_t		*extlist = NULL;
5915 	sp_ext_node_t		*sp_list = NULL;
5916 	sp_ext_node_t		*update_list = NULL;
5917 	sp_ext_node_t		*ext;
5918 	sp_ext_node_t		*sp_ext;
5919 	mp_unit_t		*mp;
5920 	mp_unit_t		**un_array;
5921 	int			numexts = 0, num_sps = 0, i = 0;
5922 	int			err = 0;
5923 	int			not_recovered = 0;
5924 	int			committed = 0;
5925 	sp_ext_length_t		sp_length = 0LL;
5926 	mdnamelist_t		*keynlp = NULL;
5927 	mdname_t		*np;
5928 	mdname_t		*new_np;
5929 	int			new_name;
5930 	md_set_params_t		set_params;
5931 	minor_t			*minors = NULL;
5932 	char			yesno[255];
5933 	char			*yes;
5934 	bool_t			mn_set = 0;
5935 	md_set_desc		*sd;
5936 	mm_unit_t		*mm;
5937 	md_set_mmown_params_t	*ownpar = NULL;
5938 	int			comp_is_mirror = 0;
5939 
5940 	/*
5941 	 * if this component appears in another metadevice already, do
5942 	 * NOT recover from it.
5943 	 */
5944 	if (meta_check_inmeta(sp, compnp, options, 0, -1, ep) != 0)
5945 		return (-1);
5946 
5947 	/* set flag if dealing with a MN set */
5948 	if (!metaislocalset(sp)) {
5949 		if ((sd = metaget_setdesc(sp, ep)) == NULL) {
5950 			return (-1);
5951 		}
5952 		if (MD_MNSET_DESC(sd))
5953 			mn_set = 1;
5954 	}
5955 	/*
5956 	 * for each watermark, build an ext_node, place on list.
5957 	 */
5958 	if (meta_sp_extlist_from_wm(sp, compnp, &extlist,
5959 	    meta_sp_cmp_by_nameseq, ep) < 0)
5960 		return (mdmderror(ep, MDE_RECOVER_FAILED, 0, compnp->cname));
5961 
5962 	assert(extlist != NULL);
5963 
5964 	/* count number of soft partitions */
5965 	for (ext = extlist;
5966 	    ext != NULL && ext->ext_type == EXTTYP_ALLOC;
5967 	    ext = ext->ext_next) {
5968 		if (ext->ext_next != NULL &&
5969 		    ext->ext_next->ext_namep != NULL &&
5970 		    strcmp(ext->ext_next->ext_namep->cname,
5971 		    ext->ext_namep->cname) == 0)
5972 				continue;
5973 		num_sps++;
5974 	}
5975 
5976 	/* allocate array of unit structure pointers */
5977 	un_array = Zalloc(num_sps * sizeof (mp_unit_t *));
5978 
5979 	/*
5980 	 * build unit structures from list of ext_nodes.
5981 	 */
5982 	for (ext = extlist;
5983 	    ext != NULL && ext->ext_type == EXTTYP_ALLOC;
5984 	    ext = ext->ext_next) {
5985 		meta_sp_list_insert(ext->ext_setp, ext->ext_namep,
5986 		    &sp_list, ext->ext_offset, ext->ext_length,
5987 		    ext->ext_type, ext->ext_seq, ext->ext_flags,
5988 		    meta_sp_cmp_by_nameseq);
5989 
5990 		numexts++;
5991 		sp_length += ext->ext_length - MD_SP_WMSIZE;
5992 
5993 		if (ext->ext_next != NULL &&
5994 		    ext->ext_next->ext_namep != NULL &&
5995 		    strcmp(ext->ext_next->ext_namep->cname,
5996 		    ext->ext_namep->cname) == 0)
5997 				continue;
5998 
5999 		/*
6000 		 * if we made it here, we are at a soft partition
6001 		 * boundary in the list.
6002 		 */
6003 		if (getenv(META_SP_DEBUG)) {
6004 			meta_sp_debug("meta_recover_from_wm: dumping wm "
6005 			    "list:\n");
6006 			meta_sp_list_dump(sp_list);
6007 		}
6008 
6009 		assert(sp_list != NULL);
6010 		assert(sp_list->ext_namep != NULL);
6011 
6012 		if ((new_name = meta_sp_resolve_name_conflict(sp,
6013 		    sp_list->ext_namep, &new_np, ep)) < 0) {
6014 			err = 1;
6015 			goto out;
6016 		} else if (new_name) {
6017 			for (sp_ext = sp_list;
6018 			    sp_ext != NULL;
6019 			    sp_ext = sp_ext->ext_next) {
6020 				/*
6021 				 * insert into the update list for
6022 				 * watermark update.
6023 				 */
6024 				meta_sp_list_insert(sp_ext->ext_setp,
6025 				    new_np, &update_list, sp_ext->ext_offset,
6026 				    sp_ext->ext_length, sp_ext->ext_type,
6027 				    sp_ext->ext_seq, EXTFLG_UPDATE,
6028 				    meta_sp_cmp_by_offset);
6029 			}
6030 
6031 		}
6032 		if (options & MDCMD_DOIT) {
6033 			/* store name in namespace */
6034 			if (mn_set) {
6035 				/* send message to all nodes to return key */
6036 				md_mn_msg_addkeyname_t	*send_params;
6037 				int			result;
6038 				md_mn_result_t		*resp = NULL;
6039 				int			message_size;
6040 
6041 				message_size =  sizeof (*send_params) +
6042 				    strlen(compnp->cname) + 1;
6043 				send_params = Zalloc(message_size);
6044 				send_params->addkeyname_setno = sp->setno;
6045 				(void) strcpy(&send_params->addkeyname_name[0],
6046 				    compnp->cname);
6047 				result = mdmn_send_message(sp->setno,
6048 				    MD_MN_MSG_ADDKEYNAME, MD_MSGF_DEFAULT_FLAGS,
6049 				    0, (char *)send_params, message_size, &resp,
6050 				    ep);
6051 				Free(send_params);
6052 				if (resp != NULL) {
6053 					if (resp->mmr_exitval >= 0) {
6054 						compnp->key =
6055 						    (mdkey_t)resp->mmr_exitval;
6056 					} else {
6057 						err = 1;
6058 						free_result(resp);
6059 						goto out;
6060 					}
6061 					free_result(resp);
6062 				}
6063 				if (result != 0) {
6064 					err = 1;
6065 					goto out;
6066 				}
6067 				(void) metanamelist_append(&keynlp, compnp);
6068 			} else {
6069 				if (add_key_name(sp, compnp, &keynlp,
6070 				    ep) != 0) {
6071 					err = 1;
6072 					goto out;
6073 				}
6074 			}
6075 		}
6076 
6077 		/* create the unit structure */
6078 		if ((mp = meta_sp_createunit(
6079 		    (new_name) ? new_np : sp_list->ext_namep, compnp,
6080 		    sp_list, numexts, sp_length, MD_SP_RECOVER, ep)) == NULL) {
6081 			err = 1;
6082 			goto out;
6083 		}
6084 
6085 		if (getenv(META_SP_DEBUG)) {
6086 			meta_sp_debug("meta_sp_recover_from_wm: "
6087 			    "printing newly created unit structure");
6088 			meta_sp_printunit(mp);
6089 		}
6090 
6091 		/* place in unit structure array */
6092 		un_array[i++] = mp;
6093 
6094 		/* free sp_list */
6095 		meta_sp_list_free(&sp_list);
6096 		sp_list = NULL;
6097 		numexts = 0;
6098 		sp_length = 0LL;
6099 	}
6100 
6101 	/* display configuration updates */
6102 	(void) printf(dgettext(TEXT_DOMAIN,
6103 	    "The following soft partitions were found and will be added to\n"
6104 	    "your metadevice configuration.\n"));
6105 	(void) printf("%5s %15s %18s\n",
6106 	    dgettext(TEXT_DOMAIN, "Name"),
6107 	    dgettext(TEXT_DOMAIN, "Size"),
6108 	    dgettext(TEXT_DOMAIN, "No. of Extents"));
6109 	for (i = 0; i < num_sps; i++) {
6110 		(void) printf("%5s%lu %15llu %9d\n", "d",
6111 		    MD_MIN2UNIT(MD_SID(un_array[i])),
6112 		    un_array[i]->un_length, un_array[i]->un_numexts);
6113 	}
6114 
6115 	if (!(options & MDCMD_DOIT)) {
6116 		not_recovered = 1;
6117 		goto out;
6118 	}
6119 
6120 	/* ask user for confirmation */
6121 	(void) printf(dgettext(TEXT_DOMAIN,
6122 	    "WARNING: You are about to add one or more soft partition\n"
6123 	    "metadevices to your metadevice configuration.  If there\n"
6124 	    "appears to be an error in the soft partition(s) displayed\n"
6125 	    "above, do NOT proceed with this recovery operation.\n"));
6126 	(void) printf(dgettext(TEXT_DOMAIN,
6127 	    "Are you sure you want to do this (yes/no)? "));
6128 
6129 	(void) fflush(stdout);
6130 	if ((fgets(yesno, sizeof (yesno), stdin) == NULL) ||
6131 	    (strlen(yesno) == 1))
6132 		(void) snprintf(yesno, sizeof (yesno), "%s\n",
6133 		    dgettext(TEXT_DOMAIN, "no"));
6134 	yes = dgettext(TEXT_DOMAIN, "yes");
6135 	if (strncasecmp(yesno, yes, strlen(yesno) - 1) != 0) {
6136 		not_recovered = 1;
6137 		goto out;
6138 	}
6139 
6140 	/* commit records one at a time */
6141 	for (i = 0; i < num_sps; i++) {
6142 		(void) memset(&set_params, 0, sizeof (set_params));
6143 		set_params.mnum = MD_SID(un_array[i]);
6144 		set_params.size = (un_array[i])->c.un_size;
6145 		set_params.mdp = (uintptr_t)(un_array[i]);
6146 		set_params.options =
6147 		    meta_check_devicesize(un_array[i]->un_length);
6148 		if (set_params.options == MD_CRO_64BIT) {
6149 			un_array[i]->c.un_revision |= MD_64BIT_META_DEV;
6150 		} else {
6151 			un_array[i]->c.un_revision &= ~MD_64BIT_META_DEV;
6152 		}
6153 		MD_SETDRIVERNAME(&set_params, MD_SP,
6154 		    MD_MIN2SET(set_params.mnum));
6155 
6156 		np = metamnumname(&sp, MD_SID(un_array[i]), 0, ep);
6157 
6158 		/*
6159 		 * If this is an MN set, send the MD_IOCSET ioctl to all nodes
6160 		 */
6161 		if (mn_set) {
6162 			md_mn_msg_iocset_t	send_params;
6163 			int			result;
6164 			md_mn_result_t		*resp = NULL;
6165 			int			mess_size;
6166 
6167 			/*
6168 			 * Calculate message size. md_mn_msg_iocset_t only
6169 			 * contains one extent, so increment the size to
6170 			 * include all extents
6171 			 */
6172 			mess_size = sizeof (send_params) -
6173 			    sizeof (mp_ext_t) +
6174 			    (un_array[i]->un_numexts * sizeof (mp_ext_t));
6175 
6176 			send_params.iocset_params = set_params;
6177 			(void) memcpy(&send_params.unit, un_array[i],
6178 			    sizeof (*un_array[i]) - sizeof (mp_ext_t) +
6179 			    (un_array[i]->un_numexts * sizeof (mp_ext_t)));
6180 			result = mdmn_send_message(sp->setno,
6181 			    MD_MN_MSG_IOCSET, MD_MSGF_DEFAULT_FLAGS, 0,
6182 			    (char *)&send_params, mess_size, &resp,
6183 			    ep);
6184 			if (resp != NULL) {
6185 				if (resp->mmr_exitval != 0)
6186 					err = 1;
6187 				free_result(resp);
6188 			}
6189 			if (result != 0) {
6190 				err = 1;
6191 			}
6192 		} else {
6193 			if (metaioctl(MD_IOCSET, &set_params, &set_params.mde,
6194 			    np->cname) != 0) {
6195 				err = 1;
6196 			}
6197 		}
6198 
6199 		if (err == 1) {
6200 			(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
6201 			    "%s: Error committing record to metadb.\n"),
6202 			    np->cname);
6203 			goto out;
6204 		}
6205 
6206 		/* note that we've committed a record */
6207 		if (!committed)
6208 			committed = 1;
6209 
6210 		/* update any watermarks that need it */
6211 		if (update_list != NULL) {
6212 			md_sp_t *msp;
6213 
6214 			/*
6215 			 * Check to see if we're trying to create a partition
6216 			 * on a mirror. If so we may have to enforce an
6217 			 * ownership change before writing the watermark out.
6218 			 */
6219 			if (metaismeta(compnp)) {
6220 				char *miscname;
6221 
6222 				miscname = metagetmiscname(compnp, ep);
6223 				if (miscname != NULL)
6224 					comp_is_mirror = (strcmp(miscname,
6225 					    MD_MIRROR) == 0);
6226 				else
6227 					comp_is_mirror = 0;
6228 			}
6229 			/*
6230 			 * If this is a MN set and the component is a mirror,
6231 			 * change ownership to this node in order to write the
6232 			 * watermarks
6233 			 */
6234 			if (mn_set && comp_is_mirror) {
6235 				mm = (mm_unit_t *)meta_get_unit(sp, compnp, ep);
6236 				if (mm == NULL) {
6237 					err = 1;
6238 					goto out;
6239 				} else {
6240 					err = meta_mn_change_owner(&ownpar,
6241 					    sp->setno,
6242 					    meta_getminor(compnp->dev),
6243 					    sd->sd_mn_mynode->nd_nodeid,
6244 					    MD_MN_MM_PREVENT_CHANGE |
6245 					    MD_MN_MM_SPAWN_THREAD);
6246 					if (err != 0)
6247 						goto out;
6248 				}
6249 			}
6250 
6251 			if ((msp = meta_get_sp(sp, np, ep)) == NULL) {
6252 				err = 1;
6253 				(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
6254 				    "%s: Error updating extent headers.\n"),
6255 				    np->cname);
6256 				goto out;
6257 			}
6258 			if (meta_sp_update_wm(sp, msp, update_list, ep) < 0) {
6259 				err = 1;
6260 				(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
6261 				    "%s: Error updating extent headers "
6262 				    "on disk.\n"), np->cname);
6263 				goto out;
6264 			}
6265 		}
6266 		/*
6267 		 * If we have changed ownership earlier and prevented any
6268 		 * ownership changes, we can now allow ownership changes
6269 		 * again.
6270 		 */
6271 		if (ownpar) {
6272 			(void) meta_mn_change_owner(&ownpar, sp->setno,
6273 			    ownpar->d.mnum,
6274 			    ownpar->d.owner,
6275 			    MD_MN_MM_ALLOW_CHANGE | MD_MN_MM_SPAWN_THREAD);
6276 		}
6277 	}
6278 
6279 	/* update status of all soft partitions to OK */
6280 	minors = Zalloc(num_sps * sizeof (minor_t));
6281 	for (i = 0; i < num_sps; i++)
6282 		minors[i] = MD_SID(un_array[i]);
6283 
6284 	err = update_sp_status(sp, minors, num_sps, MD_SP_OK, mn_set, ep);
6285 	if (err != 0)
6286 		goto out;
6287 
6288 	if (options & MDCMD_PRINT)
6289 		(void) printf(dgettext(TEXT_DOMAIN, "%s: "
6290 		    "Soft Partitions recovered from device.\n"),
6291 		    compnp->cname);
6292 out:
6293 	/* free memory */
6294 	if (extlist != NULL)
6295 		meta_sp_list_free(&extlist);
6296 	if (sp_list != NULL)
6297 		meta_sp_list_free(&sp_list);
6298 	if (update_list != NULL)
6299 		meta_sp_list_free(&update_list);
6300 	if (un_array != NULL)	{
6301 		for (i = 0; i < num_sps; i++)
6302 			Free(un_array[i]);
6303 		Free(un_array);
6304 	}
6305 	if (minors != NULL)
6306 		Free(minors);
6307 	if (ownpar != NULL)
6308 		Free(ownpar);
6309 	(void) fflush(stdout);
6310 
6311 	if ((keynlp != NULL) && (committed != 1)) {
6312 		/*
6313 		 * if we haven't committed any softparts, either because of an
6314 		 * error or because the user decided not to proceed, delete
6315 		 * namelist key for the component
6316 		 */
6317 		if (mn_set) {
6318 			mdnamelist_t	*p;
6319 
6320 			for (p = keynlp; (p != NULL); p = p->next) {
6321 				mdname_t		*np = p->namep;
6322 				md_mn_msg_delkeyname_t	send_params;
6323 				md_mn_result_t		*resp = NULL;
6324 
6325 				send_params.delkeyname_dev = np->dev;
6326 				send_params.delkeyname_setno = sp->setno;
6327 				send_params.delkeyname_key = np->key;
6328 				(void) mdmn_send_message(sp->setno,
6329 				    MD_MN_MSG_DELKEYNAME, MD_MSGF_DEFAULT_FLAGS,
6330 				    0, (char *)&send_params,
6331 				    sizeof (send_params),
6332 				    &resp, ep);
6333 				if (resp != NULL) {
6334 					free_result(resp);
6335 				}
6336 			}
6337 		} else {
6338 			(void) del_key_names(sp, keynlp, NULL);
6339 		}
6340 	}
6341 
6342 	metafreenamelist(keynlp);
6343 
6344 	if (err)
6345 		return (mdmderror(ep, MDE_RECOVER_FAILED, 0, compnp->cname));
6346 
6347 	if (not_recovered)
6348 		if (options & MDCMD_PRINT)
6349 			(void) printf(dgettext(TEXT_DOMAIN, "%s: "
6350 			    "Soft Partitions NOT recovered from device.\n"),
6351 			    compnp->cname);
6352 	return (0);
6353 }
6354 
6355 /*
6356  * FUNCTION:	meta_sp_recover_from_unit()
6357  * INPUT:	sp	- name of set we are recovering in
6358  *		compnp	- name of component we are recovering from
6359  *		options	- metarecover options
6360  * OUTPUT:	ep	- return error pointer
6361  * RETURNS:	int	- 0 - success, -1 - error
6362  * PURPOSE:	update watermarks to match metadb records.  begin by getting
6363  *		a namelist representing all soft partitions on the specified
6364  *		component.  then, build an extlist representing the soft
6365  *		partitions, filling in the freespace extents.  notify user
6366  *		of changes, place all soft partitions into the "recovering"
6367  *		state and update the watermarks.  finally, return all soft
6368  *		partitions to the "OK" state.
6369  */
6370 static int
6371 meta_sp_recover_from_unit(
6372 	mdsetname_t	*sp,
6373 	mdname_t	*compnp,
6374 	mdcmdopts_t	options,
6375 	md_error_t	*ep
6376 )
6377 {
6378 	mdnamelist_t	*spnlp = NULL;
6379 	mdnamelist_t	*nlp = NULL;
6380 	sp_ext_node_t	*ext = NULL;
6381 	sp_ext_node_t	*extlist = NULL;
6382 	int		count;
6383 	char		yesno[255];
6384 	char		*yes;
6385 	int		rval = 0;
6386 	minor_t		*minors = NULL;
6387 	int		i;
6388 	md_sp_t		*msp;
6389 	md_set_desc	*sd;
6390 	bool_t		mn_set = 0;
6391 	daddr_t		start_block;
6392 
6393 	count = meta_sp_get_by_component(sp, compnp, &spnlp, 0, ep);
6394 	if (count <= 0)
6395 		return (-1);
6396 
6397 	/* set flag if dealing with a MN set */
6398 	if (!metaislocalset(sp)) {
6399 		if ((sd = metaget_setdesc(sp, ep)) == NULL) {
6400 			return (-1);
6401 		}
6402 		if (MD_MNSET_DESC(sd))
6403 			mn_set = 1;
6404 	}
6405 	/*
6406 	 * Save the XDR unit structure for one of the soft partitions;
6407 	 * we'll use this later to provide metadevice context to
6408 	 * update the watermarks so the device can be resolved by
6409 	 * devid instead of dev_t.
6410 	 */
6411 	if ((msp = meta_get_sp(sp, spnlp->namep, ep)) == NULL) {
6412 		metafreenamelist(spnlp);
6413 		return (-1);
6414 	}
6415 
6416 	if ((start_block = meta_sp_get_start(sp, compnp, ep)) ==
6417 	    MD_DISKADDR_ERROR) {
6418 		return (-1);
6419 	}
6420 
6421 	meta_sp_list_insert(NULL, NULL, &extlist, 0ULL, start_block,
6422 	    EXTTYP_RESERVED, 0, 0, meta_sp_cmp_by_offset);
6423 	meta_sp_list_insert(NULL, NULL, &extlist,
6424 	    metagetsize(compnp, ep) - MD_SP_WMSIZE, MD_SP_WMSIZE,
6425 	    EXTTYP_END, 0, EXTFLG_UPDATE, meta_sp_cmp_by_offset);
6426 
6427 	if (meta_sp_extlist_from_namelist(sp, spnlp, &extlist, ep) == -1) {
6428 		metafreenamelist(spnlp);
6429 		return (-1);
6430 	}
6431 
6432 	assert(extlist != NULL);
6433 	if ((options & MDCMD_VERBOSE) != 0) {
6434 		(void) printf(dgettext(TEXT_DOMAIN,
6435 		    "Updating extent headers on device %s from metadb.\n\n"),
6436 		    compnp->cname);
6437 		(void) printf(dgettext(TEXT_DOMAIN,
6438 		    "The following extent headers will be written:\n"));
6439 		meta_sp_display_exthdr();
6440 	}
6441 
6442 	meta_sp_list_freefill(&extlist, metagetsize(compnp, ep));
6443 
6444 	for (ext = extlist; ext != NULL; ext = ext->ext_next) {
6445 
6446 		/* mark every node for updating except the reserved space */
6447 		if (ext->ext_type != EXTTYP_RESERVED) {
6448 			ext->ext_flags |= EXTFLG_UPDATE;
6449 
6450 			/* print extent information */
6451 			if ((options & MDCMD_VERBOSE) != 0)
6452 				meta_sp_display_ext(ext);
6453 		}
6454 	}
6455 
6456 	/* request verification and then update all watermarks */
6457 	if ((options & MDCMD_DOIT) != 0) {
6458 
6459 		(void) printf(dgettext(TEXT_DOMAIN,
6460 		    "\nWARNING: You are about to overwrite portions of %s\n"
6461 		    "with soft partition metadata. The extent headers will be\n"
6462 		    "written to match the existing metadb configuration.  If\n"
6463 		    "the device was not previously setup with this\n"
6464 		    "configuration, data loss may result.\n\n"),
6465 		    compnp->cname);
6466 		(void) printf(dgettext(TEXT_DOMAIN,
6467 		    "Are you sure you want to do this (yes/no)? "));
6468 
6469 		(void) fflush(stdout);
6470 		if ((fgets(yesno, sizeof (yesno), stdin) == NULL) ||
6471 		    (strlen(yesno) == 1))
6472 			(void) snprintf(yesno, sizeof (yesno),
6473 			    "%s\n", dgettext(TEXT_DOMAIN, "no"));
6474 		yes = dgettext(TEXT_DOMAIN, "yes");
6475 		if (strncasecmp(yesno, yes, strlen(yesno) - 1) == 0) {
6476 			/* place soft partitions into recovering state */
6477 			minors = Zalloc(count * sizeof (minor_t));
6478 			for (nlp = spnlp, i = 0;
6479 			    nlp != NULL && i < count;
6480 			    nlp = nlp->next, i++) {
6481 				assert(nlp->namep != NULL);
6482 				minors[i] = meta_getminor(nlp->namep->dev);
6483 			}
6484 			if (update_sp_status(sp, minors, count,
6485 			    MD_SP_RECOVER, mn_set, ep) != 0) {
6486 				rval = -1;
6487 				goto out;
6488 			}
6489 
6490 			/* update the watermarks */
6491 			if (meta_sp_update_wm(sp, msp, extlist, ep) < 0) {
6492 				rval = -1;
6493 				goto out;
6494 			}
6495 
6496 			if (options & MDCMD_PRINT) {
6497 				(void) printf(dgettext(TEXT_DOMAIN, "%s: "
6498 				    "Soft Partitions recovered from metadb\n"),
6499 				    compnp->cname);
6500 			}
6501 
6502 			/* return soft partitions to the OK state */
6503 			if (update_sp_status(sp, minors, count,
6504 			    MD_SP_OK, mn_set, ep) != 0) {
6505 				rval = -1;
6506 				goto out;
6507 			}
6508 
6509 			rval = 0;
6510 			goto out;
6511 		}
6512 	}
6513 
6514 	if (options & MDCMD_PRINT) {
6515 		(void) printf(dgettext(TEXT_DOMAIN,
6516 		    "%s: Soft Partitions NOT recovered from metadb\n"),
6517 		    compnp->cname);
6518 	}
6519 
6520 out:
6521 	if (minors != NULL)
6522 		Free(minors);
6523 	metafreenamelist(spnlp);
6524 	meta_sp_list_free(&extlist);
6525 	(void) fflush(stdout);
6526 	return (rval);
6527 }
6528 
6529 
6530 /*
6531  * FUNCTION:	meta_sp_update_abr()
6532  * INPUT:	sp	- name of set we are recovering in
6533  * OUTPUT:	ep	- return error pointer
6534  * RETURNS:	int	- 0 - success, -1 - error
6535  * PURPOSE:	update the ABR state for all soft partitions in the set. This
6536  *		is called when joining a set. It sends a message to the master
6537  *		node for each soft partition to get the value of tstate and
6538  *		then sets ABR ,if required, by opening the sp, setting ABR
6539  *		and then closing the sp. This approach is taken rather that
6540  *		just issuing the MD_MN_SET_CAP ioctl, in order to deal with
6541  *		the case when we have another node simultaneously unsetting ABR.
6542  */
6543 int
6544 meta_sp_update_abr(
6545 	mdsetname_t	*sp,
6546 	md_error_t	*ep
6547 )
6548 {
6549 	mdnamelist_t	*devnlp = NULL;
6550 	mdnamelist_t	*p;
6551 	mdname_t	*devnp = NULL;
6552 	md_unit_t	*un;
6553 	char		fname[MAXPATHLEN];
6554 	int		mnum, fd;
6555 	volcap_t	vc;
6556 	uint_t		tstate;
6557 
6558 
6559 	if (meta_get_sp_names(sp, &devnlp, 0, ep) < 0) {
6560 		return (-1);
6561 	}
6562 
6563 	/* Exit if no soft partitions in this set */
6564 	if (devnlp == NULL)
6565 		return (0);
6566 
6567 	/* For each soft partition */
6568 	for (p = devnlp; (p != NULL); p = p->next) {
6569 		devnp = p->namep;
6570 
6571 		/* check if this is a top level metadevice */
6572 		if ((un = meta_get_mdunit(sp, devnp, ep)) == NULL)
6573 			goto out;
6574 		if (MD_HAS_PARENT(MD_PARENT(un))) {
6575 			Free(un);
6576 			continue;
6577 		}
6578 		Free(un);
6579 
6580 		/* Get tstate from Master */
6581 		if (meta_mn_send_get_tstate(devnp->dev, &tstate, ep) != 0) {
6582 			mdname_t	*np;
6583 			np = metamnumname(&sp, meta_getminor(devnp->dev), 0,
6584 			    ep);
6585 			if (np) {
6586 				md_perror(dgettext(TEXT_DOMAIN,
6587 				    "Unable to get tstate for %s"), np->cname);
6588 			}
6589 			continue;
6590 		}
6591 		/* If not set on the master, nothing to do */
6592 		if (!(tstate & MD_ABR_CAP))
6593 			continue;
6594 
6595 		mnum = meta_getminor(devnp->dev);
6596 		(void) snprintf(fname, MAXPATHLEN, "/dev/md/%s/rdsk/d%u",
6597 		    sp->setname, (unsigned)MD_MIN2UNIT(mnum));
6598 		if ((fd = open(fname, O_RDWR, 0)) < 0) {
6599 			md_perror(dgettext(TEXT_DOMAIN,
6600 			    "Could not open device %s"), fname);
6601 			continue;
6602 		}
6603 
6604 		/* Set ABR state */
6605 		vc.vc_info = 0;
6606 		vc.vc_set = 0;
6607 		if (ioctl(fd, DKIOCGETVOLCAP, &vc) < 0) {
6608 			(void) close(fd);
6609 			continue;
6610 		}
6611 
6612 		vc.vc_set = DKV_ABR_CAP;
6613 		if (ioctl(fd, DKIOCSETVOLCAP, &vc) < 0) {
6614 			(void) close(fd);
6615 			goto out;
6616 		}
6617 
6618 		(void) close(fd);
6619 	}
6620 	metafreenamelist(devnlp);
6621 	return (0);
6622 out:
6623 	metafreenamelist(devnlp);
6624 	return (-1);
6625 }
6626 
6627 /*
6628  * FUNCTION:	meta_mn_sp_update_abr()
6629  * INPUT:	arg	- Given set.
6630  * PURPOSE:	update the ABR state for all soft partitions in the set by
6631  *		forking a process to call meta_sp_update_abr()
6632  *		This function is only called via rpc.metad when adding a node
6633  *		to a set, ie this node is beong joined to the set by another
6634  *		node.
6635  */
6636 void *
6637 meta_mn_sp_update_abr(void *arg)
6638 {
6639 	set_t		setno = *((set_t *)arg);
6640 	mdsetname_t	*sp;
6641 	md_error_t	mde = mdnullerror;
6642 	int		fval;
6643 
6644 	/* should have a set */
6645 	assert(setno != NULL);
6646 
6647 	if ((sp = metasetnosetname(setno, &mde)) == NULL) {
6648 		mde_perror(&mde, "");
6649 		return (NULL);
6650 	}
6651 
6652 	if (!(meta_is_mn_set(sp, &mde))) {
6653 		mde_perror(&mde, "");
6654 		return (NULL);
6655 	}
6656 
6657 	/* fork a process */
6658 	if ((fval = md_daemonize(sp, &mde)) != 0) {
6659 		/*
6660 		 * md_daemonize will fork off a process.  The is the
6661 		 * parent or error.
6662 		 */
6663 		if (fval > 0) {
6664 			return (NULL);
6665 		}
6666 		mde_perror(&mde, "");
6667 		return (NULL);
6668 	}
6669 	/*
6670 	 * Child process should never return back to rpc.metad, but
6671 	 * should exit.
6672 	 * Flush all internally cached data inherited from parent process
6673 	 * since cached data will be cleared when parent process RPC request
6674 	 * has completed (which is possibly before this child process
6675 	 * can complete).
6676 	 * Child process can retrieve and cache its own copy of data from
6677 	 * rpc.metad that won't be changed by the parent process.
6678 	 *
6679 	 * Reset md_in_daemon since this child will be a client of rpc.metad
6680 	 * not part of the rpc.metad daemon itself.
6681 	 * md_in_daemon is used by rpc.metad so that libmeta can tell if
6682 	 * this thread is rpc.metad or any other thread.  (If this thread
6683 	 * was rpc.metad it could use some short circuit code to get data
6684 	 * directly from rpc.metad instead of doing an RPC call to rpc.metad).
6685 	 */
6686 	md_in_daemon = 0;
6687 	metaflushsetname(sp);
6688 	sr_cache_flush_setno(setno);
6689 	if ((sp = metasetnosetname(setno, &mde)) == NULL) {
6690 		mde_perror(&mde, "");
6691 		md_exit(sp, 1);
6692 	}
6693 
6694 
6695 	/*
6696 	 * Closing stdin/out/err here.
6697 	 */
6698 	(void) close(0);
6699 	(void) close(1);
6700 	(void) close(2);
6701 	assert(fval == 0);
6702 
6703 	(void) meta_sp_update_abr(sp, &mde);
6704 
6705 	md_exit(sp, 0);
6706 	/*NOTREACHED*/
6707 	return (NULL);
6708 }
6709 
6710 int
6711 meta_sp_check_component(
6712 	mdsetname_t	*sp,
6713 	mdname_t	*np,
6714 	md_error_t	*ep
6715 )
6716 {
6717 	md_sp_t	*msp;
6718 	minor_t	mnum = 0;
6719 	md_dev64_t	dev = 0;
6720 	mdnm_params_t	nm;
6721 	md_getdevs_params_t	mgd;
6722 	side_t	sideno;
6723 	char	*miscname;
6724 	md_dev64_t	*mydev = NULL;
6725 	char	*pname = NULL, *t;
6726 	char	*ctd_name = NULL;
6727 	char	*devname = NULL;
6728 	int	len;
6729 	int	rval = -1;
6730 
6731 	(void) memset(&nm, '\0', sizeof (nm));
6732 	if ((msp = meta_get_sp_common(sp, np, 0, ep)) == NULL)
6733 		return (-1);
6734 
6735 	if ((miscname = metagetmiscname(np, ep)) == NULL)
6736 		return (-1);
6737 
6738 	sideno = getmyside(sp, ep);
6739 
6740 	meta_sp_debug("meta_sp_check_component: %s is on %s key: %d"
6741 	    " dev: %llu\n",
6742 	    np->cname, msp->compnamep->cname, msp->compnamep->key,
6743 	    msp->compnamep->dev);
6744 
6745 	/*
6746 	 * Now get the data from the unit structure. The compnamep stuff
6747 	 * contains the data from the namespace and we need the un_dev
6748 	 * from the unit structure.
6749 	 */
6750 	(void) memset(&mgd, '\0', sizeof (mgd));
6751 	MD_SETDRIVERNAME(&mgd, miscname, sp->setno);
6752 	mgd.cnt = 1;		    /* sp's only have one subdevice */
6753 	mgd.mnum = meta_getminor(np->dev);
6754 
6755 	mydev = Zalloc(sizeof (*mydev));
6756 	mgd.devs = (uintptr_t)mydev;
6757 
6758 	if (metaioctl(MD_IOCGET_DEVS, &mgd, &mgd.mde, np->cname) != 0) {
6759 		meta_sp_debug("meta_sp_check_component: ioctl failed\n");
6760 		(void) mdstealerror(ep, &mgd.mde);
6761 		rval = 0;
6762 		goto out;
6763 	} else if (mgd.cnt <= 0) {
6764 		assert(mgd.cnt >= 0);
6765 		rval = 0;
6766 		goto out;
6767 	}
6768 
6769 	/* Get the devname from the name space. */
6770 	if ((devname = meta_getnmentbykey(sp->setno, sideno,
6771 	    msp->compnamep->key, NULL, &mnum, &dev, ep)) == NULL) {
6772 		meta_sp_debug("meta_sp_check_component: key %d not"
6773 		    "found\n", msp->compnamep->key);
6774 		goto out;
6775 	}
6776 
6777 	meta_sp_debug("dev %s from component: (%lu, %lu)\n",
6778 	    devname,
6779 	    meta_getmajor(*mydev),
6780 	    meta_getminor(*mydev));
6781 	meta_sp_debug("minor from the namespace: %lu\n", mnum);
6782 
6783 	if (mnum != meta_getminor(*mydev)) {
6784 		/*
6785 		 * The minor numbers are different. Update the namespace
6786 		 * with the information from the component.
6787 		 */
6788 
6789 		t = strrchr(devname, '/');
6790 		t++;
6791 		ctd_name = Strdup(t);
6792 
6793 		meta_sp_debug("meta_sp_check_component: ctd_name: %s\n",
6794 		    ctd_name);
6795 
6796 		len = strlen(devname);
6797 		t = strrchr(devname, '/');
6798 		t++;
6799 		pname = Zalloc((len - strlen(t)) + 1);
6800 		(void) strncpy(pname, devname, (len - strlen(t)));
6801 		meta_sp_debug("pathname: %s\n", pname);
6802 
6803 		meta_sp_debug("updating the minor number to %lu\n", nm.mnum);
6804 
6805 		if (meta_update_namespace(sp->setno, sideno,
6806 		    ctd_name, *mydev, msp->compnamep->key, pname,
6807 		    ep) != 0) {
6808 			goto out;
6809 		}
6810 	}
6811 out:
6812 	if (pname != NULL)
6813 		Free(pname);
6814 	if (ctd_name != NULL)
6815 		Free(ctd_name);
6816 	if (devname != NULL)
6817 		Free(devname);
6818 	if (mydev != NULL)
6819 		Free(mydev);
6820 	return (rval);
6821 }
6822