xref: /titanic_41/usr/src/lib/lvm/libmeta/common/meta_sp.c (revision c1591d2226910ad10594ffb2fa2f1db887f35afb)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  * Just in case we're not in a build environment, make sure that
28  * TEXT_DOMAIN gets set to something.
29  */
30 #if !defined(TEXT_DOMAIN)
31 #define	TEXT_DOMAIN "SYS_TEST"
32 #endif
33 
34 /*
35  * soft partition operations
36  *
37  * Soft Partitions provide a virtual disk mechanism which is used to
38  * divide a large volume into many small pieces, each appearing as a
39  * separate device.  A soft partition consists of a series of extents,
40  * each having an offset and a length.  The extents are logically
41  * contiguous, so where the first extent leaves off the second extent
42  * picks up.  Which extent a given "virtual offset" belongs to is
43  * dependent on the size of all the previous extents in the soft
44  * partition.
45  *
46  * Soft partitions are represented in memory by an extent node
47  * (sp_ext_node_t) which contains all of the information necessary to
48  * create a unit structure and update the on-disk format, called
49  * "watermarks".  These extent nodes are typically kept in a doubly
50  * linked list and are manipulated by list manipulation routines.  A
51  * list of extents may represent all of the soft partitions on a volume,
52  * a single soft partition, or perhaps just a set of extents that need
53  * to be updated.  Extent lists may be sorted by offset or by name/seq#,
54  * depending on which compare function is used.  Most of the routines
55  * require the list be sorted by offset to work, and that's the typical
56  * configuration.
57  *
58  * In order to do an allocation, knowledge of all soft partitions on the
59  * volume is required.  Then free space is determined from the space
60  * that is not allocated, and new allocations can be made from the free
61  * space.  Once the new allocations are made, a unit structure is created
62  * and the watermarks are updated.  The status is then changed to "okay"
63  * on the unit structure to commit the transaction.  If updating the
64  * watermarks fails, the unit structure is in an intermediate state and
65  * the driver will not allow access to the device.
66  *
67  * A typical sequence of events is:
68  *     1. Fetch the list of names for all soft partitions on a volume
69  *         meta_sp_get_by_component()
70  *     2. Construct an extent list from the name list
71  *         meta_sp_extlist_from_namelist()
72  *     3. Fill the gaps in the extent list with free extents
73  *         meta_sp_list_freefill()
74  *     4. Allocate from the free extents
75  *         meta_sp_alloc_by_len()
76  *         meta_sp_alloc_by_list()
77  *     5. Create the unit structure from the extent list
78  *         meta_sp_createunit()
79  *         meta_sp_updateunit()
80  *     6. Write out the watermarks
81  *         meta_sp_update_wm()
82  *     7. Set the status to "Okay"
83  *         meta_sp_setstatus()
84  *
85  */
86 
87 #include <stdio.h>
88 #include <meta.h>
89 #include "meta_repartition.h"
90 #include <sys/lvm/md_sp.h>
91 #include <sys/lvm/md_crc.h>
92 #include <strings.h>
93 #include <sys/lvm/md_mirror.h>
94 #include <sys/bitmap.h>
95 
96 extern int	md_in_daemon;
97 
/*
 * In-memory description of a single extent.  Nodes are chained through
 * ext_next/ext_prev into a doubly linked list whose order is decided by
 * the compare function handed to meta_sp_list_insert().
 */
typedef struct sp_ext_node {
	struct sp_ext_node	*ext_next;	/* next element */
	struct sp_ext_node	*ext_prev;	/* previous element */
	sp_ext_type_t		ext_type;	/* type of extent */
	sp_ext_offset_t		ext_offset;	/* starting offset (sectors) */
	sp_ext_length_t		ext_length;	/* length of this node (sectors) */
	uint_t			ext_flags;	/* extent flags (EXTFLG_*) */
	uint32_t		ext_seq;	/* watermark seq no */
	mdname_t		*ext_namep;	/* name of owning device */
	mdsetname_t		*ext_setp;	/* set the device belongs to */
} sp_ext_node_t;
109 
/*
 * extent flags (kept in sp_ext_node_t.ext_flags)
 * EXTFLG_UPDATE presumably marks an extent that still needs to be
 * updated; its consumers are outside this part of the file.
 */
#define	EXTFLG_UPDATE	(1)

/*
 * Extent node compare function for list sorting.  Following the two
 * implementations in this file, the result is <0 when the first node
 * sorts before the second, >0 when it sorts after, and 0 when equal.
 */
typedef int (*ext_cmpfunc_t)(sp_ext_node_t *, sp_ext_node_t *);
115 
116 
/*
 * Function Prototypes
 *
 * Forward declarations for the routines in this file, grouped by the
 * same categories used for the section banners further down.
 */

/* Debugging Functions */
static void meta_sp_debug(char *format, ...);
static void meta_sp_printunit(mp_unit_t *mp);

/* Misc Support Functions */
int meta_sp_parsesize(char *s, sp_ext_length_t *szp);
static int meta_sp_parsesizestring(char *s, sp_ext_length_t *szp);
static int meta_sp_setgeom(mdname_t *np, mdname_t *compnp, mp_unit_t *mp,
	md_error_t *ep);
static int meta_sp_get_by_component(mdsetname_t *sp, mdname_t *compnp,
    mdnamelist_t **nlpp, int force, md_error_t *ep);
static sp_ext_length_t meta_sp_get_default_alignment(mdsetname_t *sp,
    mdname_t *compnp, md_error_t *ep);

/* Extent List Manipulation Functions */
static int meta_sp_cmp_by_nameseq(sp_ext_node_t *e1, sp_ext_node_t *e2);
static int meta_sp_cmp_by_offset(sp_ext_node_t *e1, sp_ext_node_t *e2);
static void meta_sp_list_insert(mdsetname_t *sp, mdname_t *np,
    sp_ext_node_t **head, sp_ext_offset_t offset, sp_ext_length_t length,
    sp_ext_type_t type, uint_t seq, uint_t flags, ext_cmpfunc_t compare);
static void meta_sp_list_free(sp_ext_node_t **head);
static void meta_sp_list_remove(sp_ext_node_t **head, sp_ext_node_t *ext);
static sp_ext_length_t meta_sp_list_size(sp_ext_node_t *head,
    sp_ext_type_t exttype, int exclude_wm);
static sp_ext_node_t *meta_sp_list_find(sp_ext_node_t *head,
    sp_ext_offset_t offset);
static void meta_sp_list_freefill(sp_ext_node_t **extlist,
    sp_ext_length_t size);
static void meta_sp_list_dump(sp_ext_node_t *head);
static int meta_sp_list_overlaps(sp_ext_node_t *head);

/* Extent List Query Functions */
static boolean_t meta_sp_enough_space(int desired_number_of_sps,
	blkcnt_t desired_sp_size, sp_ext_node_t **extent_listpp,
	sp_ext_length_t alignment);
static boolean_t meta_sp_get_extent_list(mdsetname_t *mdsetnamep,
	mdname_t *device_mdnamep, sp_ext_node_t **extent_listpp,
	md_error_t *ep);
static boolean_t meta_sp_get_extent_list_for_drive(mdsetname_t *mdsetnamep,
	mddrivename_t *mddrivenamep, sp_ext_node_t **extent_listpp);


/* Extent Allocation Functions */
static void meta_sp_alloc_by_ext(mdsetname_t *sp, mdname_t *np,
    sp_ext_node_t **extlist, sp_ext_node_t *free_ext,
    sp_ext_offset_t alloc_offset, sp_ext_length_t alloc_length, uint_t seq);
static int meta_sp_alloc_by_len(mdsetname_t *sp, mdname_t *np,
    sp_ext_node_t **extlist, sp_ext_length_t *lp,
    sp_ext_offset_t last_off, sp_ext_length_t alignment);
static int meta_sp_alloc_by_list(mdsetname_t *sp, mdname_t *np,
    sp_ext_node_t **extlist, sp_ext_node_t *oblist);

/* Extent List Population Functions */
static int meta_sp_extlist_from_namelist(mdsetname_t *sp, mdnamelist_t *spnlp,
    sp_ext_node_t **extlist, md_error_t *ep);
static int meta_sp_extlist_from_wm(mdsetname_t *sp, mdname_t *compnp,
    sp_ext_node_t **extlist, ext_cmpfunc_t compare, md_error_t *ep);

/* Print (metastat) Functions */
static int meta_sp_short_print(md_sp_t *msp, char *fname, FILE *fp,
    mdprtopts_t options, md_error_t *ep);
static char *meta_sp_status_to_name(xsp_status_t xsp_status, uint_t tstate);
static int meta_sp_report(mdsetname_t *sp, md_sp_t *msp, mdnamelist_t **nlpp,
    char *fname, FILE *fp, mdprtopts_t options, md_error_t *ep);

/* Watermark Manipulation Functions */
static int meta_sp_update_wm(mdsetname_t *sp, md_sp_t *msp,
    sp_ext_node_t *extlist, md_error_t *ep);
static int meta_sp_clear_wm(mdsetname_t *sp, md_sp_t *msp, md_error_t *ep);
static int meta_sp_read_wm(mdsetname_t *sp, mdname_t *compnp,
    mp_watermark_t *wm, sp_ext_offset_t offset,  md_error_t *ep);
static diskaddr_t meta_sp_get_start(mdsetname_t *sp, mdname_t *compnp,
    md_error_t *ep);

/* Unit Structure Manipulation Functions */
static void meta_sp_fillextarray(mp_unit_t *mp, sp_ext_node_t *extlist);
static mp_unit_t *meta_sp_createunit(mdname_t *np, mdname_t *compnp,
    sp_ext_node_t *extlist, int numexts, sp_ext_length_t len,
    sp_status_t status, md_error_t *ep);
static mp_unit_t *meta_sp_updateunit(mdname_t *np,  mp_unit_t *old_un,
    sp_ext_node_t *extlist, sp_ext_length_t grow_len, int numexts,
    md_error_t *ep);
static int meta_create_sp(mdsetname_t *sp, md_sp_t *msp, sp_ext_node_t *oblist,
    mdcmdopts_t options, sp_ext_length_t alignment, md_error_t *ep);
static int meta_check_sp(mdsetname_t *sp, md_sp_t *msp, mdcmdopts_t options,
    int *repart_options, md_error_t *ep);

/* Reset (metaclear) Functions */
static int meta_sp_reset_common(mdsetname_t *sp, mdname_t *np, md_sp_t *msp,
    md_sp_reset_t reset_params, mdcmdopts_t options, md_error_t *ep);

/* Recovery (metarecover) Functions */
static void meta_sp_display_exthdr(void);
static void meta_sp_display_ext(sp_ext_node_t *ext);
static int meta_sp_checkseq(sp_ext_node_t *extlist);
static int meta_sp_resolve_name_conflict(mdsetname_t *, mdname_t *,
    mdname_t **, md_error_t *);
static int meta_sp_validate_wm(mdsetname_t *sp, mdname_t *np,
    mdcmdopts_t options, md_error_t *ep);
static int meta_sp_validate_unit(mdsetname_t *sp, mdname_t *compnp,
    mdcmdopts_t options, md_error_t *ep);
static int meta_sp_validate_wm_and_unit(mdsetname_t *sp, mdname_t *np,
    mdcmdopts_t options, md_error_t *ep);
static int meta_sp_validate_exts(mdname_t *np, sp_ext_node_t *wmext,
    sp_ext_node_t *unitext, md_error_t *ep);
static int meta_sp_recover_from_wm(mdsetname_t *sp, mdname_t *compnp,
    mdcmdopts_t options, md_error_t *ep);
static int meta_sp_recover_from_unit(mdsetname_t *sp, mdname_t *np,
    mdcmdopts_t options, md_error_t *ep);
228 
/*
 * Private Constants
 *
 * Named stand-ins for literal argument values.  Most of their users lie
 * outside this part of the file, so the notes below are limited to what
 * the names and the visible prototypes establish.
 */

/* "force" value asking meta_sp_get_by_component() to rebuild its cache */
static const int FORCE_RELOAD_CACHE = 1;
/* presumably "nothing to pass" values for flags/offset/seq arguments */
static const uint_t NO_FLAGS = 0;
static const sp_ext_offset_t NO_OFFSET = 0ULL;
static const uint_t NO_SEQUENCE_NUMBER = 0;
/* presumably a count argument of exactly one soft partition */
static const int ONE_SOFT_PARTITION = 1;

/*
 * Bitmap with room for one bit per unit (MD_MAXUNITS bits).
 * NOTE(review): judging by the name it records which parent devices
 * have already been printed -- confirm against the print routines.
 */
static unsigned long sp_parent_printed[BT_BITOUL(MD_MAXUNITS)];

/* NULL placeholders; presumably used when only testing for space */
#define	TEST_SOFT_PARTITION_NAMEP NULL
#define	TEST_SETNAMEP NULL

/* values for an "exclude_wm" argument (see meta_sp_list_size() prototype) */
#define	EXCLUDE_WM	(1)
#define	INCLUDE_WM	(0)

/* alignment value meaning "no alignment requirement" */
#define	SP_UNALIGNED	(0LL)
248 
249 /*
250  * **************************************************************************
251  *                          Debugging Functions                             *
252  * **************************************************************************
253  */
254 
255 /*PRINTFLIKE1*/
256 static void
257 meta_sp_debug(char *format, ...)
258 {
259 	static int debug;
260 	static int debug_set = 0;
261 	va_list ap;
262 
263 	if (!debug_set) {
264 		debug = getenv(META_SP_DEBUG) ? 1 : 0;
265 		debug_set = 1;
266 	}
267 
268 	if (debug) {
269 		va_start(ap, format);
270 		(void) vfprintf(stderr, format, ap);
271 		va_end(ap);
272 	}
273 }
274 
275 static void
276 meta_sp_printunit(mp_unit_t *mp)
277 {
278 	int i;
279 
280 	if (mp == NULL)
281 		return;
282 
283 	/* print the common fields we know about */
284 	(void) fprintf(stderr, "\tmp->c.un_type: %d\n", mp->c.un_type);
285 	(void) fprintf(stderr, "\tmp->c.un_size: %u\n", mp->c.un_size);
286 	(void) fprintf(stderr, "\tmp->c.un_self_id: %lu\n", MD_SID(mp));
287 
288 	/* sp-specific fields */
289 	(void) fprintf(stderr, "\tmp->un_status: %u\n", mp->un_status);
290 	(void) fprintf(stderr, "\tmp->un_numexts: %u\n", mp->un_numexts);
291 	(void) fprintf(stderr, "\tmp->un_length: %llu\n", mp->un_length);
292 	(void) fprintf(stderr, "\tmp->un_dev(32): 0x%llx\n", mp->un_dev);
293 	(void) fprintf(stderr, "\tmp->un_dev(64): 0x%llx\n", mp->un_dev);
294 	(void) fprintf(stderr, "\tmp->un_key: %d\n", mp->un_key);
295 
296 	/* print extent information */
297 	(void) fprintf(stderr, "\tExt#\tvoff\t\tpoff\t\tLen\n");
298 	for (i = 0; i < mp->un_numexts; i++) {
299 		(void) fprintf(stderr, "\t%d\t%llu\t\t%llu\t\t%llu\n", i,
300 		    mp->un_ext[i].un_voff, mp->un_ext[i].un_poff,
301 		    mp->un_ext[i].un_len);
302 	}
303 }
304 
305 /*
306  * FUNCTION:    meta_sp_parsesize()
307  * INPUT:       s       - the string to parse
308  * OUTPUT:      *szp    - disk block count (0 for "all")
309  * RETURNS:     -1 for error, 0 for success
310  * PURPOSE:     parses the command line parameter that specifies the
311  *              requested size of a soft partition.  The input string
312  *              is either the literal "all" or a numeric value
313  *              followed by a single character, b for disk blocks, k
314  *              for kilobytes, m for megabytes, g for gigabytes, or t
315  *              for terabytes.  p for petabytes and e for exabytes
316  *              have been added as undocumented features for future
317  *              expansion.  For example, 100m is 100 megabytes, while
318  *              50g is 50 gigabytes.  All values are rounded up to the
319  *              nearest block size.
320  */
321 int
322 meta_sp_parsesize(char *s, sp_ext_length_t *szp)
323 {
324 	if (s == NULL || szp == NULL) {
325 		return (-1);
326 	}
327 
328 	/* Check for literal "all" */
329 	if (strcasecmp(s, "all") == 0) {
330 		*szp = 0;
331 		return (0);
332 	}
333 
334 	return (meta_sp_parsesizestring(s, szp));
335 }
336 
337 /*
338  * FUNCTION:	meta_sp_parsesizestring()
339  * INPUT:	s	- the string to parse
340  * OUTPUT:	*szp	- disk block count
341  * RETURNS:	-1 for error, 0 for success
342  * PURPOSE:	parses a string that specifies size. The input string is a
343  *		numeric value followed by a single character, b for disk blocks,
344  *		k for kilobytes, m for megabytes, g for gigabytes, or t for
345  *		terabytes.  p for petabytes and e for exabytes have been added
346  *		as undocumented features for future expansion.  For example,
347  *		100m is 100 megabytes, while 50g is 50 gigabytes.  All values
348  *		are rounded up to the nearest block size.
349  */
350 static int
351 meta_sp_parsesizestring(char *s, sp_ext_length_t *szp)
352 {
353 	sp_ext_length_t	len = 0;
354 	char		len_type[2];
355 
356 	if (s == NULL || szp == NULL) {
357 		return (-1);
358 	}
359 
360 	/*
361 	 * make sure block offset does not overflow 2^64 bytes.
362 	 */
363 	if ((sscanf(s, "%llu%1[BbKkMmGgTt]", &len, len_type) != 2) ||
364 	    (len == 0LL) ||
365 	    (len > (1LL << (64 - DEV_BSHIFT))))
366 		return (-1);
367 
368 	switch (len_type[0]) {
369 	case 'B':
370 	case 'b':
371 		len = lbtodb(roundup(len * DEV_BSIZE, DEV_BSIZE));
372 		break;
373 	case 'K':
374 	case 'k':
375 		len = lbtodb(roundup(len * 1024ULL, DEV_BSIZE));
376 		break;
377 	case 'M':
378 	case 'm':
379 		len = lbtodb(roundup(len * 1024ULL*1024ULL, DEV_BSIZE));
380 		break;
381 	case 'g':
382 	case 'G':
383 		len = lbtodb(roundup(len * 1024ULL*1024ULL*1024ULL, DEV_BSIZE));
384 		break;
385 	case 't':
386 	case 'T':
387 		len = lbtodb(roundup(len * 1024ULL*1024ULL*1024ULL*1024ULL,
388 		    DEV_BSIZE));
389 		break;
390 	case 'p':
391 	case 'P':
392 		len = lbtodb(roundup(
393 		    len * 1024ULL*1024ULL*1024ULL*1024ULL*1024ULL,
394 		    DEV_BSIZE));
395 		break;
396 	case 'e':
397 	case 'E':
398 		len = lbtodb(roundup(
399 		    len * 1024ULL*1024ULL*1024ULL*1024ULL*1024ULL*1024ULL,
400 		    DEV_BSIZE));
401 		break;
402 	default:
403 		/* error */
404 		return (-1);
405 	}
406 
407 	*szp = len;
408 	return (0);
409 }
410 
411 /*
412  * FUNCTION:	meta_sp_setgeom()
413  * INPUT:	np      - the underlying device to setup geometry for
414  *		compnp	- the underlying device to setup geometry for
415  *		mp	- the unit structure to set the geometry for
416  * OUTPUT:	ep	- return error pointer
417  * RETURNS:	int	- -1 if error, 0 otherwise
418  * PURPOSE:	establishes geometry information for a device
419  */
420 static int
421 meta_sp_setgeom(
422 	mdname_t	*np,
423 	mdname_t	*compnp,
424 	mp_unit_t	*mp,
425 	md_error_t	*ep
426 )
427 {
428 	mdgeom_t	*geomp;
429 	uint_t		round_cyl = 0;
430 
431 	if ((geomp = metagetgeom(compnp, ep)) == NULL)
432 		return (-1);
433 	if (meta_setup_geom((md_unit_t *)mp, np, geomp, geomp->write_reinstruct,
434 	    geomp->read_reinstruct, round_cyl, ep) != 0)
435 		return (-1);
436 
437 	return (0);
438 }
439 
440 /*
441  * FUNCTION:	meta_sp_setstatus()
442  * INPUT:	sp	- the set name for the devices to set the status on
443  *		minors	- an array of minor numbers of devices to set status on
444  *		num_units - number of entries in the array
445  *		status	- status value to set all units to
446  * OUTPUT:	ep	- return error pointer
447  * RETURNS:	int	- -1 if error, 0 success
448  * PURPOSE:	sets the status of one or more soft partitions to the
449  *		requested value
450  */
451 int
452 meta_sp_setstatus(
453 	mdsetname_t	*sp,
454 	minor_t		*minors,
455 	int		num_units,
456 	sp_status_t	status,
457 	md_error_t	*ep
458 )
459 {
460 	md_sp_statusset_t	status_params;
461 
462 	assert(minors != NULL);
463 
464 	/* update status of all soft partitions to the status passed in */
465 	(void) memset(&status_params, 0, sizeof (status_params));
466 	status_params.num_units = num_units;
467 	status_params.new_status = status;
468 	status_params.size = num_units * sizeof (minor_t);
469 	status_params.minors = (uintptr_t)minors;
470 	MD_SETDRIVERNAME(&status_params, MD_SP, sp->setno);
471 	if (metaioctl(MD_IOC_SPSTATUS, &status_params, &status_params.mde,
472 	    NULL) != 0) {
473 		(void) mdstealerror(ep, &status_params.mde);
474 		return (-1);
475 	}
476 	return (0);
477 }
478 
479 /*
480  * FUNCTION:	meta_get_sp_names()
481  * INPUT:	sp	- the set name to get soft partitions from
482  *		options	- options from the command line
483  * OUTPUT:	nlpp	- list of all soft partition names
484  *		ep	- return error pointer
485  * RETURNS:	int	- -1 if error, 0 success
486  * PURPOSE:	returns a list of all soft partitions in the metadb
487  *		for all devices in the specified set
488  */
489 int
490 meta_get_sp_names(
491 	mdsetname_t	*sp,
492 	mdnamelist_t	**nlpp,
493 	int		options,
494 	md_error_t	*ep
495 )
496 {
497 	return (meta_get_names(MD_SP, sp, nlpp, options, ep));
498 }
499 
500 /*
501  * FUNCTION:	meta_get_by_component()
502  * INPUT:	sp	- the set name to get soft partitions from
503  *		compnp	- the name of the device containing the soft
504  *			  partitions that will be returned
505  *		force	- 0 - reads cached namelist if available,
506  *			  1 - reloads cached namelist, frees old namelist
507  * OUTPUT:	nlpp	- list of all soft partition names
508  *		ep	- return error pointer
509  * RETURNS:	int	- -1 error, otherwise the number of soft partitions
510  *			  found on the component (0 = none found).
511  * PURPOSE:	returns a list of all soft partitions on a given device
512  *		from the metadb information
513  */
514 static int
515 meta_sp_get_by_component(
516 	mdsetname_t	*sp,
517 	mdname_t	*compnp,
518 	mdnamelist_t	**nlpp,
519 	int		force,
520 	md_error_t	*ep
521 )
522 {
523 	static mdnamelist_t	*cached_list = NULL;	/* cached namelist */
524 	static int		cached_count = 0;	/* cached count */
525 	mdnamelist_t		*spnlp = NULL;		/* all sp names */
526 	mdnamelist_t		*namep;			/* list iterator */
527 	mdnamelist_t		**tailpp = nlpp;	/* namelist tail */
528 	mdnamelist_t		**cachetailpp;		/* cache tail */
529 	md_sp_t			*msp;			/* unit structure */
530 	int			count = 0;		/* count of sp's */
531 	int			err;
532 	mdname_t		*curnp;
533 
534 	if ((cached_list != NULL) && (!force)) {
535 		/* return a copy of the cached list */
536 		for (namep = cached_list; namep != NULL; namep = namep->next)
537 			tailpp = meta_namelist_append_wrapper(tailpp,
538 			    namep->namep);
539 		return (cached_count);
540 	}
541 
542 	/* free the cache and reset values to zeros to prepare for a new list */
543 	metafreenamelist(cached_list);
544 	cached_count = 0;
545 	cached_list = NULL;
546 	cachetailpp = &cached_list;
547 	*nlpp = NULL;
548 
549 	/* get all the softpartitions first of all */
550 	if (meta_get_sp_names(sp, &spnlp, 0, ep) < 0)
551 		return (-1);
552 
553 	/*
554 	 * Now for each sp, see if it resides on the component we
555 	 * are interested in, if so then add it to our list
556 	 */
557 	for (namep = spnlp; namep != NULL; namep = namep->next) {
558 		curnp = namep->namep;
559 
560 		/* get the unit structure */
561 		if ((msp = meta_get_sp_common(sp, curnp, 0, ep)) == NULL)
562 			continue;
563 
564 		/*
565 		 * If the current soft partition is not on the same
566 		 * component, continue the search.  If it is on the same
567 		 * component, add it to our namelist.
568 		 */
569 		err = meta_check_samedrive(compnp, msp->compnamep, ep);
570 		if (err <= 0) {
571 			/* not on the same device, check the next one */
572 			continue;
573 		}
574 
575 		/* it's on the same drive */
576 
577 		/*
578 		 * Check for overlapping partitions if the component is not
579 		 * a metadevice.
580 		 */
581 		if (!metaismeta(msp->compnamep)) {
582 			/*
583 			 * if they're on the same drive, neither
584 			 * should be a metadevice if one isn't
585 			 */
586 			assert(!metaismeta(compnp));
587 
588 			if (meta_check_overlap(msp->compnamep->cname,
589 			    compnp, 0, -1, msp->compnamep, 0, -1, ep) == 0)
590 				continue;
591 
592 			/* in this case it's not an error for them to overlap */
593 			mdclrerror(ep);
594 		}
595 
596 		/* Component is on the same device, add to the used list */
597 		tailpp = meta_namelist_append_wrapper(tailpp, curnp);
598 		cachetailpp = meta_namelist_append_wrapper(cachetailpp,
599 		    curnp);
600 
601 		++count;
602 		++cached_count;
603 	}
604 
605 	assert(count == cached_count);
606 	return (count);
607 
608 out:
609 	metafreenamelist(*nlpp);
610 	*nlpp = NULL;
611 	return (-1);
612 }
613 
614 /*
615  * FUNCTION:    meta_sp_get_default_alignment()
616  * INPUT:       sp      - the pertinent set name
617  *              compnp  - the name of the underlying component
618  * OUTPUT:      ep      - return error pointer
619  * RETURNS:     sp_ext_length_t =0: no default alignment
620  *                              >0: default alignment
621  * PURPOSE:     returns the default alignment for soft partitions to
622  *              be built on top of the specified component or
623  *              metadevice
624  */
625 static sp_ext_length_t
626 meta_sp_get_default_alignment(
627 	mdsetname_t	*sp,
628 	mdname_t	*compnp,
629 	md_error_t	*ep
630 )
631 {
632 	sp_ext_length_t	a = SP_UNALIGNED;
633 	char		*mname;
634 
635 	assert(compnp != NULL);
636 
637 	/*
638 	 * We treat raw devices as opaque, and assume nothing about
639 	 * their alignment requirements.
640 	 */
641 	if (!metaismeta(compnp))
642 		return (SP_UNALIGNED);
643 
644 	/*
645 	 * We already know it's a metadevice from the previous test;
646 	 * metagetmiscname() will tell us which metadevice type we
647 	 * have
648 	 */
649 	mname = metagetmiscname(compnp, ep);
650 	if (mname == NULL)
651 		goto out;
652 
653 	/*
654 	 * For a mirror, we want to deal with the stripe that is the
655 	 * primary side.  If it happens to be asymmetrically
656 	 * configured, there is no simple way to fake a universal
657 	 * alignment.  There's a chance that the least common
658 	 * denominator of the set of interlaces from all stripes of
659 	 * all submirrors would do it, but nobody that really cared
660 	 * that much about this issue would create an asymmetric
661 	 * config to start with.
662 	 *
663 	 * If the component underlying the soft partition is a mirror,
664 	 * then at the exit of this loop, compnp will have been
665 	 * updated to describe the first active submirror.
666 	 */
667 	if (strcmp(mname, MD_MIRROR) == 0) {
668 		md_mirror_t	*mp;
669 		int		smi;
670 		md_submirror_t	*smp;
671 
672 		mp = meta_get_mirror(sp, compnp, ep);
673 		if (mp == NULL)
674 			goto out;
675 
676 		for (smi = 0; smi < NMIRROR; smi++) {
677 
678 			smp = &mp->submirrors[smi];
679 			if (smp->state == SMS_UNUSED)
680 				continue;
681 
682 			compnp = smp->submirnamep;
683 			assert(compnp != NULL);
684 
685 			mname = metagetmiscname(compnp, ep);
686 			if (mname == NULL)
687 				goto out;
688 
689 			break;
690 		}
691 
692 		if (smi == NMIRROR)
693 			goto out;
694 	}
695 
696 	/*
697 	 * Handle stripes and submirrors identically; just return the
698 	 * interlace of the first row.
699 	 */
700 	if (strcmp(mname, MD_STRIPE) == 0) {
701 		md_stripe_t	*stp;
702 
703 		stp = meta_get_stripe(sp, compnp, ep);
704 		if (stp == NULL)
705 			goto out;
706 
707 		a = stp->rows.rows_val[0].interlace;
708 		goto out;
709 	}
710 
711 	/*
712 	 * Raid is even more straightforward; the interlace applies to
713 	 * the entire device.
714 	 */
715 	if (strcmp(mname, MD_RAID) == 0) {
716 		md_raid_t	*rp;
717 
718 		rp = meta_get_raid(sp, compnp, ep);
719 		if (rp == NULL)
720 			goto out;
721 
722 		a = rp->interlace;
723 		goto out;
724 	}
725 
726 	/*
727 	 * If we have arrived here with the alignment still not set,
728 	 * then we expect the error to have been set by one of the
729 	 * routines we called.  If neither is the case, something has
730 	 * really gone wrong above.  (Probably the submirror walk
731 	 * failed to produce a valid submirror, but that would be
732 	 * really bad...)
733 	 */
734 out:
735 	meta_sp_debug("meta_sp_get_default_alignment: miscname %s, "
736 	    "alignment %lld\n", (mname == NULL) ? "NULL" : mname, a);
737 
738 	if (getenv(META_SP_DEBUG) && !mdisok(ep)) {
739 		mde_perror(ep, NULL);
740 	}
741 
742 	assert((a > 0) || (!mdisok(ep)));
743 
744 	return (a);
745 }
746 
747 
748 
749 /*
750  * FUNCTION:	meta_check_insp()
751  * INPUT:	sp	- the set name for the device to check
752  *		np	- the name of the device to check
753  *		slblk	- the starting offset of the device to check
754  *		nblks	- the number of blocks in the device to check
755  * OUTPUT:	ep	- return error pointer
756  * RETURNS:	int	-  0 - device contains soft partitions
757  *			  -1 - device does not contain soft partitions
758  * PURPOSE:	determines whether a device contains any soft partitions
759  */
760 /* ARGSUSED */
761 int
762 meta_check_insp(
763 	mdsetname_t	*sp,
764 	mdname_t	*np,
765 	diskaddr_t	slblk,
766 	diskaddr_t	nblks,
767 	md_error_t	*ep
768 )
769 {
770 	mdnamelist_t	*spnlp = NULL;	/* soft partition name list */
771 	int		count;
772 	int		rval;
773 
774 	/* check set pointer */
775 	assert(sp != NULL);
776 
777 	/*
778 	 * Get a list of the soft partitions that currently reside on
779 	 * the component.  We should ALWAYS force reload the cache,
780 	 * because if we're using the md.tab, we must rebuild
781 	 * the list because it won't contain the previous (if any)
782 	 * soft partition.
783 	 */
784 	/* find all soft partitions on the component */
785 	count = meta_sp_get_by_component(sp, np, &spnlp, 1, ep);
786 
787 	if (count == -1) {
788 		rval = -1;
789 	} else if (count > 0) {
790 		rval = mduseerror(ep, MDE_ALREADY, np->dev,
791 		    spnlp->namep->cname, np->cname);
792 	} else {
793 		rval = 0;
794 	}
795 
796 	metafreenamelist(spnlp);
797 	return (rval);
798 }
799 
800 /*
801  * **************************************************************************
802  *                    Extent List Manipulation Functions                    *
803  * **************************************************************************
804  */
805 
806 /*
807  * FUNCTION:	meta_sp_cmp_by_nameseq()
808  * INPUT:	e1	- first node to compare
809  *		e2	- second node to compare
810  * OUTPUT:	none
811  * RETURNS:	int	- =0 - nodes are equal
812  *			  <0 - e1 should go before e2
813  *			  >0 - e1 should go after e2
814  * PURPOSE:	used for sorted list inserts to build a list sorted by
815  *		name first and sequence number second.
816  */
817 static int
818 meta_sp_cmp_by_nameseq(sp_ext_node_t *e1, sp_ext_node_t *e2)
819 {
820 	int rval;
821 
822 	if (e1->ext_namep == NULL)
823 		return (1);
824 	if (e2->ext_namep == NULL)
825 		return (-1);
826 	if ((rval = strcmp(e1->ext_namep->cname, e2->ext_namep->cname)) != 0)
827 		return (rval);
828 
829 	/* the names are equal, compare sequence numbers */
830 	if (e1->ext_seq > e2->ext_seq)
831 		return (1);
832 	if (e1->ext_seq < e2->ext_seq)
833 		return (-1);
834 	/* sequence numbers are also equal */
835 	return (0);
836 }
837 
838 /*
839  * FUNCTION:	meta_sp_cmp_by_offset()
840  * INPUT:	e1	- first node to compare
841  *		e2	- second node to compare
842  * OUTPUT:	none
843  * RETURNS:	int	- =0 - nodes are equal
844  *			  <0 - e1 should go before e2
845  *			  >0 - e1 should go after e2
846  * PURPOSE:	used for sorted list inserts to build a list sorted by offset
847  */
848 static int
849 meta_sp_cmp_by_offset(sp_ext_node_t *e1, sp_ext_node_t *e2)
850 {
851 	if (e1->ext_offset > e2->ext_offset)
852 		return (1);
853 	if (e1->ext_offset < e2->ext_offset)
854 		return (-1);
855 	/* offsets are equal */
856 	return (0);
857 }
858 
859 /*
860  * FUNCTION:	meta_sp_list_insert()
861  * INPUT:	sp	- the set name for the device the node belongs to
862  *		np	- the name of the device the node belongs to
863  *		head	- the head of the list, must be NULL for empty list
864  *		offset	- the physical offset of this extent in sectors
865  *		length	- the length of this extent in sectors
866  *		type	- the type of the extent being inserted
867  *		seq	- the sequence number of the extent being inserted
868  *		flags	- extent flags (eg. whether it needs to be updated)
869  *		compare	- the compare function to use
870  * OUTPUT:	head	- points to the new head if a node was inserted
871  *			  at the beginning
872  * RETURNS:	void
873  * PURPOSE:	inserts an extent node into a sorted doubly linked list.
874  *		The sort order is determined by the compare function.
875  *		Memory is allocated for the node in this function and it
876  *		is up to the caller to free it, possibly using
877  *		meta_sp_list_free().  If a node is inserted at the
878  *		beginning of the list, the head pointer is updated to
879  *		point to the new first node.
880  */
881 static void
882 meta_sp_list_insert(
883 	mdsetname_t	*sp,
884 	mdname_t	*np,
885 	sp_ext_node_t	**head,
886 	sp_ext_offset_t	offset,
887 	sp_ext_length_t	length,
888 	sp_ext_type_t	type,
889 	uint_t		seq,
890 	uint_t		flags,
891 	ext_cmpfunc_t	compare
892 )
893 {
894 	sp_ext_node_t	*newext;
895 	sp_ext_node_t	*curext;
896 
897 	assert(head != NULL);
898 
899 	/* Don't bother adding zero length nodes */
900 	if (length == 0ULL)
901 		return;
902 
903 	/* allocate and fill in new ext_node */
904 	newext = Zalloc(sizeof (sp_ext_node_t));
905 
906 	newext->ext_offset = offset;
907 	newext->ext_length = length;
908 	newext->ext_flags = flags;
909 	newext->ext_type = type;
910 	newext->ext_seq = seq;
911 	newext->ext_setp = sp;
912 	newext->ext_namep = np;
913 
914 	/* first node in the list */
915 	if (*head == NULL) {
916 		newext->ext_next = newext->ext_prev = NULL;
917 		*head = newext;
918 	} else if ((*compare)(*head, newext) >= 0) {
919 		/* the first node has a bigger offset, so insert before it */
920 		assert((*head)->ext_prev == NULL);
921 
922 		newext->ext_prev = NULL;
923 		newext->ext_next = *head;
924 		(*head)->ext_prev = newext;
925 		*head = newext;
926 	} else {
927 		/*
928 		 * find the next node whose offset is greater than
929 		 * the one we want to insert, or the end of the list.
930 		 */
931 		for (curext = *head;
932 		    (curext->ext_next != NULL) &&
933 		    ((*compare)(curext->ext_next, newext) < 0);
934 		    (curext = curext->ext_next))
935 			;
936 
937 		/* link the new node in after the current node */
938 		newext->ext_next = curext->ext_next;
939 		newext->ext_prev = curext;
940 
941 		if (curext->ext_next != NULL)
942 			curext->ext_next->ext_prev = newext;
943 
944 		curext->ext_next = newext;
945 	}
946 }
947 
948 /*
949  * FUNCTION:	meta_sp_list_free()
950  * INPUT:	head	- the head of the list, must be NULL for empty list
951  * OUTPUT:	head	- points to NULL on return
952  * RETURNS:	void
953  * PURPOSE:	walks a double linked extent list and frees each node
954  */
955 static void
956 meta_sp_list_free(sp_ext_node_t **head)
957 {
958 	sp_ext_node_t	*ext;
959 	sp_ext_node_t	*next;
960 
961 	assert(head != NULL);
962 
963 	ext = *head;
964 	while (ext) {
965 		next = ext->ext_next;
966 		Free(ext);
967 		ext = next;
968 	}
969 	*head = NULL;
970 }
971 
972 /*
973  * FUNCTION:	meta_sp_list_remove()
974  * INPUT:	head	- the head of the list, must be NULL for empty list
975  *		ext	- the extent to remove, must be a member of the list
976  * OUTPUT:	head	- points to the new head of the list
977  * RETURNS:	void
978  * PURPOSE:	unlinks the node specified by ext from the list and
979  *		frees it, possibly moving the head pointer forward if
980  *		the head is the node being removed.
981  */
982 static void
983 meta_sp_list_remove(sp_ext_node_t **head, sp_ext_node_t *ext)
984 {
985 	assert(head != NULL);
986 	assert(*head != NULL);
987 
988 	if (*head == ext)
989 		*head = ext->ext_next;
990 
991 	if (ext->ext_prev != NULL)
992 		ext->ext_prev->ext_next = ext->ext_next;
993 	if (ext->ext_next != NULL)
994 		ext->ext_next->ext_prev = ext->ext_prev;
995 	Free(ext);
996 }
997 
998 /*
999  * FUNCTION:	meta_sp_list_size()
1000  * INPUT:	head	- the head of the list, must be NULL for empty list
1001  *		exttype	- the type of the extents to sum
1002  *		exclude_wm - subtract space for extent headers from total
1003  * OUTPUT:	none
1004  * RETURNS:	sp_ext_length_t	- the sum of all of the lengths
1005  * PURPOSE:	sums the lengths of all extents in the list matching the
1006  *		specified type.  This could be used for computing the
1007  *		amount of free or used space, for example.
1008  */
1009 static sp_ext_length_t
1010 meta_sp_list_size(sp_ext_node_t *head, sp_ext_type_t exttype, int exclude_wm)
1011 {
1012 	sp_ext_node_t	*ext;
1013 	sp_ext_length_t	size = 0LL;
1014 
1015 	for (ext = head; ext != NULL; ext = ext->ext_next)
1016 		if (ext->ext_type == exttype)
1017 			size += ext->ext_length -
1018 			    ((exclude_wm) ? MD_SP_WMSIZE : 0);
1019 
1020 	return (size);
1021 }
1022 
1023 /*
1024  * FUNCTION:	meta_sp_list_find()
1025  * INPUT:	head	- the head of the list, must be NULL for empty list
1026  *		offset	- the offset contained by the node to find
1027  * OUTPUT:	none
1028  * RETURNS:	sp_ext_node_t *	- the node containing the requested offset
1029  *				  or NULL if no such nodes were found.
1030  * PURPOSE:	finds a node in a list containing the requested offset
1031  *		(inclusive).  If multiple nodes contain this offset then
1032  *		only the first will be returned, though typically these
1033  *		lists are managed with non-overlapping nodes.
1034  *
1035  *		*The list MUST be sorted by offset for this function to work.*
1036  */
1037 static sp_ext_node_t *
1038 meta_sp_list_find(
1039 	sp_ext_node_t	*head,
1040 	sp_ext_offset_t	offset
1041 )
1042 {
1043 	sp_ext_node_t	*ext;
1044 
1045 	for (ext = head; ext != NULL; ext = ext->ext_next) {
1046 		/* check if the offset lies within this extent */
1047 		if ((offset >= ext->ext_offset) &&
1048 		    (offset < ext->ext_offset + ext->ext_length)) {
1049 			/*
1050 			 * the requested extent should always be a
1051 			 * subset of an extent in the list.
1052 			 */
1053 			return (ext);
1054 		}
1055 	}
1056 	return (NULL);
1057 }
1058 
1059 /*
1060  * FUNCTION:	meta_sp_list_freefill()
1061  * INPUT:	head	- the head of the list, must be NULL for empty list
1062  *		size	- the size of the volume this extent list is
1063  *			  representing
1064  * OUTPUT:	head	- the new head of the list
1065  * RETURNS:	void
1066  * PURPOSE:	finds gaps in the extent list and fills them with a free
1067  *		node.  If there is a gap at the beginning the head
1068  *		pointer will be changed to point to the new free node.
1069  *		If there is free space at the end, the last free extent
1070  *		will extend all the way out to the size specified.
1071  *
1072  *		*The list MUST be sorted by offset for this function to work.*
1073  */
1074 static void
1075 meta_sp_list_freefill(
1076 	sp_ext_node_t	**head,
1077 	sp_ext_length_t	size
1078 )
1079 {
1080 	sp_ext_node_t	*ext;
1081 	sp_ext_offset_t	curoff = 0LL;
1082 
1083 	for (ext = *head; ext != NULL; ext = ext->ext_next) {
1084 		if (curoff < ext->ext_offset)
1085 			meta_sp_list_insert(NULL, NULL, head,
1086 			    curoff, ext->ext_offset - curoff,
1087 			    EXTTYP_FREE, 0, 0, meta_sp_cmp_by_offset);
1088 		curoff = ext->ext_offset + ext->ext_length;
1089 	}
1090 
1091 	/* pad inverse list out to the end */
1092 	if (curoff < size)
1093 		meta_sp_list_insert(NULL, NULL, head, curoff, size - curoff,
1094 		    EXTTYP_FREE, 0, 0, meta_sp_cmp_by_offset);
1095 
1096 	if (getenv(META_SP_DEBUG)) {
1097 		meta_sp_debug("meta_sp_list_freefill: Extent list with "
1098 		    "holes freefilled:\n");
1099 		meta_sp_list_dump(*head);
1100 	}
1101 }
1102 
1103 /*
1104  * FUNCTION:	meta_sp_list_dump()
1105  * INPUT:	head	- the head of the list, must be NULL for empty list
1106  * OUTPUT:	none
1107  * RETURNS:	void
1108  * PURPOSE:	dumps the entire extent list to stdout for easy debugging
1109  */
1110 static void
1111 meta_sp_list_dump(sp_ext_node_t *head)
1112 {
1113 	sp_ext_node_t	*ext;
1114 
1115 	meta_sp_debug("meta_sp_list_dump: dumping extent list:\n");
1116 	meta_sp_debug("%5s %10s %5s %7s %10s %10s %5s %10s %10s\n", "Name",
1117 	    "Addr", "Seq#", "Type", "Offset", "Length", "Flags", "Prev",
1118 	    "Next");
1119 	for (ext = head; ext != NULL; ext = ext->ext_next) {
1120 		if (ext->ext_namep != NULL)
1121 			meta_sp_debug("%5s", ext->ext_namep->cname);
1122 		else
1123 			meta_sp_debug("%5s", "NONE");
1124 
1125 		meta_sp_debug("%10p %5u ", (void *) ext, ext->ext_seq);
1126 		switch (ext->ext_type) {
1127 		case EXTTYP_ALLOC:
1128 			meta_sp_debug("%7s ", "ALLOC");
1129 			break;
1130 		case EXTTYP_FREE:
1131 			meta_sp_debug("%7s ", "FREE");
1132 			break;
1133 		case EXTTYP_END:
1134 			meta_sp_debug("%7s ", "END");
1135 			break;
1136 		case EXTTYP_RESERVED:
1137 			meta_sp_debug("%7s ", "RESV");
1138 			break;
1139 		default:
1140 			meta_sp_debug("%7s ", "INVLD");
1141 			break;
1142 		}
1143 
1144 		meta_sp_debug("%10llu %10llu %5u %10p %10p\n",
1145 		    ext->ext_offset, ext->ext_length,
1146 		    ext->ext_flags, (void *) ext->ext_prev,
1147 		    (void *) ext->ext_next);
1148 	}
1149 	meta_sp_debug("\n");
1150 }
1151 
1152 /*
1153  * FUNCTION:	meta_sp_list_overlaps()
1154  * INPUT:	head	- the head of the list, must be NULL for empty list
1155  * OUTPUT:	none
1156  * RETURNS:	int	- 1 if extents overlap, 0 if ok
1157  * PURPOSE:	checks a list for overlaps.  The list MUST be sorted by
1158  *		offset for this function to work properly.
1159  */
1160 static int
1161 meta_sp_list_overlaps(sp_ext_node_t *head)
1162 {
1163 	sp_ext_node_t	*ext;
1164 
1165 	for (ext = head; ext->ext_next != NULL; ext = ext->ext_next) {
1166 		if (ext->ext_offset + ext->ext_length >
1167 		    ext->ext_next->ext_offset)
1168 			return (1);
1169 	}
1170 	return (0);
1171 }
1172 
1173 /*
1174  * **************************************************************************
1175  *                        Extent Allocation Functions                       *
1176  * **************************************************************************
1177  */
1178 
1179 /*
1180  * FUNCTION:	meta_sp_alloc_by_ext()
1181  * INPUT:	sp	- the set name for the device the node belongs to
1182  *		np	- the name of the device the node belongs to
1183  *		head	- the head of the list, must be NULL for empty list
1184  *		free_ext	- the free extent being allocated from
1185  *		alloc_offset	- the offset of the allocation
1186  *		alloc_len	- the length of the allocation
1187  *		seq		- the sequence number of the allocation
1188  * OUTPUT:	head	- the new head pointer
1189  * RETURNS:	void
1190  * PURPOSE:	allocates a portion of the free extent free_ext.  The
1191  *		allocated portion starts at alloc_offset and is
1192  *		alloc_length long.  Both (alloc_offset) and (alloc_offset +
1193  *		alloc_length) must be contained within the free extent.
1194  *
1195  *		The free extent is split into as many as 3 pieces - a
1196  *		free extent containing [ free_offset .. alloc_offset ), an
1197  *		allocated extent containing the range [ alloc_offset ..
1198  *		alloc_end ], and another free extent containing the
1199  *		range ( alloc_end .. free_end ].  If either of the two
1200  *		new free extents would be zero length, they are not created.
1201  *
1202  *		Finally, the original free extent is removed.  All newly
1203  *		created extents have the EXTFLG_UPDATE flag set.
1204  */
1205 static void
1206 meta_sp_alloc_by_ext(
1207 	mdsetname_t	*sp,
1208 	mdname_t	*np,
1209 	sp_ext_node_t	**head,
1210 	sp_ext_node_t	*free_ext,
1211 	sp_ext_offset_t	alloc_offset,
1212 	sp_ext_length_t	alloc_length,
1213 	uint_t		seq
1214 )
1215 {
1216 	sp_ext_offset_t	free_offset = free_ext->ext_offset;
1217 	sp_ext_length_t	free_length = free_ext->ext_length;
1218 
1219 	sp_ext_offset_t	alloc_end = alloc_offset + alloc_length;
1220 	sp_ext_offset_t	free_end  = free_offset  + free_length;
1221 
1222 	/* allocated extent must be a subset of the free extent */
1223 	assert(free_offset <= alloc_offset);
1224 	assert(free_end >= alloc_end);
1225 
1226 	meta_sp_list_remove(head, free_ext);
1227 
1228 	if (free_offset < alloc_offset) {
1229 		meta_sp_list_insert(NULL, NULL, head, free_offset,
1230 		    (alloc_offset - free_offset), EXTTYP_FREE, 0,
1231 		    EXTFLG_UPDATE, meta_sp_cmp_by_offset);
1232 	}
1233 
1234 	if (free_end > alloc_end) {
1235 		meta_sp_list_insert(NULL, NULL, head, alloc_end,
1236 		    (free_end - alloc_end), EXTTYP_FREE, 0, EXTFLG_UPDATE,
1237 		    meta_sp_cmp_by_offset);
1238 	}
1239 
1240 	meta_sp_list_insert(sp, np, head, alloc_offset, alloc_length,
1241 	    EXTTYP_ALLOC, seq, EXTFLG_UPDATE, meta_sp_cmp_by_offset);
1242 
1243 	if (getenv(META_SP_DEBUG)) {
1244 		meta_sp_debug("meta_sp_alloc_by_ext: extent list:\n");
1245 		meta_sp_list_dump(*head);
1246 	}
1247 }
1248 
1249 /*
1250  * FUNCTION:	meta_sp_alloc_by_len()
1251  * INPUT:	sp	- the set name for the device the node belongs to
1252  *		np	- the name of the device the node belongs to
1253  *		head	- the head of the list, must be NULL for empty list
1254  *		*lp	- the requested length to allocate
1255  *		last_off	- the last offset already allocated.
1256  *		alignment	- the desired extent alignmeent
1257  * OUTPUT:	head	- the new head pointer
1258  *		*lp	- the length allocated
1259  * RETURNS:	int	- -1 if error, the number of new extents on success
1260  * PURPOSE:	allocates extents from free space to satisfy the requested
1261  *		length.  If requested length is zero, allocates all
1262  *		remaining free space.  This function provides the meat
1263  *		of the extent allocation algorithm.  Allocation is a
1264  *		three tier process:
1265  *
1266  *		1. If last_off is nonzero and there is free space following
1267  *		   that node, then it is extended to allocate as much of that
1268  *		   free space as possible.  This is useful for metattach.
1269  *		2. If a free extent can be found to satisfy the remaining
1270  *		   requested space, then satisfy the rest of the request
1271  *		   from that extent.
1272  *		3. Start allocating space from any remaining free extents until
1273  *		   the remainder of the request is satisified.
1274  *
1275  *              If alignment is non-zero, then every extent modified
1276  *              or newly allocated will be aligned modulo alignment,
1277  *              with a length that is an integer multiple of
1278  *              alignment.
1279  *
1280  *		The EXTFLG_UPDATE flag is set for all nodes (free and
1281  *		allocated) that require updated watermarks.
1282  *
1283  *		This algorithm may have a negative impact on fragmentation
1284  *		in pathological cases and may be improved if it turns out
1285  *		to be a problem.  This may be exacerbated by particularly
1286  *		large alignments.
1287  *
1288  * NOTE:	It's confusing, so it demands an explanation:
1289  *		- len is used to represent requested data space; it
1290  *		  does not include room for a watermark.  On each full
1291  *		  or partial allocation, len will be decremented by
1292  *		  alloc_len (see next paragraph) until it reaches
1293  *		  zero.
1294  *		- alloc_len is used to represent data space allocated
1295  *		  from a particular extent; it does not include space
1296  *		  for a watermark.  In the rare event that a_length
1297  *		  (see next paragraph) is equal to MD_SP_WMSIZE,
1298  *		  alloc_len will be zero and the resulting MD_SP_WMSIZE
1299  *		  fragment of space will be utterly unusable.
1300  *		- a_length is used to represent all space to be
1301  *		  allocated from a particular extent; it DOES include
1302  *		  space for a watermark.
1303  */
1304 static int
1305 meta_sp_alloc_by_len(
1306 	mdsetname_t	*sp,
1307 	mdname_t	*np,
1308 	sp_ext_node_t	**head,
1309 	sp_ext_length_t	*lp,
1310 	sp_ext_offset_t	last_off,
1311 	sp_ext_offset_t	alignment
1312 )
1313 {
1314 	sp_ext_node_t	*free_ext;
1315 	sp_ext_node_t	*alloc_ext;
1316 	uint_t		last_seq = 0;
1317 	uint_t		numexts = 0;
1318 	sp_ext_length_t	freespace;
1319 	sp_ext_length_t	alloc_len;
1320 	sp_ext_length_t	len;
1321 
1322 	/* We're DOA if we can't read *lp */
1323 	assert(lp != NULL);
1324 	len = *lp;
1325 
1326 	/*
1327 	 * Process the nominal case first: we've been given an actual
1328 	 * size argument, rather than the literal "all"
1329 	 */
1330 
1331 	if (len != 0) {
1332 
1333 		/*
1334 		 * Short circuit the check for free space.  This may
1335 		 * tell us we have enough space when we really don't
1336 		 * because each extent loses space to a watermark, but
1337 		 * it will always tell us there isn't enough space
1338 		 * correctly.  Worst case we do some extra work.
1339 		 */
1340 		freespace = meta_sp_list_size(*head, EXTTYP_FREE,
1341 		    INCLUDE_WM);
1342 
1343 		if (freespace < len)
1344 			return (-1);
1345 
1346 		/*
1347 		 * First see if we can extend the last extent for an
1348 		 * attach.
1349 		 */
1350 		if (last_off != 0LL) {
1351 			int align = 0;
1352 
1353 			alloc_ext =
1354 			    meta_sp_list_find(*head, last_off);
1355 			assert(alloc_ext != NULL);
1356 
1357 			/*
1358 			 * The offset test reflects the
1359 			 * inclusion of the watermark in the extent
1360 			 */
1361 			align = (alignment > 0) &&
1362 			    (((alloc_ext->ext_offset + MD_SP_WMSIZE) %
1363 			    alignment) == 0);
1364 
1365 			/*
1366 			 * If we decided not to align here, we should
1367 			 * also reset "alignment" so we don't bother
1368 			 * later, either.
1369 			 */
1370 			if (!align) {
1371 				alignment = 0;
1372 			}
1373 
1374 			last_seq = alloc_ext->ext_seq;
1375 
1376 			free_ext = meta_sp_list_find(*head,
1377 			    alloc_ext->ext_offset +
1378 			    alloc_ext->ext_length);
1379 
1380 			/*
1381 			 * If a free extent follows our last allocated
1382 			 * extent, then remove the last allocated
1383 			 * extent and increase the size of the free
1384 			 * extent to overlap it, then allocate the
1385 			 * total space from the new free extent.
1386 			 */
1387 			if (free_ext != NULL &&
1388 			    free_ext->ext_type == EXTTYP_FREE) {
1389 				assert(free_ext->ext_offset ==
1390 				    alloc_ext->ext_offset +
1391 				    alloc_ext->ext_length);
1392 
1393 				alloc_len =
1394 				    MIN(len, free_ext->ext_length);
1395 
1396 				if (align && (alloc_len < len)) {
1397 					/* No watermark space needed */
1398 					alloc_len -= alloc_len % alignment;
1399 				}
1400 
1401 				if (alloc_len > 0) {
1402 					free_ext->ext_offset -=
1403 					    alloc_ext->ext_length;
1404 					free_ext->ext_length +=
1405 					    alloc_ext->ext_length;
1406 
1407 					meta_sp_alloc_by_ext(sp, np, head,
1408 					    free_ext, free_ext->ext_offset,
1409 					    alloc_ext->ext_length + alloc_len,
1410 					    last_seq);
1411 
1412 					/*
1413 					 * now remove the original allocated
1414 					 * node.  We may have overlapping
1415 					 * extents for a short time before
1416 					 * this node is removed.
1417 					 */
1418 					meta_sp_list_remove(head, alloc_ext);
1419 					len -= alloc_len;
1420 				}
1421 			}
1422 			last_seq++;
1423 		}
1424 
1425 		if (len == 0LL)
1426 			goto out;
1427 
1428 		/*
1429 		 * Next, see if we can find a single allocation for
1430 		 * the remainder.  This may make fragmentation worse
1431 		 * in some cases, but there's no good way to allocate
1432 		 * that doesn't have a highly fragmented corner case.
1433 		 */
1434 		for (free_ext = *head; free_ext != NULL;
1435 		    free_ext = free_ext->ext_next) {
1436 			sp_ext_offset_t	a_offset;
1437 			sp_ext_offset_t	a_length;
1438 
1439 			if (free_ext->ext_type != EXTTYP_FREE)
1440 				continue;
1441 
1442 			/*
1443 			 * The length test should include space for
1444 			 * the watermark
1445 			 */
1446 
1447 			a_offset = free_ext->ext_offset;
1448 			a_length = free_ext->ext_length;
1449 
1450 			if (alignment > 0) {
1451 
1452 				/*
1453 				 * Shortcut for extents that have been
1454 				 * previously added to pad out the
1455 				 * data space
1456 				 */
1457 				if (a_length < alignment) {
1458 					continue;
1459 				}
1460 
1461 				/*
1462 				 * Round up so the data space begins
1463 				 * on a properly aligned boundary.
1464 				 */
1465 				a_offset += alignment -
1466 				    (a_offset % alignment) - MD_SP_WMSIZE;
1467 
1468 				/*
1469 				 * This is only necessary in case the
1470 				 * watermark size is ever greater than
1471 				 * one.  It'll never happen, of
1472 				 * course; we'll get rid of watermarks
1473 				 * before we make 'em bigger.
1474 				 */
1475 				if (a_offset < free_ext->ext_offset) {
1476 					a_offset += alignment;
1477 				}
1478 
1479 				/*
1480 				 * Adjust the length to account for
1481 				 * the space lost above (if any)
1482 				 */
1483 				a_length -=
1484 				    (a_offset - free_ext->ext_offset);
1485 			}
1486 
1487 			if (a_length >= len + MD_SP_WMSIZE) {
1488 				meta_sp_alloc_by_ext(sp, np, head,
1489 				    free_ext, a_offset,
1490 				    len + MD_SP_WMSIZE, last_seq);
1491 
1492 				len = 0LL;
1493 				numexts++;
1494 				break;
1495 			}
1496 		}
1497 
1498 		if (len == 0LL)
1499 			goto out;
1500 
1501 
1502 		/*
1503 		 * If the request could not be satisfied by extending
1504 		 * the last extent or by a single extent, then put
1505 		 * multiple smaller extents together until the request
1506 		 * is satisfied.
1507 		 */
1508 		for (free_ext = *head; (free_ext != NULL) && (len > 0);
1509 		    free_ext = free_ext->ext_next) {
1510 			sp_ext_offset_t a_offset;
1511 			sp_ext_length_t a_length;
1512 
1513 			if (free_ext->ext_type != EXTTYP_FREE)
1514 				continue;
1515 
1516 			a_offset = free_ext->ext_offset;
1517 			a_length = free_ext->ext_length;
1518 
1519 			if (alignment > 0) {
1520 
1521 				/*
1522 				 * Shortcut for extents that have been
1523 				 * previously added to pad out the
1524 				 * data space
1525 				 */
1526 				if (a_length < alignment) {
1527 					continue;
1528 				}
1529 
1530 				/*
1531 				 * Round up so the data space begins
1532 				 * on a properly aligned boundary.
1533 				 */
1534 				a_offset += alignment -
1535 				    (a_offset % alignment) - MD_SP_WMSIZE;
1536 
1537 				/*
1538 				 * This is only necessary in case the
1539 				 * watermark size is ever greater than
1540 				 * one.  It'll never happen, of
1541 				 * course; we'll get rid of watermarks
1542 				 * before we make 'em bigger.
1543 				 */
1544 				if (a_offset < free_ext->ext_offset) {
1545 					a_offset += alignment;
1546 				}
1547 
1548 				/*
1549 				 * Adjust the length to account for
1550 				 * the space lost above (if any)
1551 				 */
1552 				a_length -=
1553 				    (a_offset - free_ext->ext_offset);
1554 
1555 				/*
1556 				 * Adjust the length to be properly
1557 				 * aligned if it is NOT to be the
1558 				 * last extent in the soft partition.
1559 				 */
1560 				if ((a_length - MD_SP_WMSIZE) < len)
1561 					a_length -=
1562 					    (a_length - MD_SP_WMSIZE)
1563 					    % alignment;
1564 			}
1565 
1566 			alloc_len = MIN(len, a_length - MD_SP_WMSIZE);
1567 			if (alloc_len == 0)
1568 				continue;
1569 
1570 			/*
1571 			 * meta_sp_alloc_by_ext() expects the
1572 			 * allocation length to include the watermark
1573 			 * size, which is why we don't simply pass in
1574 			 * alloc_len here.
1575 			 */
1576 			meta_sp_alloc_by_ext(sp, np, head, free_ext,
1577 			    a_offset, MIN(len + MD_SP_WMSIZE, a_length),
1578 			    last_seq);
1579 
1580 			len -= alloc_len;
1581 			numexts++;
1582 			last_seq++;
1583 		}
1584 
1585 
1586 		/*
1587 		 * If there was not enough space we can throw it all
1588 		 * away since no real work has been done yet.
1589 		 */
1590 		if (len != 0) {
1591 			meta_sp_list_free(head);
1592 			return (-1);
1593 		}
1594 	}
1595 
1596 	/*
1597 	 * Otherwise, the literal "all" was specified: allocate all
1598 	 * available free space.  Don't bother with alignment.
1599 	 */
1600 	else {
1601 		/* First, extend the last extent if this is a grow */
1602 		if (last_off != 0LL) {
1603 			alloc_ext =
1604 			    meta_sp_list_find(*head, last_off);
1605 			assert(alloc_ext != NULL);
1606 
1607 			last_seq = alloc_ext->ext_seq;
1608 
1609 			free_ext = meta_sp_list_find(*head,
1610 			    alloc_ext->ext_offset +
1611 			    alloc_ext->ext_length);
1612 
1613 			/*
1614 			 * If a free extent follows our last allocated
1615 			 * extent, then remove the last allocated
1616 			 * extent and increase the size of the free
1617 			 * extent to overlap it, then allocate the
1618 			 * total space from the new free extent.
1619 			 */
1620 			if (free_ext != NULL &&
1621 			    free_ext->ext_type == EXTTYP_FREE) {
1622 				assert(free_ext->ext_offset ==
1623 				    alloc_ext->ext_offset +
1624 				    alloc_ext->ext_length);
1625 
1626 				len = alloc_len =
1627 				    free_ext->ext_length;
1628 
1629 				free_ext->ext_offset -=
1630 				    alloc_ext->ext_length;
1631 				free_ext->ext_length +=
1632 				    alloc_ext->ext_length;
1633 
1634 				meta_sp_alloc_by_ext(sp, np, head,
1635 				    free_ext, free_ext->ext_offset,
1636 				    alloc_ext->ext_length + alloc_len,
1637 				    last_seq);
1638 
1639 				/*
1640 				 * now remove the original allocated
1641 				 * node.  We may have overlapping
1642 				 * extents for a short time before
1643 				 * this node is removed.
1644 				 */
1645 				meta_sp_list_remove(head, alloc_ext);
1646 			}
1647 
1648 			last_seq++;
1649 		}
1650 
1651 		/* Next, grab all remaining free space */
1652 		for (free_ext = *head; free_ext != NULL;
1653 		    free_ext = free_ext->ext_next) {
1654 
1655 			if (free_ext->ext_type == EXTTYP_FREE) {
1656 				alloc_len =
1657 				    free_ext->ext_length - MD_SP_WMSIZE;
1658 				if (alloc_len == 0)
1659 					continue;
1660 
1661 				/*
1662 				 * meta_sp_alloc_by_ext() expects the
1663 				 * allocation length to include the
1664 				 * watermark size, which is why we
1665 				 * don't simply pass in alloc_len
1666 				 * here.
1667 				 */
1668 				meta_sp_alloc_by_ext(sp, np, head,
1669 				    free_ext, free_ext->ext_offset,
1670 				    free_ext->ext_length,
1671 				    last_seq);
1672 
1673 				len += alloc_len;
1674 				numexts++;
1675 				last_seq++;
1676 			}
1677 		}
1678 	}
1679 
1680 out:
1681 	if (getenv(META_SP_DEBUG)) {
1682 		meta_sp_debug("meta_sp_alloc_by_len: Extent list after "
1683 		    "allocation:\n");
1684 		meta_sp_list_dump(*head);
1685 	}
1686 
1687 	if (*lp == 0) {
1688 		*lp = len;
1689 
1690 		/*
1691 		 * Make sure the callers hit a no space error if we
1692 		 * didn't actually find anything.
1693 		 */
1694 		if (len == 0) {
1695 			return (-1);
1696 		}
1697 	}
1698 
1699 	return (numexts);
1700 }
1701 
1702 /*
1703  * FUNCTION:	meta_sp_alloc_by_list()
1704  * INPUT:	sp	- the set name for the device the node belongs to
1705  *		np	- the name of the device the node belongs to
1706  *		head	- the head of the list, must be NULL for empty list
1707  *		oblist	- an extent list containing requested nodes to allocate
1708  * OUTPUT:	head	- the new head pointer
1709  * RETURNS:	int	- -1 if error, the number of new extents on success
1710  * PURPOSE:	allocates extents from free space to satisfy the requested
1711  *		extent list.  This is primarily used for the -o/-b options
1712  *		where the user may specifically request extents to allocate.
1713  *		Each extent in the oblist must be a subset (inclusive) of a
1714  *		free extent and may not overlap each other.  This
1715  *		function sets the EXTFLG_UPDATE flag for each node that
1716  *		requires a watermark update after allocating.
1717  */
1718 static int
1719 meta_sp_alloc_by_list(
1720 	mdsetname_t	*sp,
1721 	mdname_t	*np,
1722 	sp_ext_node_t	**head,
1723 	sp_ext_node_t	*oblist
1724 )
1725 {
1726 	sp_ext_node_t	*ext;
1727 	sp_ext_node_t	*free_ext;
1728 	uint_t		numexts = 0;
1729 
1730 	for (ext = oblist; ext != NULL; ext = ext->ext_next) {
1731 
1732 		free_ext = meta_sp_list_find(*head,
1733 		    ext->ext_offset - MD_SP_WMSIZE);
1734 
1735 		/* Make sure the allocation is within the free extent */
1736 		if ((free_ext == NULL) ||
1737 		    (ext->ext_offset + ext->ext_length >
1738 		    free_ext->ext_offset + free_ext->ext_length) ||
1739 		    (free_ext->ext_type != EXTTYP_FREE))
1740 			return (-1);
1741 
1742 		meta_sp_alloc_by_ext(sp, np, head, free_ext,
1743 		    ext->ext_offset - MD_SP_WMSIZE,
1744 		    ext->ext_length + MD_SP_WMSIZE, ext->ext_seq);
1745 
1746 		numexts++;
1747 	}
1748 
1749 	assert(meta_sp_list_overlaps(*head) == 0);
1750 
1751 	if (getenv(META_SP_DEBUG)) {
1752 		meta_sp_debug("meta_sp_alloc_by_list: Extent list after "
1753 		    "allocation:\n");
1754 		meta_sp_list_dump(*head);
1755 	}
1756 
1757 	return (numexts);
1758 }
1759 
1760 /*
1761  * **************************************************************************
1762  *                     Extent List Population Functions                     *
1763  * **************************************************************************
1764  */
1765 
1766 /*
1767  * FUNCTION:	meta_sp_extlist_from_namelist()
1768  * INPUT:	sp	- the set name for the device the node belongs to
1769  *		spnplp	- the namelist of soft partitions to build a list from
1770  * OUTPUT:	extlist	- the extent list built from the SPs in the namelist
1771  *		ep	- return error pointer
1772  * RETURNS:	int	- -1 if error, 0 on success
1773  * PURPOSE:	builds an extent list representing the soft partitions
1774  *		specified in the namelist.  Each extent in each soft
1775  *		partition is added to the list with the type EXTTYP_ALLOC.
1776  *		The EXTFLG_UPDATE flag is not set on any nodes.  Each
1777  *		extent in the list includes the space occupied by the
1778  *		watermark, which is not included in the unit structures.
1779  */
1780 static int
1781 meta_sp_extlist_from_namelist(
1782 	mdsetname_t	*sp,
1783 	mdnamelist_t	*spnlp,
1784 	sp_ext_node_t	**extlist,
1785 	md_error_t	*ep
1786 )
1787 {
1788 	int		extn;
1789 	md_sp_t		*msp;		/* unit structure of the sp's */
1790 	mdnamelist_t	*namep;
1791 
1792 	assert(sp != NULL);
1793 
1794 	/*
1795 	 * Now go through the soft partitions and add a node to the used
1796 	 * list for each allocated extent.
1797 	 */
1798 	for (namep = spnlp; namep != NULL; namep = namep->next) {
1799 		mdname_t	*curnp = namep->namep;
1800 
1801 		/* get the unit structure */
1802 		if ((msp = meta_get_sp_common(sp, curnp, 0, ep)) == NULL)
1803 			return (-1);
1804 
1805 		for (extn = 0; (extn < msp->ext.ext_len); extn++) {
1806 			md_sp_ext_t	*extp = &msp->ext.ext_val[extn];
1807 
1808 			/*
1809 			 * subtract from offset and add to the length
1810 			 * to account for the watermark, which is not
1811 			 * contained in the extents in the unit structure.
1812 			 */
1813 			meta_sp_list_insert(sp, curnp, extlist,
1814 			    extp->poff - MD_SP_WMSIZE, extp->len + MD_SP_WMSIZE,
1815 			    EXTTYP_ALLOC, extn, 0, meta_sp_cmp_by_offset);
1816 		}
1817 	}
1818 	return (0);
1819 }
1820 
1821 /*
1822  * FUNCTION:	meta_sp_extlist_from_wm()
1823  * INPUT:	sp	- the set name for the device the node belongs to
1824  *		compnp	- the name of the device to scan watermarks on
1825  * OUTPUT:	extlist	- the extent list built from the SPs in the namelist
1826  *		ep	- return error pointer
1827  * RETURNS:	int	- -1 if error, 0 on success
1828  * PURPOSE:	builds an extent list representing the soft partitions
1829  *		specified in the namelist.  Each extent in each soft
1830  *		partition is added to the list with the type EXTTYP_ALLOC.
1831  *		The EXTFLG_UPDATE flag is not set on any nodes.  Each
1832  *		extent in the list includes the space occupied by the
1833  *		watermark, which is not included in the unit structures.
1834  */
1835 static int
1836 meta_sp_extlist_from_wm(
1837 	mdsetname_t	*sp,
1838 	mdname_t	*compnp,
1839 	sp_ext_node_t	**extlist,
1840 	ext_cmpfunc_t	compare,
1841 	md_error_t	*ep
1842 )
1843 {
1844 	mp_watermark_t	wm;
1845 	mdname_t	*np = NULL;
1846 	mdsetname_t	*spsetp = NULL;
1847 	sp_ext_offset_t	cur_off;
1848 	md_set_desc	*sd;
1849 	int		init = 0;
1850 	mdkey_t		key;
1851 	minor_t		mnum;
1852 
1853 	if (!metaislocalset(sp)) {
1854 		if ((sd = metaget_setdesc(sp, ep)) == NULL)
1855 			return (-1);
1856 	}
1857 
1858 	if ((cur_off = meta_sp_get_start(sp, compnp, ep)) == MD_DISKADDR_ERROR)
1859 		return (-1);
1860 
1861 	for (;;) {
1862 		if (meta_sp_read_wm(sp, compnp, &wm, cur_off, ep) != 0) {
1863 			return (-1);
1864 		}
1865 
1866 		/* get the set and name pointers */
1867 		if (strcmp(wm.wm_setname, MD_SP_LOCALSETNAME) != 0) {
1868 			if ((spsetp = metasetname(wm.wm_setname, ep)) == NULL) {
1869 				return (-1);
1870 			}
1871 		}
1872 
1873 		/*
1874 		 * For the MN set, meta_init_make_device needs to
1875 		 * be run on all the nodes so the entries for the
1876 		 * softpart device name and its comp can be created
1877 		 * in the same order in the replica namespace.  If
1878 		 * we have it run on mdmn_do_iocset then the mddbs
1879 		 * will be out of sync between master node and slave
1880 		 * nodes.
1881 		 */
1882 		if (strcmp(wm.wm_mdname, MD_SP_FREEWMNAME) != 0) {
1883 
1884 			if (!metaislocalset(sp) && MD_MNSET_DESC(sd)) {
1885 				md_mn_msg_addmdname_t	*send_params;
1886 				int			result;
1887 				md_mn_result_t		*resp = NULL;
1888 				int			message_size;
1889 
1890 				message_size =  sizeof (*send_params) +
1891 				    strlen(wm.wm_mdname) + 1;
1892 				send_params = Zalloc(message_size);
1893 				send_params->addmdname_setno = sp->setno;
1894 				(void) strcpy(&send_params->addmdname_name[0],
1895 				    wm.wm_mdname);
1896 				result = mdmn_send_message(sp->setno,
1897 				    MD_MN_MSG_ADDMDNAME,
1898 				    MD_MSGF_PANIC_WHEN_INCONSISTENT,
1899 				    (char *)send_params, message_size, &resp,
1900 				    ep);
1901 				Free(send_params);
1902 				if (resp != NULL) {
1903 					if (resp->mmr_exitval != 0) {
1904 						free_result(resp);
1905 						return (-1);
1906 					}
1907 					free_result(resp);
1908 				}
1909 				if (result != 0)
1910 					return (-1);
1911 			} else {
1912 
1913 				if (!is_existing_meta_hsp(sp, wm.wm_mdname)) {
1914 					if ((key = meta_init_make_device(&sp,
1915 					    wm.wm_mdname, ep)) <= 0) {
1916 						return (-1);
1917 					}
1918 					init = 1;
1919 				}
1920 			}
1921 
1922 			np = metaname(&spsetp, wm.wm_mdname, META_DEVICE, ep);
1923 			if (np == NULL) {
1924 				if (init) {
1925 					if (meta_getnmentbykey(sp->setno,
1926 					    MD_SIDEWILD, key, NULL, &mnum,
1927 					    NULL, ep) != NULL) {
1928 						(void) metaioctl(MD_IOCREM_DEV,
1929 						    &mnum, ep, NULL);
1930 					}
1931 					(void) del_self_name(sp, key, ep);
1932 				}
1933 				return (-1);
1934 			}
1935 		}
1936 
1937 		/* insert watermark into extent list */
1938 		meta_sp_list_insert(spsetp, np, extlist, cur_off,
1939 		    wm.wm_length + MD_SP_WMSIZE, wm.wm_type, wm.wm_seq,
1940 		    EXTFLG_UPDATE, compare);
1941 
1942 		/* if we see the end watermark, we're done */
1943 		if (wm.wm_type == EXTTYP_END)
1944 			break;
1945 
1946 		cur_off += wm.wm_length + 1;
1947 
1948 		/* clear out set and name pointers for next iteration */
1949 		np = NULL;
1950 		spsetp = NULL;
1951 	}
1952 
1953 	return (0);
1954 }
1955 
1956 /*
1957  * **************************************************************************
1958  *                        Print (metastat) Functions                        *
1959  * **************************************************************************
1960  */
1961 
1962 /*
1963  * FUNCTION:	meta_sp_short_print()
1964  * INPUT:	msp	- the unit structure to display
1965  *		fp	- the file pointer to send output to
1966  *		options	- print options from the command line processor
1967  * OUTPUT:	ep	- return error pointer
1968  * RETURNS:	int	- -1 if error, 0 on success
1969  * PURPOSE:	display a short report of the soft partition in md.tab
1970  *		form, primarily used for metastat -p.
1971  */
1972 static int
1973 meta_sp_short_print(
1974 	md_sp_t		*msp,
1975 	char		*fname,
1976 	FILE		*fp,
1977 	mdprtopts_t	options,
1978 	md_error_t	*ep
1979 )
1980 {
1981 	int	extn;
1982 
1983 	if (options & PRINT_LARGEDEVICES) {
1984 		if ((msp->common.revision & MD_64BIT_META_DEV) == 0)
1985 			return (0);
1986 	}
1987 
1988 	if (options & PRINT_FN) {
1989 		if ((msp->common.revision & MD_FN_META_DEV) == 0)
1990 			return (0);
1991 	}
1992 
1993 	/* print name and -p */
1994 	if (fprintf(fp, "%s -p", msp->common.namep->cname) == EOF)
1995 		return (mdsyserror(ep, errno, fname));
1996 
1997 	/* print the component */
1998 	/*
1999 	 * Always print the full path name
2000 	 */
2001 	if (fprintf(fp, " %s", msp->compnamep->rname) == EOF)
2002 		return (mdsyserror(ep, errno, fname));
2003 
2004 	/* print out each extent */
2005 	for (extn = 0; (extn < msp->ext.ext_len); extn++) {
2006 		md_sp_ext_t	*extp = &msp->ext.ext_val[extn];
2007 		if (fprintf(fp, " -o %llu -b %llu ", extp->poff,
2008 		    extp->len) == EOF)
2009 			return (mdsyserror(ep, errno, fname));
2010 	}
2011 
2012 	if (fprintf(fp, "\n") == EOF)
2013 		return (mdsyserror(ep, errno, fname));
2014 
2015 	/* success */
2016 	return (0);
2017 }
2018 
2019 /*
2020  * FUNCTION:	meta_sp_status_to_name()
2021  * INPUT:	xsp_status	- the status value to convert to a string
2022  *		tstate		- transient errored device state. If set the
2023  *				  device is Unavailable
2024  * OUTPUT:	none
2025  * RETURNS:	char *	- a pointer to the string representing the status value
2026  * PURPOSE:	return an internationalized string representing the
2027  *		status value for a soft partition.  The strings are
2028  *		strdup'd and must be freed by the caller.
2029  */
2030 static char *
2031 meta_sp_status_to_name(
2032 	xsp_status_t	xsp_status,
2033 	uint_t		tstate
2034 )
2035 {
2036 	char *rval = NULL;
2037 
2038 	/*
2039 	 * Check to see if we have MD_INACCESSIBLE set. This is the only valid
2040 	 * value for an 'Unavailable' return. tstate can be set because of
2041 	 * other multi-node reasons (e.g. ABR being set)
2042 	 */
2043 	if (tstate & MD_INACCESSIBLE) {
2044 		return (Strdup(dgettext(TEXT_DOMAIN, "Unavailable")));
2045 	}
2046 
2047 	switch (xsp_status) {
2048 	case MD_SP_CREATEPEND:
2049 		rval = Strdup(dgettext(TEXT_DOMAIN, "Creating"));
2050 		break;
2051 	case MD_SP_GROWPEND:
2052 		rval = Strdup(dgettext(TEXT_DOMAIN, "Growing"));
2053 		break;
2054 	case MD_SP_DELPEND:
2055 		rval = Strdup(dgettext(TEXT_DOMAIN, "Deleting"));
2056 		break;
2057 	case MD_SP_OK:
2058 		rval = Strdup(dgettext(TEXT_DOMAIN, "Okay"));
2059 		break;
2060 	case MD_SP_ERR:
2061 		rval = Strdup(dgettext(TEXT_DOMAIN, "Errored"));
2062 		break;
2063 	case MD_SP_RECOVER:
2064 		rval = Strdup(dgettext(TEXT_DOMAIN, "Recovering"));
2065 		break;
2066 	}
2067 
2068 	if (rval == NULL)
2069 		rval = Strdup(dgettext(TEXT_DOMAIN, "Invalid"));
2070 
2071 	return (rval);
2072 }
2073 
2074 /*
2075  * FUNCTION:	meta_sp_report()
2076  * INPUT:	sp	- the set name for the unit being displayed
2077  *		msp	- the unit structure to display
2078  *		nlpp	- pass back the large devs
2079  *		fp	- the file pointer to send output to
2080  *		options	- print options from the command line processor
2081  * OUTPUT:	ep	- return error pointer
2082  * RETURNS:	int	- -1 if error, 0 on success
2083  * PURPOSE:	print a full report of the device specified
2084  */
2085 static int
2086 meta_sp_report(
2087 	mdsetname_t	*sp,
2088 	md_sp_t		*msp,
2089 	mdnamelist_t	**nlpp,
2090 	char		*fname,
2091 	FILE		*fp,
2092 	mdprtopts_t	options,
2093 	md_error_t	*ep
2094 )
2095 {
2096 	uint_t		extn;
2097 	char		*status;
2098 	char		*devid = "";
2099 	mdname_t	*didnp = NULL;
2100 	ddi_devid_t	dtp;
2101 	int		len;
2102 	uint_t		tstate = 0;
2103 
2104 	if (options & PRINT_LARGEDEVICES) {
2105 		if ((msp->common.revision & MD_64BIT_META_DEV) == 0) {
2106 			return (0);
2107 		} else {
2108 			if (meta_getdevs(sp, msp->common.namep, nlpp, ep) != 0)
2109 				return (-1);
2110 		}
2111 	}
2112 
2113 	if (options & PRINT_FN) {
2114 		if ((msp->common.revision & MD_FN_META_DEV) == 0) {
2115 			return (0);
2116 		} else {
2117 			if (meta_getdevs(sp, msp->common.namep, nlpp, ep) != 0)
2118 				return (-1);
2119 		}
2120 	}
2121 
2122 	if (options & PRINT_HEADER) {
2123 		if (fprintf(fp, dgettext(TEXT_DOMAIN, "%s: Soft Partition\n"),
2124 		    msp->common.namep->cname) == EOF)
2125 			return (mdsyserror(ep, errno, fname));
2126 	}
2127 
2128 	if (fprintf(fp, dgettext(TEXT_DOMAIN, "    Device: %s\n"),
2129 	    msp->compnamep->cname) == EOF)
2130 		return (mdsyserror(ep, errno, fname));
2131 
2132 	/* Determine if device is available before displaying status */
2133 	if (metaismeta(msp->common.namep)) {
2134 		if (meta_get_tstate(msp->common.namep->dev, &tstate, ep) != 0)
2135 			return (-1);
2136 	}
2137 	status = meta_sp_status_to_name(msp->status, tstate & MD_DEV_ERRORED);
2138 
2139 	/* print out "State" to be consistent with other metadevices */
2140 	if (tstate & MD_ABR_CAP) {
2141 		if (fprintf(fp, dgettext(TEXT_DOMAIN,
2142 		    "    State: %s - Application Based Recovery (ABR)\n"),
2143 		    status) == EOF) {
2144 			Free(status);
2145 			return (mdsyserror(ep, errno, fname));
2146 		}
2147 	} else {
2148 		if (fprintf(fp, dgettext(TEXT_DOMAIN,
2149 		    "    State: %s\n"), status) == EOF) {
2150 			Free(status);
2151 			return (mdsyserror(ep, errno, fname));
2152 		}
2153 	}
2154 	free(status);
2155 
2156 	if (fprintf(fp, dgettext(TEXT_DOMAIN, "    Size: %llu blocks (%s)\n"),
2157 	    msp->common.size,
2158 	    meta_number_to_string(msp->common.size, DEV_BSIZE)) == EOF)
2159 		return (mdsyserror(ep, errno, fname));
2160 
2161 	/* print component details */
2162 	if (! metaismeta(msp->compnamep)) {
2163 		diskaddr_t	start_blk;
2164 		int		has_mddb;
2165 		char		*has_mddb_str;
2166 
2167 		/* print header */
2168 		/*
2169 		 * Building a format string on the fly that will
2170 		 * be used in (f)printf. This allows the length
2171 		 * of the ctd to vary from small to large without
2172 		 * looking horrible.
2173 		 */
2174 		len = strlen(msp->compnamep->cname);
2175 		len = max(len, strlen(dgettext(TEXT_DOMAIN, "Device")));
2176 		len += 2;
2177 		if (fprintf(fp,
2178 		    "\t%-*.*s %-12.12s %-5.5s %s\n",
2179 		    len, len,
2180 		    dgettext(TEXT_DOMAIN, "Device"),
2181 		    dgettext(TEXT_DOMAIN, "Start Block"),
2182 		    dgettext(TEXT_DOMAIN, "Dbase"),
2183 		    dgettext(TEXT_DOMAIN, "Reloc")) == EOF) {
2184 			return (mdsyserror(ep, errno, fname));
2185 		}
2186 
2187 
2188 		/* get info */
2189 		if ((start_blk = meta_sp_get_start(sp, msp->compnamep, ep)) ==
2190 		    MD_DISKADDR_ERROR)
2191 			return (-1);
2192 
2193 		if ((has_mddb = metahasmddb(sp, msp->compnamep, ep)) < 0)
2194 			return (-1);
2195 
2196 		if (has_mddb)
2197 			has_mddb_str = dgettext(TEXT_DOMAIN, "Yes");
2198 		else
2199 			has_mddb_str = dgettext(TEXT_DOMAIN, "No");
2200 
2201 		/* populate the key in the name_p structure */
2202 		didnp = metadevname(&sp, msp->compnamep->dev, ep);
2203 		if (didnp == NULL) {
2204 			return (-1);
2205 		}
2206 
2207 		/* determine if devid does NOT exist */
2208 		if (options & PRINT_DEVID) {
2209 			if ((dtp = meta_getdidbykey(sp->setno,
2210 			    getmyside(sp, ep), didnp->key, ep)) == NULL)
2211 				devid = dgettext(TEXT_DOMAIN, "No ");
2212 			else {
2213 				devid = dgettext(TEXT_DOMAIN, "Yes");
2214 				free(dtp);
2215 			}
2216 		}
2217 
2218 		/* print info */
2219 		/*
2220 		 * This allows the length
2221 		 * of the ctd to vary from small to large without
2222 		 * looking horrible.
2223 		 */
2224 		if (fprintf(fp, "\t%-*s %8lld     %-5.5s %s\n",
2225 		    len, msp->compnamep->cname,
2226 		    start_blk, has_mddb_str, devid) == EOF) {
2227 			return (mdsyserror(ep, errno, fname));
2228 		}
2229 		(void) fprintf(fp, "\n");
2230 	}
2231 
2232 
2233 	/* print the headers */
2234 	if (fprintf(fp, "\t%6.6s %24.24s %24.24s\n",
2235 	    dgettext(TEXT_DOMAIN, "Extent"),
2236 	    dgettext(TEXT_DOMAIN, "Start Block"),
2237 	    dgettext(TEXT_DOMAIN, "Block count")) == EOF)
2238 		return (mdsyserror(ep, errno, fname));
2239 
2240 	/* print out each extent */
2241 	for (extn = 0; (extn < msp->ext.ext_len); extn++) {
2242 		md_sp_ext_t	*extp = &msp->ext.ext_val[extn];
2243 
2244 		/* If PRINT_TIMES option is ever supported, add output here */
2245 		if (fprintf(fp, "\t%6u %24llu %24llu\n",
2246 		    extn, extp->poff, extp->len) == EOF)
2247 			return (mdsyserror(ep, errno, fname));
2248 	}
2249 
2250 	/* separate records with a newline */
2251 	(void) fprintf(fp, "\n");
2252 	return (0);
2253 }
2254 
2255 /*
2256  * FUNCTION:	meta_sp_print()
2257  * INPUT:	sp	- the set name for the unit being displayed
2258  *		np	- the name of the device to print
2259  *		fname	- ??? not used
2260  *		fp	- the file pointer to send output to
2261  *		options	- print options from the command line processor
2262  * OUTPUT:	ep	- return error pointer
2263  * RETURNS:	int	- -1 if error, 0 on success
2264  * PURPOSE:	print a full report of the device specified by metastat.
2265  *		This is the main entry point for printing.
2266  */
2267 int
2268 meta_sp_print(
2269 	mdsetname_t	*sp,
2270 	mdname_t	*np,
2271 	mdnamelist_t	**nlpp,
2272 	char		*fname,
2273 	FILE		*fp,
2274 	mdprtopts_t	options,
2275 	md_error_t	*ep
2276 )
2277 {
2278 	md_sp_t		*msp;
2279 	md_unit_t	*mdp;
2280 	int		rval = 0;
2281 
2282 	/* should always have the same set */
2283 	assert(sp != NULL);
2284 
2285 	/* print all the soft partitions */
2286 	if (np == NULL) {
2287 		mdnamelist_t	*nlp = NULL;
2288 		mdnamelist_t	*p;
2289 		int		cnt;
2290 
2291 		if ((cnt = meta_get_sp_names(sp, &nlp, options, ep)) < 0)
2292 			return (-1);
2293 		else if (cnt == 0)
2294 			return (0);
2295 
2296 		/* recusively print them out */
2297 		for (p = nlp; (p != NULL); p = p->next) {
2298 			mdname_t	*curnp = p->namep;
2299 
2300 			/*
2301 			 * one problem with the rval of -1 here is that
2302 			 * the error gets "lost" when the next device is
2303 			 * printed, but we want to print them all anyway.
2304 			 */
2305 			rval = meta_sp_print(sp, curnp, nlpp, fname, fp,
2306 			    options, ep);
2307 		}
2308 
2309 		/* clean up, return success */
2310 		metafreenamelist(nlp);
2311 		return (rval);
2312 	}
2313 
2314 	/* get the unit structure */
2315 	if ((msp = meta_get_sp_common(sp, np,
2316 	    ((options & PRINT_FAST) ? 1 : 0), ep)) == NULL)
2317 		return (-1);
2318 
2319 	/* check for parented */
2320 	if ((! (options & PRINT_SUBDEVS)) &&
2321 	    (MD_HAS_PARENT(msp->common.parent))) {
2322 		return (0);
2323 	}
2324 
2325 	/* print appropriate detail */
2326 	if (options & PRINT_SHORT) {
2327 		if (meta_sp_short_print(msp, fname, fp, options, ep) != 0)
2328 			return (-1);
2329 	} else {
2330 		if (meta_sp_report(sp, msp, nlpp, fname, fp, options, ep) != 0)
2331 			return (-1);
2332 	}
2333 
2334 	/*
2335 	 * Print underlying metadevices if they are parented to us and
2336 	 * if the info for the underlying metadevice has not been printed.
2337 	 */
2338 	if (metaismeta(msp->compnamep)) {
2339 		/* get the unit structure for the subdevice */
2340 		if ((mdp = meta_get_mdunit(sp, msp->compnamep, ep)) == NULL)
2341 			return (-1);
2342 
2343 		/* If info not already printed, recurse */
2344 		if (!BT_TEST(sp_parent_printed, MD_MIN2UNIT(MD_SID(mdp)))) {
2345 			if (meta_print_name(sp, msp->compnamep, nlpp, fname, fp,
2346 			    (options | PRINT_HEADER | PRINT_SUBDEVS),
2347 			    NULL, ep) != 0) {
2348 				return (-1);
2349 			}
2350 			BT_SET(sp_parent_printed, MD_MIN2UNIT(MD_SID(mdp)));
2351 		}
2352 	}
2353 	return (0);
2354 }
2355 
2356 /*
2357  * **************************************************************************
2358  *                     Watermark Manipulation Functions                     *
2359  * **************************************************************************
2360  */
2361 
2362 /*
2363  * FUNCTION:	meta_sp_get_start()
2364  * INPUT:	sp	- the operating set
2365  *		np 	- device upon which the sp is being built
2366  * OUTPUT:	ep	- return error pointer
2367  * RETURNS:	daddr_t	- -1 if error, otherwise the start block
2368  * PURPOSE:	Encapsulate the determination of the start block of the
2369  *		device upon which the sp is built or being built.
2370  */
2371 static diskaddr_t
2372 meta_sp_get_start(
2373 	mdsetname_t	*sp,
2374 	mdname_t	*np,
2375 	md_error_t	*ep
2376 )
2377 {
2378 	daddr_t		start_block;
2379 
2380 	if ((start_block = metagetstart(sp, np, ep)) != MD_DISKADDR_ERROR)
2381 		start_block += MD_SP_START;
2382 
2383 	return (start_block);
2384 }
2385 
2386 /*
2387  * FUNCTION:	meta_sp_update_wm()
2388  * INPUT:	sp	- the operating set
2389  *		msp	- a pointer to the XDR unit structure
2390  *		extlist	- the extent list specifying watermarks to update
2391  * OUTPUT:	ep	- return error pointer
2392  * RETURNS:	int	- -1 if error, 0 on success
2393  * PURPOSE:	steps backwards through the extent list updating
2394  *		watermarks for all extents with the EXTFLG_UPDATE flag
2395  *		set.  Writing the watermarks guarantees consistency when
2396  *		extents must be broken into pieces since the original
2397  *		watermark will be the last to be updated, and will be
2398  *		changed to point to a new watermark that is already
2399  *		known to be consistent.  If one of the writes fails, the
2400  *		original watermark stays intact and none of the changes
2401  *		are realized.
2402  */
2403 static int
2404 meta_sp_update_wm(
2405 	mdsetname_t	*sp,
2406 	md_sp_t		*msp,
2407 	sp_ext_node_t	*extlist,
2408 	md_error_t	*ep
2409 )
2410 {
2411 	sp_ext_node_t	*ext;
2412 	sp_ext_node_t	*tail;
2413 	mp_watermark_t	*wmp, *watermarks;
2414 	xsp_offset_t	*osp, *offsets;
2415 	int		update_count = 0;
2416 	int		rval = 0;
2417 	md_unit_t	*mdp;
2418 	md_sp_update_wm_t	update_params;
2419 
2420 	if (getenv(META_SP_DEBUG)) {
2421 		meta_sp_debug("meta_sp_update_wm: Updating watermarks:\n");
2422 		meta_sp_list_dump(extlist);
2423 	}
2424 
2425 	/*
2426 	 * find the last node so we can write the watermarks backwards
2427 	 * and count watermarks to update so we can allocate space
2428 	 */
2429 	for (ext = extlist; ext != NULL; ext = ext->ext_next) {
2430 		if ((ext->ext_flags & EXTFLG_UPDATE) != 0) {
2431 			update_count++;
2432 		}
2433 
2434 		if (ext->ext_next == NULL) {
2435 			tail = ext;
2436 		}
2437 	}
2438 	ext = tail;
2439 
2440 	wmp = watermarks =
2441 	    Zalloc(update_count * sizeof (mp_watermark_t));
2442 	osp = offsets =
2443 	    Zalloc(update_count * sizeof (sp_ext_offset_t));
2444 
2445 	while (ext != NULL) {
2446 		if ((ext->ext_flags & EXTFLG_UPDATE) != 0) {
2447 			/* update watermark */
2448 			wmp->wm_magic = MD_SP_MAGIC;
2449 			wmp->wm_version = MD_SP_VERSION;
2450 			wmp->wm_type = ext->ext_type;
2451 			wmp->wm_seq = ext->ext_seq;
2452 			wmp->wm_length = ext->ext_length - MD_SP_WMSIZE;
2453 
2454 			/* fill in the volume name and set name */
2455 			if (ext->ext_namep != NULL)
2456 				(void) strcpy(wmp->wm_mdname,
2457 				    ext->ext_namep->cname);
2458 			else
2459 				(void) strcpy(wmp->wm_mdname, MD_SP_FREEWMNAME);
2460 			if (ext->ext_setp != NULL &&
2461 			    ext->ext_setp->setno != MD_LOCAL_SET)
2462 				(void) strcpy(wmp->wm_setname,
2463 				    ext->ext_setp->setname);
2464 			else
2465 				(void) strcpy(wmp->wm_setname,
2466 				    MD_SP_LOCALSETNAME);
2467 
2468 			/* Generate the checksum */
2469 			wmp->wm_checksum = 0;
2470 			crcgen((uchar_t *)wmp, (uint_t *)&wmp->wm_checksum,
2471 			    sizeof (*wmp), NULL);
2472 
2473 			/* record the extent offset */
2474 			*osp = ext->ext_offset;
2475 
2476 			/* Advance the placeholders */
2477 			osp++; wmp++;
2478 		}
2479 		ext = ext->ext_prev;
2480 	}
2481 
2482 	mdp = meta_get_mdunit(sp, msp->common.namep, ep);
2483 	if (mdp == NULL) {
2484 		rval = -1;
2485 		goto out;
2486 	}
2487 
2488 	(void) memset(&update_params, 0, sizeof (update_params));
2489 	update_params.mnum = MD_SID(mdp);
2490 	update_params.count = update_count;
2491 	update_params.wmp = (uintptr_t)watermarks;
2492 	update_params.osp = (uintptr_t)offsets;
2493 	MD_SETDRIVERNAME(&update_params, MD_SP,
2494 	    MD_MIN2SET(update_params.mnum));
2495 
2496 	if (metaioctl(MD_IOC_SPUPDATEWM, &update_params,
2497 	    &update_params.mde, msp->common.namep->cname) != 0) {
2498 		(void) mdstealerror(ep, &update_params.mde);
2499 		rval = -1;
2500 		goto out;
2501 	}
2502 
2503 out:
2504 	Free(watermarks);
2505 	Free(offsets);
2506 
2507 	return (rval);
2508 }
2509 
2510 /*
2511  * FUNCTION:	meta_sp_clear_wm()
2512  * INPUT:	sp	- the operating set
2513  *		msp	- the unit structure for the soft partition to clear
2514  * OUTPUT:	ep	- return error pointer
2515  * RETURNS:	int	- -1 if error, 0 on success
2516  * PURPOSE:	steps through the extents for a soft partition unit and
2517  *		creates an extent list designed to mark all of the
2518  *		watermarks for those extents as free.  The extent list
2519  *		is then passed to meta_sp_update_wm() to actually write
2520  *		the watermarks out.
2521  */
2522 static int
2523 meta_sp_clear_wm(
2524 	mdsetname_t	*sp,
2525 	md_sp_t		*msp,
2526 	md_error_t	*ep
2527 )
2528 {
2529 	sp_ext_node_t	*extlist = NULL;
2530 	int		numexts = msp->ext.ext_len;
2531 	uint_t		i;
2532 	int		rval = 0;
2533 
2534 	/* for each watermark must set the flag to SP_FREE */
2535 	for (i = 0; i < numexts; i++) {
2536 		md_sp_ext_t	*extp = &msp->ext.ext_val[i];
2537 
2538 		meta_sp_list_insert(NULL, NULL, &extlist,
2539 		    extp->poff - MD_SP_WMSIZE, extp->len + MD_SP_WMSIZE,
2540 		    EXTTYP_FREE, 0, EXTFLG_UPDATE, meta_sp_cmp_by_offset);
2541 	}
2542 
2543 	/* update watermarks */
2544 	rval = meta_sp_update_wm(sp, msp, extlist, ep);
2545 
2546 	meta_sp_list_free(&extlist);
2547 	return (rval);
2548 }
2549 
2550 /*
2551  * FUNCTION:	meta_sp_read_wm()
2552  * INPUT:	sp	- setname for component
2553  *		compnp	- mdname_t for component
2554  *		offset	- the offset of the watermark to read (sectors)
2555  * OUTPUT:	wm	- the watermark structure to read into
2556  *		ep	- return error pointer
2557  * RETURNS:	int	- -1 if error, 0 on success
2558  * PURPOSE:	seeks out to the requested offset and reads a watermark.
2559  *		It then verifies that the magic number is correct and
2560  *		that the checksum is valid, returning an error if either
2561  *		is wrong.
2562  */
2563 static int
2564 meta_sp_read_wm(
2565 	mdsetname_t	*sp,
2566 	mdname_t	*compnp,
2567 	mp_watermark_t	*wm,
2568 	sp_ext_offset_t	offset,
2569 	md_error_t	*ep
2570 )
2571 {
2572 	md_sp_read_wm_t	read_params;
2573 
2574 	/*
2575 	 * make sure block offset does not overflow 2^64 bytes and it's a
2576 	 * multiple of the block size.
2577 	 */
2578 	assert(offset <= (1LL << (64 - DEV_BSHIFT)));
2579 	/* LINTED */
2580 	assert((sizeof (*wm) % DEV_BSIZE) == 0);
2581 
2582 	(void) memset(wm, 0, sizeof (*wm));
2583 
2584 	(void) memset(&read_params, 0, sizeof (read_params));
2585 	read_params.rdev = compnp->dev;
2586 	read_params.wmp = (uintptr_t)wm;
2587 	read_params.offset = offset;
2588 	MD_SETDRIVERNAME(&read_params, MD_SP, sp->setno);
2589 
2590 	if (metaioctl(MD_IOC_SPREADWM, &read_params,
2591 	    &read_params.mde, compnp->cname) != 0) {
2592 
2593 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
2594 		    "Extent header read failed, block %llu.\n"), offset);
2595 		return (mdstealerror(ep, &read_params.mde));
2596 	}
2597 
2598 	/* make sure magic number is correct */
2599 	if (wm->wm_magic != MD_SP_MAGIC) {
2600 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
2601 		    "found incorrect magic number %x, expected %x.\n"),
2602 		    wm->wm_magic, MD_SP_MAGIC);
2603 		/*
2604 		 * Pass NULL for the device name as we don't have
2605 		 * valid watermark contents.
2606 		 */
2607 		return (mdmderror(ep, MDE_SP_BADWMMAGIC, 0, NULL));
2608 	}
2609 
2610 	if (crcchk((uchar_t *)wm, (uint_t *)&wm->wm_checksum,
2611 	    sizeof (*wm), NULL)) {
2612 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
2613 		    "found incorrect checksum %x.\n"),
2614 		    wm->wm_checksum);
2615 		return (mdmderror(ep, MDE_SP_BADWMCRC, 0, wm->wm_mdname));
2616 	}
2617 
2618 	return (0);
2619 }
2620 
2621 /*
2622  * **************************************************************************
2623  *                  Query Functions
2624  * **************************************************************************
2625  */
2626 
2627 /*
2628  * IMPORTANT NOTE: This is a static function that assumes that
2629  *		   its input parameters have been checked and
2630  *		   have valid values that lie within acceptable
2631  *		   ranges.
2632  *
2633  * FUNCTION:	meta_sp_enough_space()
2634  * INPUT:	desired_number_of_sps - the number of soft partitions desired;
2635  *					must be > 0
2636  *		desired_sp_size - the desired soft partition size in blocks;
2637  *				  must be > 0
2638  *		extent_listpp - a reference to a reference to an extent
2639  *				list that lists the extents on a device;
2640  *				must be a reference to a reference to a
2641  *				valid extent list
2642  *		alignment - the desired data space alignment for the sp's
2643  * OUTPUT:	boolean_t return value
2644  * RETURNS:	boolean_t - B_TRUE if there's enough space in the extent
2645  *			    list to create the desired soft partitions,
2646  *			    B_FALSE if there's not enough space
2647  * PURPOSE:	determines whether there's enough free space in an extent
2648  *		list to allow creation of a set of soft partitions
2649  */
2650 static boolean_t
2651 meta_sp_enough_space(
2652 	int		desired_number_of_sps,
2653 	blkcnt_t	desired_sp_size,
2654 	sp_ext_node_t	**extent_listpp,
2655 	sp_ext_length_t	alignment
2656 )
2657 {
2658 	boolean_t		enough_space;
2659 	int			number_of_sps;
2660 	int			number_of_extents_used;
2661 	sp_ext_length_t		desired_ext_length = desired_sp_size;
2662 
2663 	enough_space = B_TRUE;
2664 	number_of_sps = 0;
2665 	while ((enough_space == B_TRUE) &&
2666 	    (number_of_sps < desired_number_of_sps)) {
2667 		/*
2668 		 * Use the extent allocation algorithm implemented by
2669 		 * meta_sp_alloc_by_len() to test whether the free
2670 		 * extents in the extent list referenced by *extent_listpp
2671 		 * contain enough space to accomodate a soft partition
2672 		 * of size desired_ext_length.
2673 		 *
2674 		 * Repeat the test <desired_number_of_sps> times
2675 		 * or until it fails, whichever comes first,
2676 		 * each time allocating the extents required to
2677 		 * create the soft partition without actually
2678 		 * creating the soft partition.
2679 		 */
2680 		number_of_extents_used = meta_sp_alloc_by_len(
2681 		    TEST_SETNAMEP, TEST_SOFT_PARTITION_NAMEP,
2682 		    extent_listpp, &desired_ext_length,
2683 		    NO_OFFSET, alignment);
2684 		if (number_of_extents_used == -1) {
2685 			enough_space = B_FALSE;
2686 		} else {
2687 			number_of_sps++;
2688 		}
2689 	}
2690 	return (enough_space);
2691 }
2692 
2693 /*
2694  * IMPORTANT NOTE: This is a static function that calls other functions
2695  *		   that check its mdsetnamep and device_mdnamep
2696  *		   input parameters, but expects extent_listpp to
2697  *		   be a initialized to a valid address to which
2698  *		   it can write a reference to the extent list that
2699  *		   it creates.
2700  *
2701  * FUNCTION:	meta_sp_get_extent_list()
2702  * INPUT:	mdsetnamep - a reference to the mdsetname_t structure
2703  *			     for the set containing the device for
2704  *			     which the extents are to be listed
2705  *		device_mdnamep - a reference to the mdname_t structure
2706  *				 for the device for which the extents
2707  *				 are to be listed
2708  * OUTPUT:	*extent_listpp - a reference to the extent list for
2709  *				 the device; NULL if the function fails
2710  *		*ep - the libmeta error encountered, if any
2711  * RETURNS:	boolean_t - B_TRUE if the function call was successful,
2712  *			    B_FALSE if not
2713  * PURPOSE:	gets the extent list for a device
2714  */
2715 static boolean_t
2716 meta_sp_get_extent_list(
2717 	mdsetname_t	*mdsetnamep,
2718 	mdname_t	*device_mdnamep,
2719 	sp_ext_node_t	**extent_listpp,
2720 	md_error_t	*ep
2721 )
2722 {
2723 	diskaddr_t		device_size_in_blocks;
2724 	mdnamelist_t		*sp_name_listp;
2725 	diskaddr_t		start_block_address_in_blocks;
2726 
2727 	*extent_listpp = NULL;
2728 	sp_name_listp = NULL;
2729 
2730 	start_block_address_in_blocks = meta_sp_get_start(mdsetnamep,
2731 	    device_mdnamep, ep);
2732 	if (start_block_address_in_blocks == MD_DISKADDR_ERROR) {
2733 		if (getenv(META_SP_DEBUG)) {
2734 			mde_perror(ep,
2735 			    "meta_sp_get_extent_list:meta_sp_get_start");
2736 		}
2737 		return (B_FALSE);
2738 	}
2739 
2740 	device_size_in_blocks = metagetsize(device_mdnamep, ep);
2741 	if (device_size_in_blocks == MD_DISKADDR_ERROR) {
2742 		if (getenv(META_SP_DEBUG)) {
2743 			mde_perror(ep,
2744 			    "meta_sp_get_extent_list:metagetsize");
2745 		}
2746 		return (B_FALSE);
2747 	}
2748 
2749 	/*
2750 	 * Sanity check: the start block will have skipped an integer
2751 	 * number of cylinders, C.  C will usually be zero.  If (C > 0),
2752 	 * and the disk slice happens to only be C cylinders in total
2753 	 * size, we'll fail this check.
2754 	 */
2755 	if (device_size_in_blocks <=
2756 	    (start_block_address_in_blocks + MD_SP_WMSIZE)) {
2757 		(void) mdmderror(ep, MDE_SP_NOSPACE, 0, device_mdnamep->cname);
2758 		return (B_FALSE);
2759 	}
2760 
2761 	/*
2762 	 * After this point, we will have allocated resources, so any
2763 	 * failure returns must be through the supplied "fail" label
2764 	 * to properly deallocate things.
2765 	 */
2766 
2767 	/*
2768 	 * Create an empty extent list that starts one watermark past
2769 	 * the start block of the device and ends one watermark before
2770 	 * the end of the device.
2771 	 */
2772 	meta_sp_list_insert(TEST_SETNAMEP, TEST_SOFT_PARTITION_NAMEP,
2773 	    extent_listpp, NO_OFFSET,
2774 	    (sp_ext_length_t)start_block_address_in_blocks,
2775 	    EXTTYP_RESERVED, NO_SEQUENCE_NUMBER, NO_FLAGS,
2776 	    meta_sp_cmp_by_offset);
2777 	meta_sp_list_insert(TEST_SETNAMEP, TEST_SOFT_PARTITION_NAMEP,
2778 	    extent_listpp, (sp_ext_offset_t)(device_size_in_blocks -
2779 	    MD_SP_WMSIZE), MD_SP_WMSIZE, EXTTYP_END, NO_SEQUENCE_NUMBER,
2780 	    NO_FLAGS, meta_sp_cmp_by_offset);
2781 
2782 	/*
2783 	 * Get the list of soft partitions that are already on the
2784 	 * device.
2785 	 */
2786 	if (meta_sp_get_by_component(mdsetnamep, device_mdnamep,
2787 	    &sp_name_listp, FORCE_RELOAD_CACHE, ep) < 1) {
2788 		if (getenv(META_SP_DEBUG)) {
2789 			mde_perror(ep,
2790 			    "meta_sp_get_extent_list:meta_sp_get_by_component");
2791 		}
2792 		goto fail;
2793 	}
2794 
2795 	if (sp_name_listp != NULL) {
2796 		/*
2797 		 * If there are soft partitions on the device, add the
2798 		 * extents used in them to the extent list.
2799 		 */
2800 		if (meta_sp_extlist_from_namelist(mdsetnamep, sp_name_listp,
2801 		    extent_listpp, ep) == -1) {
2802 			if (getenv(META_SP_DEBUG)) {
2803 				mde_perror(ep, "meta_sp_get_extent_list:"
2804 				    "meta_sp_extlist_from_namelist");
2805 			}
2806 			goto fail;
2807 		}
2808 		metafreenamelist(sp_name_listp);
2809 	}
2810 
2811 	/*
2812 	 * Add free extents to the extent list to represent
2813 	 * the remaining regions of free space on the
2814 	 * device.
2815 	 */
2816 	meta_sp_list_freefill(extent_listpp, device_size_in_blocks);
2817 	return (B_TRUE);
2818 
2819 fail:
2820 	if (sp_name_listp != NULL) {
2821 		metafreenamelist(sp_name_listp);
2822 	}
2823 
2824 	if (*extent_listpp != NULL) {
2825 		/*
2826 		 * meta_sp_list_free sets *extent_listpp to NULL.
2827 		 */
2828 		meta_sp_list_free(extent_listpp);
2829 	}
2830 	return (B_FALSE);
2831 }
2832 
2833 /*
2834  * IMPORTANT NOTE: This is a static function that calls other functions
2835  *		   that check its mdsetnamep and mddrivenamep
2836  *		   input parameters, but expects extent_listpp to
2837  *		   be a initialized to a valid address to which
2838  *		   it can write a reference to the extent list that
2839  *		   it creates.
2840  *
2841  * FUNCTION:	meta_sp_get_extent_list_for_drive()
2842  * INPUT:	mdsetnamep - a reference to the mdsetname_t structure
2843  *			     for the set containing the drive for
2844  *			     which the extents are to be listed
2845  *		mddrivenamep   - a reference to the mddrivename_t structure
2846  *				 for the drive for which the extents
2847  *				 are to be listed
2848  * OUTPUT:	*extent_listpp - a reference to the extent list for
2849  *				 the drive; NULL if the function fails
2850  * RETURNS:	boolean_t - B_TRUE if the function call was successful,
2851  *			    B_FALSE if not
2852  * PURPOSE:	gets the extent list for a drive when the entire drive
2853  *		is to be soft partitioned
2854  */
2855 static boolean_t
2856 meta_sp_get_extent_list_for_drive(
2857 	mdsetname_t	*mdsetnamep,
2858 	mddrivename_t	*mddrivenamep,
2859 	sp_ext_node_t	**extent_listpp
2860 )
2861 {
2862 	boolean_t		can_use;
2863 	diskaddr_t		free_space;
2864 	md_error_t		mderror;
2865 	mdvtoc_t		proposed_vtoc;
2866 	int			repartition_options;
2867 	int			return_value;
2868 	md_sp_t			test_sp_struct;
2869 
2870 	can_use = B_TRUE;
2871 	*extent_listpp = NULL;
2872 	mderror = mdnullerror;
2873 	test_sp_struct.compnamep = metaslicename(mddrivenamep, MD_SLICE0,
2874 	    &mderror);
2875 	if (test_sp_struct.compnamep == NULL) {
2876 		can_use = B_FALSE;
2877 	}
2878 
2879 	if (can_use == B_TRUE) {
2880 		mderror = mdnullerror;
2881 		repartition_options = 0;
2882 		return_value = meta_check_sp(mdsetnamep, &test_sp_struct,
2883 		    MDCMD_USE_WHOLE_DISK, &repartition_options, &mderror);
2884 		if (return_value != 0) {
2885 			can_use = B_FALSE;
2886 		}
2887 	}
2888 
2889 	if (can_use == B_TRUE) {
2890 		mderror = mdnullerror;
2891 		repartition_options = repartition_options |
2892 		    (MD_REPART_FORCE | MD_REPART_DONT_LABEL);
2893 		return_value = meta_repartition_drive(mdsetnamep, mddrivenamep,
2894 		    repartition_options, &proposed_vtoc, &mderror);
2895 		if (return_value != 0) {
2896 			can_use = B_FALSE;
2897 		}
2898 	}
2899 
2900 	if (can_use == B_TRUE) {
2901 		free_space = proposed_vtoc.parts[MD_SLICE0].size;
2902 		if (free_space <= (MD_SP_START + MD_SP_WMSIZE)) {
2903 			can_use = B_FALSE;
2904 		}
2905 	}
2906 
2907 	if (can_use == B_TRUE) {
2908 		/*
2909 		 * Create an extent list that starts with
2910 		 * a reserved extent that ends at the start
2911 		 * of the usable space on slice zero of the
2912 		 * proposed VTOC, ends with an extent that
2913 		 * reserves space for a watermark at the end
2914 		 * of slice zero, and contains a single free
2915 		 * extent that occupies the rest of the space
2916 		 * on the slice.
2917 		 *
2918 		 * NOTE:
2919 		 *
2920 		 * Don't use metagetstart() or metagetsize() to
2921 		 * find the usable space.  They query the mdname_t
2922 		 * structure that represents an actual device to
2923 		 * determine the amount of space on the device that
2924 		 * contains metadata and the total amount of space
2925 		 * on the device.  Since this function creates a
2926 		 * proposed extent list that doesn't reflect the
2927 		 * state of an actual device, there's no mdname_t
2928 		 * structure to be queried.
2929 		 *
2930 		 * When a drive is reformatted to prepare for
2931 		 * soft partitioning, all of slice seven is
2932 		 * reserved for metadata, all of slice zero is
2933 		 * available for soft partitioning, and all other
2934 		 * slices on the drive are empty.  The proposed
2935 		 * extent list for the drive therefore contains
2936 		 * only three extents: a reserved extent that ends
2937 		 * at the start of the usable space on slice zero,
2938 		 * a single free extent that occupies all the usable
2939 		 * space on slice zero, and an ending extent that
2940 		 * reserves space for a watermark at the end of
2941 		 * slice zero.
2942 		 */
2943 		meta_sp_list_insert(TEST_SETNAMEP, TEST_SOFT_PARTITION_NAMEP,
2944 		    extent_listpp, NO_OFFSET, (sp_ext_length_t)(MD_SP_START),
2945 		    EXTTYP_RESERVED, NO_SEQUENCE_NUMBER, NO_FLAGS,
2946 		    meta_sp_cmp_by_offset);
2947 		meta_sp_list_insert(TEST_SETNAMEP, TEST_SOFT_PARTITION_NAMEP,
2948 		    extent_listpp, (sp_ext_offset_t)(free_space - MD_SP_WMSIZE),
2949 		    MD_SP_WMSIZE, EXTTYP_END, NO_SEQUENCE_NUMBER, NO_FLAGS,
2950 		    meta_sp_cmp_by_offset);
2951 		meta_sp_list_freefill(extent_listpp, free_space);
2952 	}
2953 	return (can_use);
2954 }
2955 
2956 /*
2957  * FUNCTION:	meta_sp_can_create_sps()
2958  * INPUT:	mdsetnamep - a reference to the mdsetname_t structure
2959  *			     for the set containing the device for
2960  *			     which the extents are to be listed
2961  *		mdnamep - a reference to the mdname_t of the device
2962  *			  on which the soft parititions are to be created
2963  *		number_of_sps - the desired number of soft partitions
2964  *		sp_size - the desired soft partition size
2965  * OUTPUT:	boolean_t return value
2966  * RETURNS:	boolean_t - B_TRUE if the soft partitionns can be created,
2967  *			    B_FALSE if not
2968  * PURPOSE:	determines whether a set of soft partitions can be created
2969  *		on a device
2970  */
2971 boolean_t
2972 meta_sp_can_create_sps(
2973 	mdsetname_t	*mdsetnamep,
2974 	mdname_t	*mdnamep,
2975 	int		number_of_sps,
2976 	blkcnt_t	sp_size
2977 )
2978 {
2979 	sp_ext_node_t	*extent_listp;
2980 	boolean_t	succeeded;
2981 	md_error_t	mde;
2982 
2983 	if ((number_of_sps > 0) && (sp_size > 0)) {
2984 		succeeded = meta_sp_get_extent_list(mdsetnamep, mdnamep,
2985 		    &extent_listp, &mde);
2986 	} else {
2987 		succeeded = B_FALSE;
2988 	}
2989 
2990 	/*
2991 	 * We don't really care about an error return from the
2992 	 * alignment call; that will just result in passing zero,
2993 	 * which will be interpreted as no alignment.
2994 	 */
2995 
2996 	if (succeeded == B_TRUE) {
2997 		succeeded = meta_sp_enough_space(number_of_sps,
2998 		    sp_size, &extent_listp,
2999 		    meta_sp_get_default_alignment(mdsetnamep, mdnamep, &mde));
3000 		meta_sp_list_free(&extent_listp);
3001 	}
3002 	return (succeeded);
3003 }
3004 
3005 /*
3006  * FUNCTION:	meta_sp_can_create_sps_on_drive()
3007  * INPUT:	mdsetnamep - a reference to the mdsetname_t structure
3008  *			     for the set containing the drive for
3009  *			     which the extents are to be listed
3010  *		mddrivenamep - a reference to the mddrivename_t of the drive
3011  *			       on which the soft parititions are to be created
3012  *		number_of_sps - the desired number of soft partitions
3013  *		sp_size - the desired soft partition size
3014  * OUTPUT:	boolean_t return value
3015  * RETURNS:	boolean_t - B_TRUE if the soft partitionns can be created,
3016  *			    B_FALSE if not
3017  * PURPOSE:	determines whether a set of soft partitions can be created
3018  *		on a drive if the entire drive is soft partitioned
3019  */
3020 boolean_t
3021 meta_sp_can_create_sps_on_drive(
3022 	mdsetname_t	*mdsetnamep,
3023 	mddrivename_t	*mddrivenamep,
3024 	int		number_of_sps,
3025 	blkcnt_t	sp_size
3026 )
3027 {
3028 	sp_ext_node_t	*extent_listp;
3029 	boolean_t	succeeded;
3030 
3031 	if ((number_of_sps > 0) && (sp_size > 0)) {
3032 		succeeded = meta_sp_get_extent_list_for_drive(mdsetnamep,
3033 		    mddrivenamep, &extent_listp);
3034 	} else {
3035 		succeeded = B_FALSE;
3036 	}
3037 
3038 	/*
3039 	 * We don't care about alignment on the space call because
3040 	 * we're specifically dealing with a drive, which will have no
3041 	 * inherent alignment.
3042 	 */
3043 
3044 	if (succeeded == B_TRUE) {
3045 		succeeded = meta_sp_enough_space(number_of_sps, sp_size,
3046 		    &extent_listp, SP_UNALIGNED);
3047 		meta_sp_list_free(&extent_listp);
3048 	}
3049 	return (succeeded);
3050 }
3051 
3052 /*
3053  * FUNCTION:	meta_sp_get_free_space()
3054  * INPUT:	mdsetnamep - a reference to the mdsetname_t structure
3055  *			     for the set containing the device for
3056  *			     which the free space is to be returned
3057  *		mdnamep - a reference to the mdname_t of the device
3058  *			  for which the free space is to be returned
3059  * OUTPUT:	blkcnt_t return value
3060  * RETURNS:	blkcnt_t - the number of blocks of free space on the device
3061  * PURPOSE:	returns the number of blocks of free space on a device
3062  */
3063 blkcnt_t
3064 meta_sp_get_free_space(
3065 	mdsetname_t	*mdsetnamep,
3066 	mdname_t	*mdnamep
3067 )
3068 {
3069 	sp_ext_node_t		*extent_listp;
3070 	sp_ext_length_t		free_blocks;
3071 	boolean_t		succeeded;
3072 	md_error_t		mde;
3073 
3074 	extent_listp = NULL;
3075 	free_blocks = 0;
3076 	succeeded = meta_sp_get_extent_list(mdsetnamep, mdnamep,
3077 	    &extent_listp, &mde);
3078 	if (succeeded == B_TRUE) {
3079 		free_blocks = meta_sp_list_size(extent_listp,
3080 		    EXTTYP_FREE, INCLUDE_WM);
3081 		meta_sp_list_free(&extent_listp);
3082 		if (free_blocks > (10 * MD_SP_WMSIZE)) {
3083 			/*
3084 			 * Subtract a safety margin for watermarks when
3085 			 * computing the number of blocks available for
3086 			 * use.  The actual number of watermarks can't
3087 			 * be calculated without knowing the exact numbers
3088 			 * and sizes of both the free extents and the soft
3089 			 * partitions to be created.  The calculation is
3090 			 * highly complex and error-prone even if those
3091 			 * quantities are known.  The approximate value
3092 			 * 10 * MD_SP_WMSIZE is within a few blocks of the
3093 			 * correct value in all practical cases.
3094 			 */
3095 			free_blocks = free_blocks - (10 * MD_SP_WMSIZE);
3096 		} else {
3097 			free_blocks = 0;
3098 		}
3099 	} else {
3100 		mdclrerror(&mde);
3101 	}
3102 
3103 	return (free_blocks);
3104 }
3105 
3106 /*
3107  * FUNCTION:	meta_sp_get_free_space_on_drive()
3108  * INPUT:	mdsetnamep - a reference to the mdsetname_t structure
3109  *			     for the set containing the drive for
3110  *			     which the free space is to be returned
3111  *		mddrivenamep - a reference to the mddrivename_t of the drive
3112  *			       for which the free space is to be returned
3113  * OUTPUT:	blkcnt_t return value
3114  * RETURNS:	blkcnt_t - the number of blocks of free space on the drive
3115  * PURPOSE:	returns the number of blocks of space usable for soft
3116  *		partitions on an entire drive, if the entire drive is
3117  *		soft partitioned
3118  */
3119 blkcnt_t
3120 meta_sp_get_free_space_on_drive(
3121 	mdsetname_t	*mdsetnamep,
3122 	mddrivename_t	*mddrivenamep
3123 )
3124 {
3125 	sp_ext_node_t		*extent_listp;
3126 	sp_ext_length_t		free_blocks;
3127 	boolean_t		succeeded;
3128 
3129 	extent_listp = NULL;
3130 	free_blocks = 0;
3131 	succeeded = meta_sp_get_extent_list_for_drive(mdsetnamep,
3132 	    mddrivenamep, &extent_listp);
3133 	if (succeeded == B_TRUE) {
3134 		free_blocks = meta_sp_list_size(extent_listp,
3135 		    EXTTYP_FREE, INCLUDE_WM);
3136 		meta_sp_list_free(&extent_listp);
3137 		if (free_blocks > (10 * MD_SP_WMSIZE)) {
3138 			/*
3139 			 * Subtract a safety margin for watermarks when
3140 			 * computing the number of blocks available for
3141 			 * use.  The actual number of watermarks can't
3142 			 * be calculated without knowing the exact numbers
3143 			 * and sizes of both the free extents and the soft
3144 			 * partitions to be created.  The calculation is
3145 			 * highly complex and error-prone even if those
3146 			 * quantities are known.  The approximate value
3147 			 * 10 * MD_SP_WMSIZE is within a few blocks of the
3148 			 * correct value in all practical cases.
3149 			 */
3150 			free_blocks = free_blocks - (10 * MD_SP_WMSIZE);
3151 		} else {
3152 			free_blocks = 0;
3153 		}
3154 	}
3155 	return (free_blocks);
3156 }
3157 
3158 /*
3159  * FUNCTION:	meta_sp_get_number_of_possible_sps()
3160  * INPUT:	mdsetnamep - a reference to the mdsetname_t structure
3161  *			     for the set containing the device for
3162  *			     which the number of possible soft partitions
3163  *			     is to be returned
3164  *		mdnamep - a reference to the mdname_t of the device
3165  *			  for which the number of possible soft partitions
3166  *			  is to be returned
3167  * OUTPUT:	int return value
3168  * RETURNS:	int - the number of soft partitions of the desired size
3169  *		      that can be created on the device
3170  * PURPOSE:	returns the number of soft partitions of a given size
3171  *		that can be created on a device
3172  */
3173 int
3174 meta_sp_get_number_of_possible_sps(
3175 	mdsetname_t	*mdsetnamep,
3176 	mdname_t	*mdnamep,
3177 	blkcnt_t	sp_size
3178 )
3179 {
3180 	sp_ext_node_t	*extent_listp;
3181 	int		number_of_possible_sps;
3182 	boolean_t	succeeded;
3183 	md_error_t	mde;
3184 	sp_ext_length_t	alignment;
3185 
3186 	extent_listp = NULL;
3187 	number_of_possible_sps = 0;
3188 	if (sp_size > 0) {
3189 		if ((succeeded = meta_sp_get_extent_list(mdsetnamep,
3190 		    mdnamep, &extent_listp, &mde)) == B_FALSE)
3191 			mdclrerror(&mde);
3192 	} else {
3193 		succeeded = B_FALSE;
3194 	}
3195 
3196 	if (succeeded == B_TRUE) {
3197 		alignment = meta_sp_get_default_alignment(mdsetnamep,
3198 		    mdnamep, &mde);
3199 	}
3200 
3201 	while (succeeded == B_TRUE) {
3202 		/*
3203 		 * Keep allocating space from the extent list
3204 		 * for soft partitions of the desired size until
3205 		 * there's not enough free space left in the list
3206 		 * for another soft partiition of that size.
3207 		 * Add one to the number of possible soft partitions
3208 		 * for each soft partition for which there is
3209 		 * enough free space left.
3210 		 */
3211 		succeeded = meta_sp_enough_space(ONE_SOFT_PARTITION,
3212 		    sp_size, &extent_listp, alignment);
3213 		if (succeeded == B_TRUE) {
3214 			number_of_possible_sps++;
3215 		}
3216 	}
3217 	if (extent_listp != NULL) {
3218 		meta_sp_list_free(&extent_listp);
3219 	}
3220 	return (number_of_possible_sps);
3221 }
3222 
3223 /*
3224  * FUNCTION:	meta_sp_get_number_of_possible_sps_on_drive()
3225  * INPUT:	mdsetnamep - a reference to the mdsetname_t structure
3226  *			     for the set containing the drive for
3227  *			     which the number of possible soft partitions
3228  *			     is to be returned
3229  *		mddrivenamep - a reference to the mddrivename_t of the drive
3230  *			       for which the number of possible soft partitions
3231  *			       is to be returned
3232  *		sp_size - the size in blocks of the proposed soft partitions
3233  * OUTPUT:	int return value
3234  * RETURNS:	int - the number of soft partitions of the desired size
3235  *		      that can be created on the drive
3236  * PURPOSE:	returns the number of soft partitions of a given size
3237  *		that can be created on a drive, if the entire drive is
3238  *		soft partitioned
3239  */
3240 int
3241 meta_sp_get_number_of_possible_sps_on_drive(
3242 	mdsetname_t	*mdsetnamep,
3243 	mddrivename_t	*mddrivenamep,
3244 	blkcnt_t	sp_size
3245 )
3246 {
3247 	sp_ext_node_t	*extent_listp;
3248 	int		number_of_possible_sps;
3249 	boolean_t	succeeded;
3250 
3251 	extent_listp = NULL;
3252 	number_of_possible_sps = 0;
3253 	if (sp_size > 0) {
3254 		succeeded = meta_sp_get_extent_list_for_drive(mdsetnamep,
3255 		    mddrivenamep, &extent_listp);
3256 	} else {
3257 		succeeded = B_FALSE;
3258 	}
3259 	while (succeeded == B_TRUE) {
3260 		/*
3261 		 * Keep allocating space from the extent list
3262 		 * for soft partitions of the desired size until
3263 		 * there's not enough free space left in the list
3264 		 * for another soft partition of that size.
3265 		 * Add one to the number of possible soft partitions
3266 		 * for each soft partition for which there is
3267 		 * enough free space left.
3268 		 *
3269 		 * Since it's a drive, not a metadevice, make no
3270 		 * assumptions about alignment.
3271 		 */
3272 		succeeded = meta_sp_enough_space(ONE_SOFT_PARTITION,
3273 		    sp_size, &extent_listp, SP_UNALIGNED);
3274 		if (succeeded == B_TRUE) {
3275 			number_of_possible_sps++;
3276 		}
3277 	}
3278 	if (extent_listp != NULL) {
3279 		meta_sp_list_free(&extent_listp);
3280 	}
3281 	return (number_of_possible_sps);
3282 }
3283 
3284 /*
3285  * FUNCTION:	meta_sp_get_possible_sp_size()
3286  * INPUT:	mdsetnamep - a reference to the mdsetname_t structure
3287  *			     for the set containing the device for
3288  *			     which the possible soft partition size
3289  *			     is to be returned
3290  *		mdnamep - a reference to the mdname_t of the device
3291  *			  for which the possible soft partition size
3292  *			  is to be returned
3293  *		number_of_sps - the desired number of soft partitions
3294  * OUTPUT:	blkcnt_t return value
3295  * RETURNS:	blkcnt_t - the possible soft partition size in blocks
3296  * PURPOSE:	returns the maximum possible size of each of a given number of
3297  *		soft partitions of equal size that can be created on a device
3298  */
3299 blkcnt_t
3300 meta_sp_get_possible_sp_size(
3301 	mdsetname_t	*mdsetnamep,
3302 	mdname_t	*mdnamep,
3303 	int		number_of_sps
3304 )
3305 {
3306 	blkcnt_t	free_blocks;
3307 	blkcnt_t	sp_size;
3308 	boolean_t	succeeded;
3309 
3310 	sp_size = 0;
3311 	if (number_of_sps > 0) {
3312 		free_blocks = meta_sp_get_free_space(mdsetnamep, mdnamep);
3313 		sp_size = free_blocks / number_of_sps;
3314 		succeeded = meta_sp_can_create_sps(mdsetnamep, mdnamep,
3315 		    number_of_sps, sp_size);
3316 		while ((succeeded == B_FALSE) && (sp_size > 0)) {
3317 			/*
3318 			 * To compensate for space that may have been
3319 			 * occupied by watermarks, reduce sp_size by a
3320 			 * number of blocks equal to the number of soft
3321 			 * partitions desired, and test again to see
3322 			 * whether the desired number of soft partitions
3323 			 * can be created.
3324 			 */
3325 			sp_size = sp_size - ((blkcnt_t)number_of_sps);
3326 			succeeded = meta_sp_can_create_sps(mdsetnamep, mdnamep,
3327 			    number_of_sps, sp_size);
3328 		}
3329 		if (sp_size < 0) {
3330 			sp_size = 0;
3331 		}
3332 	}
3333 	return (sp_size);
3334 }
3335 
3336 /*
3337  * FUNCTION:	meta_sp_get_possible_sp_size_on_drive()
3338  * INPUT:	mdsetnamep - a reference to the mdsetname_t structure
3339  *			     for the set containing the drive for
3340  *			     which the possible soft partition size
3341  *			     is to be returned
3342  *		mddrivenamep - a reference to the mddrivename_t of the drive
3343  *			       for which the possible soft partition size
3344  *			       is to be returned
3345  *		number_of_sps - the desired number of soft partitions
3346  * OUTPUT:	blkcnt_t return value
3347  * RETURNS:	blkcnt_t - the possible soft partition size in blocks
3348  * PURPOSE:	returns the maximum possible size of each of a given number of
3349  *		soft partitions of equal size that can be created on a drive
3350  *              if the entire drive is soft partitioned
3351  */
3352 blkcnt_t
3353 meta_sp_get_possible_sp_size_on_drive(
3354 	mdsetname_t	*mdsetnamep,
3355 	mddrivename_t	*mddrivenamep,
3356 	int		number_of_sps
3357 )
3358 {
3359 	blkcnt_t	free_blocks;
3360 	blkcnt_t	sp_size;
3361 	boolean_t	succeeded;
3362 
3363 	sp_size = 0;
3364 	if (number_of_sps > 0) {
3365 		free_blocks = meta_sp_get_free_space_on_drive(mdsetnamep,
3366 		    mddrivenamep);
3367 		sp_size = free_blocks / number_of_sps;
3368 		succeeded = meta_sp_can_create_sps_on_drive(mdsetnamep,
3369 		    mddrivenamep, number_of_sps, sp_size);
3370 		while ((succeeded == B_FALSE) && (sp_size > 0)) {
3371 			/*
3372 			 * To compensate for space that may have been
3373 			 * occupied by watermarks, reduce sp_size by a
3374 			 * number of blocks equal to the number of soft
3375 			 * partitions desired, and test again to see
3376 			 * whether the desired number of soft partitions
3377 			 * can be created.
3378 			 */
3379 			sp_size = sp_size - ((blkcnt_t)number_of_sps);
3380 			succeeded = meta_sp_can_create_sps_on_drive(mdsetnamep,
3381 			    mddrivenamep, number_of_sps, sp_size);
3382 		}
3383 		if (sp_size < 0) {
3384 			sp_size = 0;
3385 		}
3386 	}
3387 	return (sp_size);
3388 }
3389 
3390 /*
3391  * **************************************************************************
3392  *                  Unit Structure Manipulation Functions                   *
3393  * **************************************************************************
3394  */
3395 
3396 /*
3397  * FUNCTION:	meta_sp_fillextarray()
3398  * INPUT:	mp	- the unit structure to fill
3399  *		extlist	- the list of extents to fill with
3400  * OUTPUT:	none
3401  * RETURNS:	void
3402  * PURPOSE:	fills in the unit structure extent list with the extents
3403  *		specified by extlist.  Only extents in extlist with the
3404  *		EXTFLG_UPDATE flag are changed in the unit structure,
3405  *		and the index into the unit structure is the sequence
3406  *		number in the extent list.  After all of the nodes have
3407  *		been updated the virtual offsets in the unit structure
3408  *		are updated to reflect the new lengths.
3409  */
3410 static void
3411 meta_sp_fillextarray(
3412 	mp_unit_t	*mp,
3413 	sp_ext_node_t	*extlist
3414 )
3415 {
3416 	int	i;
3417 	sp_ext_node_t	*ext;
3418 	sp_ext_offset_t	curvoff = 0LL;
3419 
3420 	assert(mp != NULL);
3421 
3422 	/* go through the allocation list and fill in our unit structure */
3423 	for (ext = extlist; ext != NULL; ext = ext->ext_next) {
3424 		if ((ext->ext_type == EXTTYP_ALLOC) &&
3425 		    (ext->ext_flags & EXTFLG_UPDATE) != 0) {
3426 			mp->un_ext[ext->ext_seq].un_poff =
3427 			    ext->ext_offset + MD_SP_WMSIZE;
3428 			mp->un_ext[ext->ext_seq].un_len =
3429 			    ext->ext_length - MD_SP_WMSIZE;
3430 		}
3431 	}
3432 
3433 	for (i = 0; i < mp->un_numexts; i++) {
3434 		assert(mp->un_ext[i].un_poff != 0);
3435 		assert(mp->un_ext[i].un_len  != 0);
3436 		mp->un_ext[i].un_voff = curvoff;
3437 		curvoff += mp->un_ext[i].un_len;
3438 	}
3439 }
3440 
3441 /*
3442  * FUNCTION:	meta_sp_createunit()
3443  * INPUT:	np	- the name of the device to create a unit structure for
3444  *		compnp	- the name of the device the soft partition is on
3445  *		extlist	- the extent list to populate the new unit with
3446  *		numexts	- the number of extents in the extent list
3447  *		len	- the total size of the soft partition (sectors)
3448  *		status	- the initial status of the unit structure
3449  * OUTPUT:	ep	- return error pointer
3450  * RETURNS:	mp_unit_t * - the new unit structure.
3451  * PURPOSE:	allocates and fills in a new soft partition unit
3452  *		structure to be passed to the soft partitioning driver
3453  *		for creation.
3454  */
3455 static mp_unit_t *
3456 meta_sp_createunit(
3457 	mdname_t	*np,
3458 	mdname_t	*compnp,
3459 	sp_ext_node_t	*extlist,
3460 	int		numexts,
3461 	sp_ext_length_t	len,
3462 	sp_status_t	status,
3463 	md_error_t	*ep
3464 )
3465 {
3466 	mp_unit_t	*mp;
3467 	uint_t		ms_size;
3468 
3469 	ms_size = (sizeof (*mp) - sizeof (mp->un_ext[0])) +
3470 	    (numexts * sizeof (mp->un_ext[0]));
3471 
3472 	mp = Zalloc(ms_size);
3473 
3474 	/* fill in fields in common unit structure */
3475 	mp->c.un_type = MD_METASP;
3476 	mp->c.un_size = ms_size;
3477 	MD_SID(mp) = meta_getminor(np->dev);
3478 	mp->c.un_total_blocks = len;
3479 	mp->c.un_actual_tb = len;
3480 
3481 	/* set up geometry */
3482 	(void) meta_sp_setgeom(np, compnp, mp, ep);
3483 
3484 	/* if we're building on metadevice we can't parent */
3485 	if (metaismeta(compnp))
3486 		MD_CAPAB(mp) = MD_CANT_PARENT;
3487 	else
3488 		MD_CAPAB(mp) = MD_CAN_PARENT;
3489 
3490 	/* fill soft partition-specific fields */
3491 	mp->un_dev = compnp->dev;
3492 	mp->un_key = compnp->key;
3493 
3494 	/* mdname_t start_blk field is not 64-bit! */
3495 	mp->un_start_blk = (sp_ext_offset_t)compnp->start_blk;
3496 	mp->un_status = status;
3497 	mp->un_numexts = numexts;
3498 	mp->un_length = len;
3499 
3500 	/* fill in the extent array */
3501 	meta_sp_fillextarray(mp, extlist);
3502 
3503 	return (mp);
3504 }
3505 
3506 /*
3507  * FUNCTION:	meta_sp_updateunit()
3508  * INPUT:	np       - name structure for the metadevice being updated
3509  *		old_un	 - the original unit structure that is being updated
3510  *		extlist	 - the extent list to populate the new unit with
3511  *		grow_len - the amount by which the partition is being grown
3512  *		numexts	 - the number of extents in the extent list
3513  *		ep       - return error pointer
3514  * OUTPUT:	none
3515  * RETURNS:	mp_unit_t * - the updated unit structure
3516  * PURPOSE:	allocates and fills in a new soft partition unit structure to
3517  *		be passed to the soft partitioning driver for creation.  The
3518  *		old unit structure is first copied in, and then the updated
3519  *		extents are changed in the new unit structure.  This is
3520  *		typically used when the size of an existing unit is changed.
3521  */
3522 static mp_unit_t *
3523 meta_sp_updateunit(
3524 	mdname_t	*np,
3525 	mp_unit_t	*old_un,
3526 	sp_ext_node_t	*extlist,
3527 	sp_ext_length_t	grow_len,
3528 	int		numexts,
3529 	md_error_t	*ep
3530 )
3531 {
3532 	mp_unit_t	*new_un;
3533 	sp_ext_length_t	new_len;
3534 	uint_t		new_size;
3535 
3536 	assert(old_un != NULL);
3537 	assert(extlist != NULL);
3538 
3539 	/* allocate new unit structure and copy in old unit */
3540 	new_size = (sizeof (*old_un) - sizeof (old_un->un_ext[0])) +
3541 	    ((old_un->un_numexts + numexts) * sizeof (old_un->un_ext[0]));
3542 	new_len = old_un->un_length + grow_len;
3543 	new_un = Zalloc(new_size);
3544 	bcopy(old_un, new_un, old_un->c.un_size);
3545 
3546 	/* update size and geometry information */
3547 	new_un->c.un_size = new_size;
3548 	new_un->un_length = new_len;
3549 	new_un->c.un_total_blocks = new_len;
3550 	new_un->c.un_actual_tb = new_len;
3551 	if (meta_adjust_geom((md_unit_t *)new_un, np,
3552 	    old_un->c.un_wr_reinstruct, old_un->c.un_rd_reinstruct,
3553 	    0, ep) != 0) {
3554 		Free(new_un);
3555 		return (NULL);
3556 	}
3557 
3558 	/* update extent information */
3559 	new_un->un_numexts += numexts;
3560 
3561 	meta_sp_fillextarray(new_un, extlist);
3562 
3563 	return (new_un);
3564 }
3565 
3566 /*
3567  * FUNCTION:	meta_get_sp()
3568  * INPUT:	sp	- the set name for the device to get
3569  *		np	- the name of the device to get
3570  * OUTPUT:	ep	- return error pointer
3571  * RETURNS:	md_sp_t * - the XDR unit structure for the soft partition
3572  * PURPOSE:	interface to the rest of libmeta for fetching a unit structure
3573  *		for the named device.  Just a wrapper for meta_get_sp_common().
3574  */
3575 md_sp_t *
3576 meta_get_sp(
3577 	mdsetname_t	*sp,
3578 	mdname_t	*np,
3579 	md_error_t	*ep
3580 )
3581 {
3582 	return (meta_get_sp_common(sp, np, 0, ep));
3583 }
3584 
3585 /*
3586  * FUNCTION:	meta_get_sp_common()
3587  * INPUT:	sp	- the set name for the device to get
3588  *		np	- the name of the device to get
3589  *		fast	- whether to use the cache or not (NOT IMPLEMENTED!)
3590  * OUTPUT:	ep	- return error pointer
3591  * RETURNS:	md_sp_t * - the XDR unit structure for the soft partition,
3592  *			    NULL if np is not a soft partition
3593  * PURPOSE:	common routine for fetching a soft partition unit structure
3594  */
3595 md_sp_t *
3596 meta_get_sp_common(
3597 	mdsetname_t	*sp,
3598 	mdname_t	*np,
3599 	int		fast,
3600 	md_error_t	*ep
3601 )
3602 {
3603 	mddrivename_t	*dnp = np->drivenamep;
3604 	char		*miscname;
3605 	mp_unit_t	*mp;
3606 	md_sp_t		*msp;
3607 	int		i;
3608 
3609 	/* must have set */
3610 	assert(sp != NULL);
3611 
3612 	/* short circuit */
3613 	if (dnp->unitp != NULL) {
3614 		if (dnp->unitp->type != MD_METASP)
3615 			return (NULL);
3616 		return ((md_sp_t *)dnp->unitp);
3617 	}
3618 	/* get miscname and unit */
3619 	if ((miscname = metagetmiscname(np, ep)) == NULL)
3620 		return (NULL);
3621 
3622 	if (strcmp(miscname, MD_SP) != 0) {
3623 		(void) mdmderror(ep, MDE_NOT_SP, 0, np->cname);
3624 		return (NULL);
3625 	}
3626 
3627 	if ((mp = (mp_unit_t *)meta_get_mdunit(sp, np, ep)) == NULL)
3628 		return (NULL);
3629 
3630 	assert(mp->c.un_type == MD_METASP);
3631 
3632 	/* allocate soft partition */
3633 	msp = Zalloc(sizeof (*msp));
3634 
3635 	/* get the common information */
3636 	msp->common.namep = np;
3637 	msp->common.type = mp->c.un_type;
3638 	msp->common.state = mp->c.un_status;
3639 	msp->common.capabilities = mp->c.un_capabilities;
3640 	msp->common.parent = mp->c.un_parent;
3641 	msp->common.size = mp->c.un_total_blocks;
3642 	msp->common.user_flags = mp->c.un_user_flags;
3643 	msp->common.revision = mp->c.un_revision;
3644 
3645 	/* get soft partition information */
3646 	if ((msp->compnamep = metakeyname(&sp, mp->un_key, fast, ep)) == NULL)
3647 		goto out;
3648 
3649 	/*
3650 	 * Fill in the key and the start block.  Note that the start
3651 	 * block in the unit structure is 64 bits but the name pointer
3652 	 * only supports 32 bits.
3653 	 */
3654 	msp->compnamep->key = mp->un_key;
3655 	msp->compnamep->start_blk = mp->un_start_blk;
3656 
3657 	/* fill in status field */
3658 	msp->status = mp->un_status;
3659 
3660 	/* allocate the extents */
3661 	msp->ext.ext_val = Zalloc(mp->un_numexts * sizeof (*msp->ext.ext_val));
3662 	msp->ext.ext_len = mp->un_numexts;
3663 
3664 	/* do the extents for this soft partition */
3665 	for (i = 0; i < mp->un_numexts; i++) {
3666 		struct mp_ext	*mde = &mp->un_ext[i];
3667 		md_sp_ext_t	*extp = &msp->ext.ext_val[i];
3668 
3669 		extp->voff = mde->un_voff;
3670 		extp->poff = mde->un_poff;
3671 		extp->len = mde->un_len;
3672 	}
3673 
3674 	/* cleanup, return success */
3675 	Free(mp);
3676 	dnp->unitp = (md_common_t *)msp;
3677 	return (msp);
3678 
3679 out:
3680 	/* clean up and return error */
3681 	Free(mp);
3682 	Free(msp);
3683 	return (NULL);
3684 }
3685 
3686 
3687 /*
3688  * FUNCTION:	meta_init_sp()
3689  * INPUT:	spp	- the set name for the new device
3690  *		argc	- the remaining argument count for the metainit cmdline
3691  *		argv	- the remainder of the unparsed command line
3692  *		options	- global options parsed by metainit
3693  * OUTPUT:	ep	- return error pointer
3694  * RETURNS:	int	- -1 failure, 0 success
3695  * PURPOSE:	provides the command line parsing and name management overhead
3696  *		for creating a new soft partition.  Ultimately this calls
3697  *		meta_create_sp() which does the real work of allocating space
3698  *		for the new soft partition.
3699  */
3700 int
3701 meta_init_sp(
3702 	mdsetname_t	**spp,
3703 	int		argc,
3704 	char		*argv[],
3705 	mdcmdopts_t	options,
3706 	md_error_t	*ep
3707 )
3708 {
3709 	char		*compname = NULL;
3710 	mdname_t	*spcompnp = NULL;	/* name of component volume */
3711 	char		*devname = argv[0];	/* unit name */
3712 	mdname_t	*np = NULL;		/* name of soft partition */
3713 	md_sp_t		*msp = NULL;
3714 	int		c;
3715 	int		old_optind;
3716 	sp_ext_length_t	len = 0LL;
3717 	int		rval = -1;
3718 	uint_t		seq;
3719 	int		oflag;
3720 	int		failed;
3721 	mddrivename_t	*dnp = NULL;
3722 	sp_ext_length_t	alignment = 0LL;
3723 	sp_ext_node_t	*extlist = NULL;
3724 
3725 	assert(argc > 0);
3726 
3727 	/* expect sp name, -p, optional -e, compname, and size parameters */
3728 	/* grab soft partition name */
3729 	if ((np = metaname(spp, devname, META_DEVICE, ep)) == NULL)
3730 		goto out;
3731 
3732 	/* see if it exists already */
3733 	if (metagetmiscname(np, ep) != NULL) {
3734 		(void) mdmderror(ep, MDE_UNIT_ALREADY_SETUP,
3735 		    meta_getminor(np->dev), devname);
3736 		goto out;
3737 	} else if (! mdismderror(ep, MDE_UNIT_NOT_SETUP)) {
3738 		goto out;
3739 	} else {
3740 		mdclrerror(ep);
3741 	}
3742 	--argc, ++argv;
3743 
3744 	if (argc == 0)
3745 		goto syntax;
3746 
3747 	/* grab -p */
3748 	if (strcmp(argv[0], "-p") != 0)
3749 		goto syntax;
3750 	--argc, ++argv;
3751 
3752 	if (argc == 0)
3753 		goto syntax;
3754 
3755 	/* see if -e is there */
3756 	if (strcmp(argv[0], "-e") == 0) {
3757 		/* use the whole disk */
3758 		options |= MDCMD_USE_WHOLE_DISK;
3759 		--argc, ++argv;
3760 	}
3761 
3762 	if (argc == 0)
3763 		goto syntax;
3764 
3765 	/* get component name */
3766 	compname = Strdup(argv[0]);
3767 
3768 	if (options & MDCMD_USE_WHOLE_DISK) {
3769 		if ((dnp = metadrivename(spp, compname, ep)) == NULL) {
3770 			goto out;
3771 		}
3772 		if ((spcompnp = metaslicename(dnp, 0, ep)) == NULL) {
3773 			goto out;
3774 		}
3775 	} else if ((spcompnp = metaname(spp, compname, UNKNOWN, ep)) == NULL) {
3776 		goto out;
3777 	}
3778 	assert(*spp != NULL);
3779 
3780 	if (!(options & MDCMD_NOLOCK)) {
3781 		/* grab set lock */
3782 		if (meta_lock(*spp, TRUE, ep))
3783 			goto out;
3784 
3785 		if (meta_check_ownership(*spp, ep) != 0)
3786 			goto out;
3787 	}
3788 
3789 	/* allocate the soft partition */
3790 	msp = Zalloc(sizeof (*msp));
3791 
3792 	/* setup common */
3793 	msp->common.namep = np;
3794 	msp->common.type = MD_METASP;
3795 
3796 	compname = spcompnp->cname;
3797 
3798 	assert(spcompnp->rname != NULL);
3799 	--argc, ++argv;
3800 
3801 	if (argc == 0) {
3802 		goto syntax;
3803 	}
3804 
3805 	if (*argv[0] == '-') {
3806 		/*
3807 		 * parse any other command line options, this includes
3808 		 * the recovery options -o and -b. The special thing
3809 		 * with these options is that the len needs to be
3810 		 * kept track of otherwise when the geometry of the
3811 		 * "device" is built it will create an invalid geometry
3812 		 */
3813 		old_optind = optind = 0;
3814 		opterr = 0;
3815 		oflag = 0;
3816 		seq = 0;
3817 		failed = 0;
3818 		while ((c = getopt(argc, argv, "A:o:b:")) != -1) {
3819 			sp_ext_offset_t	offset;
3820 			sp_ext_length_t	length;
3821 			longlong_t	tmp_size;
3822 
3823 			switch (c) {
3824 			case 'A':	/* data alignment */
3825 				if (meta_sp_parsesizestring(optarg,
3826 				    &alignment) == -1) {
3827 					failed = 1;
3828 				}
3829 				break;
3830 			case 'o':	/* offset in the partition */
3831 				if (oflag == 1) {
3832 					failed = 1;
3833 				} else {
3834 					tmp_size = atoll(optarg);
3835 					if (tmp_size <= 0) {
3836 						failed = 1;
3837 					} else {
3838 						oflag = 1;
3839 						options |= MDCMD_DIRECT;
3840 
3841 						offset = tmp_size;
3842 					}
3843 				}
3844 
3845 				break;
3846 			case 'b':	/* number of blocks */
3847 				if (oflag == 0) {
3848 					failed = 1;
3849 				} else {
3850 					tmp_size = atoll(optarg);
3851 					if (tmp_size <= 0) {
3852 						failed = 1;
3853 					} else {
3854 						oflag = 0;
3855 
3856 						length = tmp_size;
3857 
3858 						/* we have a pair of values */
3859 						meta_sp_list_insert(*spp, np,
3860 						    &extlist, offset, length,
3861 						    EXTTYP_ALLOC, seq++,
3862 						    EXTFLG_UPDATE,
3863 						    meta_sp_cmp_by_offset);
3864 						len += length;
3865 					}
3866 				}
3867 
3868 				break;
3869 			default:
3870 				argc -= old_optind;
3871 				argv += old_optind;
3872 				goto options;
3873 			}
3874 
3875 			if (failed) {
3876 				argc -= old_optind;
3877 				argv += old_optind;
3878 				goto syntax;
3879 			}
3880 
3881 			old_optind = optind;
3882 		}
3883 		argc -= optind;
3884 		argv += optind;
3885 
3886 		/*
3887 		 * Must have matching pairs of -o and -b flags
3888 		 */
3889 		if (oflag != 0)
3890 			goto syntax;
3891 
3892 		/*
3893 		 * Can't specify both layout (indicated indirectly by
3894 		 * len being set by the -o/-b cases above) AND
3895 		 * alignment
3896 		 */
3897 		if ((len > 0LL) && (alignment > 0LL))
3898 			goto syntax;
3899 
3900 		/*
3901 		 * sanity check the allocation list
3902 		 */
3903 		if ((extlist != NULL) && meta_sp_list_overlaps(extlist))
3904 			goto syntax;
3905 	}
3906 
3907 	if (len == 0LL) {
3908 		if (argc == 0)
3909 			goto syntax;
3910 		if (meta_sp_parsesize(argv[0], &len) == -1)
3911 			goto syntax;
3912 		--argc, ++argv;
3913 	}
3914 
3915 	msp->ext.ext_val = Zalloc(sizeof (*msp->ext.ext_val));
3916 	msp->ext.ext_val->len = len;
3917 	msp->compnamep = spcompnp;
3918 
3919 	/* we should be at the end */
3920 	if (argc != 0)
3921 		goto syntax;
3922 
3923 	/* create soft partition */
3924 	if (meta_create_sp(*spp, msp, extlist, options, alignment, ep) != 0)
3925 		goto out;
3926 	rval = 0;
3927 
3928 	/* let em know */
3929 	if (options & MDCMD_PRINT) {
3930 		(void) printf(dgettext(TEXT_DOMAIN,
3931 		    "%s: Soft Partition is setup\n"),
3932 		    devname);
3933 		(void) fflush(stdout);
3934 	}
3935 	goto out;
3936 
3937 syntax:
3938 	/* syntax error */
3939 	rval = meta_cook_syntax(ep, MDE_SYNTAX, compname, argc, argv);
3940 	goto out;
3941 
3942 options:
3943 	/* options error */
3944 	rval = meta_cook_syntax(ep, MDE_OPTION, compname, argc, argv);
3945 	goto out;
3946 
3947 out:
3948 	if (msp != NULL) {
3949 		if (msp->ext.ext_val != NULL) {
3950 			Free(msp->ext.ext_val);
3951 		}
3952 		Free(msp);
3953 	}
3954 
3955 	return (rval);
3956 }
3957 
3958 /*
3959  * FUNCTION:	meta_free_sp()
3960  * INPUT:	msp	- the soft partition unit to free
3961  * OUTPUT:	none
3962  * RETURNS:	void
3963  * PURPOSE:	provides an interface from the rest of libmeta for freeing a
3964  *		soft partition unit
3965  */
3966 void
3967 meta_free_sp(md_sp_t *msp)
3968 {
3969 	Free(msp);
3970 }
3971 
3972 /*
3973  * FUNCTION:	meta_sp_issp()
3974  * INPUT:	sp	- the set name to check
3975  *		np	- the name to check
3976  * OUTPUT:	ep	- return error pointer
3977  * RETURNS:	int	- 0 means sp,np is a soft partition
3978  *			  1 means sp,np is not a soft partition
3979  * PURPOSE:	determines whether the given device is a soft partition
3980  *		device.  This is called by other metadevice check routines.
3981  */
3982 int
3983 meta_sp_issp(
3984 	mdsetname_t	*sp,
3985 	mdname_t	*np,
3986 	md_error_t	*ep
3987 )
3988 {
3989 	if (meta_get_sp_common(sp, np, 0, ep) == NULL)
3990 		return (1);
3991 
3992 	return (0);
3993 }
3994 
3995 /*
3996  * FUNCTION:	meta_check_sp()
3997  * INPUT:	sp	- the set name to check
3998  *		msp	- the unit structure to check
3999  *		options	- creation options
4000  * OUTPUT:	repart_options - options to be passed to
4001  *				meta_repartition_drive()
4002  *		ep	- return error pointer
4003  * RETURNS:	int	-  0 ok to create on this component
4004  *			  -1 error or not ok to create on this component
4005  * PURPOSE:	Checks to determine whether the rules for creation of
4006  *		soft partitions allow creation of a soft partition on
4007  *		the device described by the mdname_t structure referred
4008  *		to by msp->compnamep.
4009  *
4010  *		NOTE: Does NOT check to determine whether the extents
4011  *		      described in the md_sp_t structure referred to by
4012  *		      msp will fit on the device described by the mdname_t
4013  *		      structure located at msp->compnamep.
4014  */
4015 static int
4016 meta_check_sp(
4017 	mdsetname_t	*sp,
4018 	md_sp_t		*msp,
4019 	mdcmdopts_t	options,
4020 	int		*repart_options,
4021 	md_error_t	*ep
4022 )
4023 {
4024 	md_common_t	*mdp;
4025 	mdname_t	*compnp = msp->compnamep;
4026 	uint_t		slice;
4027 	mddrivename_t	*dnp;
4028 	mdname_t	*slicenp;
4029 	mdvtoc_t	*vtocp;
4030 
4031 	/* make sure it is in the set */
4032 	if (meta_check_inset(sp, compnp, ep) != 0)
4033 		return (-1);
4034 
4035 	if ((options & MDCMD_USE_WHOLE_DISK) != 0) {
4036 		uint_t	rep_slice;
4037 
4038 		/*
4039 		 * check to make sure we can partition this drive.
4040 		 * we cannot continue if any of the following are
4041 		 * true:
4042 		 * The drive is a metadevice.
4043 		 * The drive contains a mounted slice.
4044 		 * The drive contains a slice being swapped to.
4045 		 * The drive contains slices which are part of other
4046 		 * metadevices.
4047 		 * The drive contains a metadb.
4048 		 */
4049 		if (metaismeta(compnp))
4050 			return (mddeverror(ep, MDE_IS_META, compnp->dev,
4051 			    compnp->cname));
4052 
4053 		assert(compnp->drivenamep != NULL);
4054 
4055 		/*
4056 		 * ensure that we have slice 0 since the disk will be
4057 		 * repartitioned in the USE_WHOLE_DISK case.  this check
4058 		 * is redundant unless the user incorrectly specifies a
4059 		 * a fully qualified drive AND slice name (i.e.,
4060 		 * /dev/dsk/cXtXdXsX), which will be incorrectly
4061 		 * recognized as a drive name by the metaname code.
4062 		 */
4063 
4064 		if ((vtocp = metagetvtoc(compnp, FALSE, &slice, ep)) == NULL)
4065 			return (-1);
4066 		if (slice != MD_SLICE0)
4067 			return (mderror(ep, MDE_NOT_DRIVENAME, compnp->cname));
4068 
4069 		dnp = compnp->drivenamep;
4070 		if (meta_replicaslice(dnp, &rep_slice, ep) != 0)
4071 			return (-1);
4072 
4073 		for (slice = 0; slice < vtocp->nparts; slice++) {
4074 
4075 			/* only check if the slice really exists */
4076 			if (vtocp->parts[slice].size == 0)
4077 				continue;
4078 
4079 			slicenp = metaslicename(dnp, slice, ep);
4080 			if (slicenp == NULL)
4081 				return (-1);
4082 
4083 			/* check to ensure that it is not already in use */
4084 			if (meta_check_inuse(sp,
4085 			    slicenp, MDCHK_INUSE, ep) != 0) {
4086 				return (-1);
4087 			}
4088 
4089 			/*
4090 			 * Up to this point, tests are applied to all
4091 			 * slices uniformly.
4092 			 */
4093 
4094 			if (slice == rep_slice) {
4095 				/*
4096 				 * Tests inside the body of this
4097 				 * conditional are applied only to
4098 				 * slice seven.
4099 				 */
4100 				if (meta_check_inmeta(sp, slicenp,
4101 				    options | MDCHK_ALLOW_MDDB |
4102 				    MDCHK_ALLOW_REPSLICE, 0, -1, ep) != 0)
4103 					return (-1);
4104 
4105 				/*
4106 				 * For slice seven, a metadb is NOT an
4107 				 * automatic failure. It merely means
4108 				 * that we're not allowed to muck
4109 				 * about with the partitioning of that
4110 				 * slice.  We indicate this by masking
4111 				 * in the MD_REPART_LEAVE_REP flag.
4112 				 */
4113 				if (metahasmddb(sp, slicenp, ep)) {
4114 					assert(repart_options !=
4115 					    NULL);
4116 					*repart_options |=
4117 					    MD_REPART_LEAVE_REP;
4118 				}
4119 
4120 				/*
4121 				 * Skip the remaining tests for slice
4122 				 * seven
4123 				 */
4124 				continue;
4125 			}
4126 
4127 			/*
4128 			 * Tests below this point will be applied to
4129 			 * all slices EXCEPT for the replica slice.
4130 			 */
4131 
4132 
4133 			/* check if component is in a metadevice */
4134 			if (meta_check_inmeta(sp, slicenp, options, 0,
4135 			    -1, ep) != 0)
4136 				return (-1);
4137 
4138 			/* check to see if component has a metadb */
4139 			if (metahasmddb(sp, slicenp, ep))
4140 				return (mddeverror(ep, MDE_HAS_MDDB,
4141 				    slicenp->dev, slicenp->cname));
4142 		}
4143 		/*
4144 		 * This should be all of the testing necessary when
4145 		 * the MDCMD_USE_WHOLE_DISK flag is set; the rest of
4146 		 * meta_check_sp() is oriented towards component
4147 		 * arguments instead of disks.
4148 		 */
4149 		goto meta_check_sp_ok;
4150 
4151 	}
4152 
4153 	/* check to ensure that it is not already in use */
4154 	if (meta_check_inuse(sp, compnp, MDCHK_INUSE, ep) != 0) {
4155 		return (-1);
4156 	}
4157 
4158 	if (!metaismeta(compnp)) {	/* handle non-metadevices */
4159 
4160 		/*
4161 		 * The component can have one or more soft partitions on it
4162 		 * already, but can't be part of any other type of metadevice,
4163 		 * so if it is used for a metadevice, but the metadevice
4164 		 * isn't a soft partition, return failure.
4165 		 */
4166 
4167 		if (meta_check_inmeta(sp, compnp, options, 0, -1, ep) != 0 &&
4168 		    meta_check_insp(sp, compnp, 0, -1, ep) == 0) {
4169 			return (-1);
4170 		}
4171 	} else {			/* handle metadevices */
4172 		/* get underlying unit & check capabilities */
4173 		if ((mdp = meta_get_unit(sp, compnp, ep)) == NULL)
4174 			return (-1);
4175 
4176 		if ((! (mdp->capabilities & MD_CAN_PARENT)) ||
4177 		    (! (mdp->capabilities & MD_CAN_SP)))
4178 			return (mdmderror(ep, MDE_INVAL_UNIT,
4179 			    meta_getminor(compnp->dev), compnp->cname));
4180 	}
4181 
4182 meta_check_sp_ok:
4183 	mdclrerror(ep);
4184 	return (0);
4185 }
4186 
4187 /*
4188  * FUNCTION:	meta_create_sp()
4189  * INPUT:	sp	- the set name to create in
4190  *		msp	- the unit structure to create
4191  *		oblist	- an optional list of requested extents (-o/-b options)
4192  *		options	- creation options
4193  *		alignment - data alignment
4194  * OUTPUT:	ep	- return error pointer
4195  * RETURNS:	int	-  0 success, -1 error
4196  * PURPOSE:	does most of the work for creating a soft partition.  If
4197  *		metainit -p -e was used, first partition the drive.  Then
4198  *		create an extent list based on the existing soft partitions
4199  *		and assume all space not used by them is free.  Storage for
4200  *		the new soft partition is allocated from the free extents
4201  *		based on the length specified on the command line or the
4202  *		oblist passed in.  The unit structure is then committed and
4203  *		the watermarks are updated.  Finally, the status is changed to
4204  *		Okay and the process is complete.
4205  */
4206 static int
4207 meta_create_sp(
4208 	mdsetname_t	*sp,
4209 	md_sp_t		*msp,
4210 	sp_ext_node_t	*oblist,
4211 	mdcmdopts_t	options,
4212 	sp_ext_length_t	alignment,
4213 	md_error_t	*ep
4214 )
4215 {
4216 	mdname_t	*np = msp->common.namep;
4217 	mdname_t	*compnp = msp->compnamep;
4218 	mp_unit_t	*mp = NULL;
4219 	mdnamelist_t	*keynlp = NULL, *spnlp = NULL;
4220 	md_set_params_t	set_params;
4221 	int		rval = -1;
4222 	diskaddr_t	comp_size;
4223 	diskaddr_t	sp_start;
4224 	sp_ext_node_t	*extlist = NULL;
4225 	int		numexts = 0;	/* number of extents */
4226 	int		count = 0;
4227 	int		committed = 0;
4228 	int		repart_options = MD_REPART_FORCE;
4229 	int		create_flag = MD_CRO_32BIT;
4230 
4231 	md_set_desc	*sd;
4232 	mm_unit_t	*mm;
4233 	md_set_mmown_params_t	*ownpar = NULL;
4234 	int		comp_is_mirror = 0;
4235 
4236 	/* validate soft partition */
4237 	if (meta_check_sp(sp, msp, options, &repart_options, ep) != 0)
4238 		return (-1);
4239 
4240 	if ((options & MDCMD_USE_WHOLE_DISK) != 0) {
4241 		if ((options & MDCMD_DOIT) != 0) {
4242 			if (meta_repartition_drive(sp,
4243 			    compnp->drivenamep,
4244 			    repart_options,
4245 			    NULL, /* Don't return the VTOC */
4246 			    ep) != 0)
4247 
4248 				return (-1);
4249 		} else {
4250 			/*
4251 			 * If -n and -e are both specified, it doesn't make
4252 			 * sense to continue without actually partitioning
4253 			 * the drive.
4254 			 */
4255 			return (0);
4256 		}
4257 	}
4258 
4259 	/* populate the start_blk field of the component name */
4260 	if ((sp_start = meta_sp_get_start(sp, compnp, ep)) ==
4261 	    MD_DISKADDR_ERROR) {
4262 		rval = -1;
4263 		goto out;
4264 	}
4265 
4266 	if (options & MDCMD_DOIT) {
4267 		/* store name in namespace */
4268 		if (add_key_name(sp, compnp, &keynlp, ep) != 0) {
4269 			rval = -1;
4270 			goto out;
4271 		}
4272 	}
4273 
4274 	/*
4275 	 * Get a list of the soft partitions that currently reside on
4276 	 * the component.  We should ALWAYS force reload the cache,
4277 	 * because if this is a single creation, there will not BE a
4278 	 * cached list, and if we're using the md.tab, we must rebuild
4279 	 * the list because it won't contain the previous (if any)
4280 	 * soft partition.
4281 	 */
4282 	count = meta_sp_get_by_component(sp, compnp, &spnlp, 1, ep);
4283 	if (count < 0) {
4284 		/* error occured */
4285 		rval = -1;
4286 		goto out;
4287 	}
4288 
4289 	/*
4290 	 * get the size of the underlying device.  if the size is smaller
4291 	 * than or equal to the watermark size, we know there isn't
4292 	 * enough space.
4293 	 */
4294 	if ((comp_size = metagetsize(compnp, ep)) == MD_DISKADDR_ERROR) {
4295 		rval = -1;
4296 		goto out;
4297 	} else if (comp_size <= MD_SP_WMSIZE) {
4298 		(void) mdmderror(ep, MDE_SP_NOSPACE, 0, compnp->cname);
4299 		rval = -1;
4300 		goto out;
4301 	}
4302 	/*
4303 	 * seed extlist with reserved space at the beginning of the volume and
4304 	 * enough space for the end watermark.  The end watermark always gets
4305 	 * updated, but if the underlying device changes size it may not be
4306 	 * pointed to until the extent before it is updated.  Since the
4307 	 * end of the reserved space is where the first watermark starts,
4308 	 * the reserved extent should never be marked for updating.
4309 	 */
4310 
4311 	meta_sp_list_insert(NULL, NULL, &extlist,
4312 	    0ULL, sp_start, EXTTYP_RESERVED, 0, 0, meta_sp_cmp_by_offset);
4313 	meta_sp_list_insert(NULL, NULL, &extlist,
4314 	    (sp_ext_offset_t)(comp_size - MD_SP_WMSIZE), MD_SP_WMSIZE,
4315 	    EXTTYP_END, 0, EXTFLG_UPDATE, meta_sp_cmp_by_offset);
4316 
4317 	if (meta_sp_extlist_from_namelist(sp, spnlp, &extlist, ep) == -1) {
4318 		rval = -1;
4319 		goto out;
4320 	}
4321 
4322 	metafreenamelist(spnlp);
4323 
4324 	if (getenv(META_SP_DEBUG)) {
4325 		meta_sp_debug("meta_create_sp: list of used extents:\n");
4326 		meta_sp_list_dump(extlist);
4327 	}
4328 
4329 	meta_sp_list_freefill(&extlist, metagetsize(compnp, ep));
4330 
4331 	/* get extent list from -o/-b options or from free space */
4332 	if (options & MDCMD_DIRECT) {
4333 		if (getenv(META_SP_DEBUG)) {
4334 			meta_sp_debug("meta_create_sp: Dumping -o/-b list:\n");
4335 			meta_sp_list_dump(oblist);
4336 		}
4337 
4338 		numexts = meta_sp_alloc_by_list(sp, np, &extlist, oblist);
4339 		if (numexts == -1) {
4340 			(void) mdmderror(ep, MDE_SP_OVERLAP, 0, np->cname);
4341 			rval = -1;
4342 			goto out;
4343 		}
4344 	} else {
4345 		numexts = meta_sp_alloc_by_len(sp, np, &extlist,
4346 		    &msp->ext.ext_val->len, 0LL, (alignment > 0) ? alignment :
4347 		    meta_sp_get_default_alignment(sp, compnp, ep));
4348 		if (numexts == -1) {
4349 			(void) mdmderror(ep, MDE_SP_NOSPACE, 0, np->cname);
4350 			rval = -1;
4351 			goto out;
4352 		}
4353 	}
4354 
4355 	assert(extlist != NULL);
4356 
4357 	/* create soft partition */
4358 	mp = meta_sp_createunit(msp->common.namep, msp->compnamep,
4359 	    extlist, numexts, msp->ext.ext_val->len, MD_SP_CREATEPEND, ep);
4360 
4361 	create_flag = meta_check_devicesize(mp->c.un_total_blocks);
4362 
4363 	/* if we're not doing anything (metainit -n), return success */
4364 	if (! (options & MDCMD_DOIT)) {
4365 		rval = 0;	/* success */
4366 		goto out;
4367 	}
4368 
4369 	(void) memset(&set_params, 0, sizeof (set_params));
4370 
4371 	if (create_flag == MD_CRO_64BIT) {
4372 		mp->c.un_revision |= MD_64BIT_META_DEV;
4373 		set_params.options = MD_CRO_64BIT;
4374 	} else {
4375 		mp->c.un_revision &= ~MD_64BIT_META_DEV;
4376 		set_params.options = MD_CRO_32BIT;
4377 	}
4378 
4379 	if (getenv(META_SP_DEBUG)) {
4380 		meta_sp_debug("meta_create_sp: printing unit structure\n");
4381 		meta_sp_printunit(mp);
4382 	}
4383 
4384 	/*
4385 	 * Check to see if we're trying to create a partition on a mirror. If so
4386 	 * we may have to enforce an ownership change before writing the
4387 	 * watermark out.
4388 	 */
4389 	if (metaismeta(compnp)) {
4390 		char *miscname;
4391 
4392 		miscname = metagetmiscname(compnp, ep);
4393 		if (miscname != NULL)
4394 			comp_is_mirror = (strcmp(miscname, MD_MIRROR) == 0);
4395 		else
4396 			comp_is_mirror = 0;
4397 	} else {
4398 		comp_is_mirror = 0;
4399 	}
4400 
4401 	/*
4402 	 * For a multi-node environment we have to ensure that the master
4403 	 * node owns an underlying mirror before we issue the MD_IOCSET ioctl.
4404 	 * If the master does not own the device we will deadlock as the
4405 	 * implicit write of the watermarks (in sp_ioctl.c) will cause an
4406 	 * ownership change that will block as the MD_IOCSET is still in
4407 	 * progress. To close this window we force an owner change to occur
4408 	 * before issuing the MD_IOCSET. We cannot simply open the device and
4409 	 * write to it as this will only work for the first soft-partition
4410 	 * creation.
4411 	 */
4412 
4413 	if (comp_is_mirror && !metaislocalset(sp)) {
4414 
4415 		if ((sd = metaget_setdesc(sp, ep)) == NULL) {
4416 			rval = -1;
4417 			goto out;
4418 		}
4419 		if (MD_MNSET_DESC(sd) && sd->sd_mn_am_i_master) {
4420 			mm = (mm_unit_t *)meta_get_unit(sp, compnp, ep);
4421 			if (mm == NULL) {
4422 				rval = -1;
4423 				goto out;
4424 			} else {
4425 				rval = meta_mn_change_owner(&ownpar, sp->setno,
4426 				    meta_getminor(compnp->dev),
4427 				    sd->sd_mn_mynode->nd_nodeid,
4428 				    MD_MN_MM_PREVENT_CHANGE |
4429 				    MD_MN_MM_SPAWN_THREAD);
4430 				if (rval == -1)
4431 					goto out;
4432 			}
4433 		}
4434 	}
4435 
4436 	set_params.mnum = MD_SID(mp);
4437 	set_params.size = mp->c.un_size;
4438 	set_params.mdp = (uintptr_t)mp;
4439 	MD_SETDRIVERNAME(&set_params, MD_SP, MD_MIN2SET(set_params.mnum));
4440 
4441 	/* first phase of commit. */
4442 	if (metaioctl(MD_IOCSET, &set_params, &set_params.mde,
4443 	    np->cname) != 0) {
4444 		(void) mdstealerror(ep, &set_params.mde);
4445 		rval = -1;
4446 		goto out;
4447 	}
4448 
4449 	/* we've successfully committed the record */
4450 	committed = 1;
4451 
4452 	/* write watermarks */
4453 	if (meta_sp_update_wm(sp, msp, extlist, ep) < 0) {
4454 		rval = -1;
4455 		goto out;
4456 	}
4457 
4458 	/*
4459 	 * Allow mirror ownership to change. If we don't succeed in this
4460 	 * ioctl it isn't fatal, but the cluster will probably hang fairly
4461 	 * soon as the mirror owner won't change. However, we have
4462 	 * successfully written the watermarks out to the device so the
4463 	 * softpart creation has succeeded
4464 	 */
4465 	if (ownpar) {
4466 		(void) meta_mn_change_owner(&ownpar, sp->setno, ownpar->d.mnum,
4467 		    ownpar->d.owner,
4468 		    MD_MN_MM_ALLOW_CHANGE | MD_MN_MM_SPAWN_THREAD);
4469 	}
4470 
4471 	/* second phase of commit, set status to MD_SP_OK */
4472 	if (meta_sp_setstatus(sp, &(MD_SID(mp)), 1, MD_SP_OK, ep) < 0) {
4473 		rval = -1;
4474 		goto out;
4475 	}
4476 	rval = 0;
4477 out:
4478 	Free(mp);
4479 	if (ownpar)
4480 		Free(ownpar);
4481 
4482 	if (extlist != NULL)
4483 		meta_sp_list_free(&extlist);
4484 
4485 	if (rval != 0 && keynlp != NULL && committed != 1)
4486 		(void) del_key_names(sp, keynlp, NULL);
4487 
4488 	metafreenamelist(keynlp);
4489 
4490 	return (rval);
4491 }
4492 
4493 /*
4494  * **************************************************************************
4495  *                      Reset (metaclear) Functions                         *
4496  * **************************************************************************
4497  */
4498 
4499 /*
4500  * FUNCTION:	meta_sp_reset_common()
4501  * INPUT:	sp	- the set name of the device to reset
4502  *		np	- the name of the device to reset
4503  *		msp	- the unit structure to reset
4504  *		options	- metaclear options
4505  * OUTPUT:	ep	- return error pointer
4506  * RETURNS:	int	-  0 success, -1 error
4507  * PURPOSE:	"resets", or more accurately deletes, the soft partition
4508  *		specified.  First the state is set to "deleting" and then the
4509  *		watermarks are all cleared out.  Once the watermarks have been
4510  *		updated, the unit structure is deleted from the metadb.
4511  */
4512 static int
4513 meta_sp_reset_common(
4514 	mdsetname_t	*sp,
4515 	mdname_t	*np,
4516 	md_sp_t		*msp,
4517 	md_sp_reset_t	reset_params,
4518 	mdcmdopts_t	options,
4519 	md_error_t	*ep
4520 )
4521 {
4522 	char	*miscname;
4523 	int	rval = -1;
4524 	int	is_open = 0;
4525 
4526 	/* make sure that nobody owns us */
4527 	if (MD_HAS_PARENT(msp->common.parent))
4528 		return (mdmderror(ep, MDE_IN_USE, meta_getminor(np->dev),
4529 		    np->cname));
4530 
4531 	/* make sure that the soft partition isn't open */
4532 	if ((is_open = meta_isopen(sp, np, ep, options)) < 0)
4533 		return (-1);
4534 	else if (is_open)
4535 		return (mdmderror(ep, MDE_IS_OPEN, meta_getminor(np->dev),
4536 		    np->cname));
4537 
4538 	/* get miscname */
4539 	if ((miscname = metagetmiscname(np, ep)) == NULL)
4540 		return (-1);
4541 
4542 	/* fill in reset params */
4543 	MD_SETDRIVERNAME(&reset_params, miscname, sp->setno);
4544 	reset_params.mnum = meta_getminor(np->dev);
4545 	reset_params.force = (options & MDCMD_FORCE) ? 1 : 0;
4546 
4547 	/*
4548 	 * clear soft partition - phase one.
4549 	 * place the soft partition into the "delete pending" state.
4550 	 */
4551 	if (meta_sp_setstatus(sp, &reset_params.mnum, 1, MD_SP_DELPEND, ep) < 0)
4552 		return (-1);
4553 
4554 	/*
4555 	 * Now clear the watermarks.  If the force flag is specified,
4556 	 * ignore any errors writing the watermarks and delete the unit
4557 	 * structure anyway.  An error may leave the on-disk format in a
4558 	 * corrupt state.  If force is not specified and we fail here,
4559 	 * the soft partition will remain in the "delete pending" state.
4560 	 */
4561 	if ((meta_sp_clear_wm(sp, msp, ep) < 0) &&
4562 	    ((options & MDCMD_FORCE) == 0))
4563 		goto out;
4564 
4565 	/*
4566 	 * clear soft partition - phase two.
4567 	 * the driver removes the soft partition from the metadb and
4568 	 * zeros out incore version.
4569 	 */
4570 	if (metaioctl(MD_IOCRESET, &reset_params,
4571 	    &reset_params.mde, np->cname) != 0) {
4572 		(void) mdstealerror(ep, &reset_params.mde);
4573 		goto out;
4574 	}
4575 
4576 	/*
4577 	 * Wait for the /dev to be cleaned up. Ignore the return
4578 	 * value since there's not much we can do.
4579 	 */
4580 	(void) meta_update_devtree(meta_getminor(np->dev));
4581 
4582 	rval = 0;	/* success */
4583 
4584 	if (options & MDCMD_PRINT) {
4585 		(void) printf(dgettext(TEXT_DOMAIN,
4586 		    "%s: Soft Partition is cleared\n"),
4587 		    np->cname);
4588 		(void) fflush(stdout);
4589 	}
4590 
4591 	/*
4592 	 * if told to recurse and on a metadevice, then attempt to
4593 	 * clear the subdevices.  Indicate failure if the clear fails.
4594 	 */
4595 	if ((options & MDCMD_RECURSE) &&
4596 	    (metaismeta(msp->compnamep)) &&
4597 	    (meta_reset_by_name(sp, msp->compnamep, options, ep) != 0))
4598 		rval = -1;
4599 
4600 out:
4601 	meta_invalidate_name(np);
4602 	return (rval);
4603 }
4604 
4605 /*
4606  * FUNCTION:	meta_sp_reset()
4607  * INPUT:	sp	- the set name of the device to reset
4608  *		np	- the name of the device to reset
4609  *		options	- metaclear options
4610  * OUTPUT:	ep	- return error pointer
4611  * RETURNS:	int	-  0 success, -1 error
4612  * PURPOSE:	provides the entry point to the rest of libmeta for deleting a
4613  *		soft partition.  If np is NULL, then soft partitions are
4614  *		all deleted at the current level and then recursively deleted.
4615  *		Otherwise, if a name is specified either directly or as a
4616  *		result of a recursive operation, it deletes only that name.
4617  *		Since something sitting under a soft partition may be parented
4618  *		to it, we have to reparent that other device to another soft
4619  *		partition on the same component if we're deleting the one it's
4620  *		parented to.
4621  */
4622 int
4623 meta_sp_reset(
4624 	mdsetname_t	*sp,
4625 	mdname_t	*np,
4626 	mdcmdopts_t	options,
4627 	md_error_t	*ep
4628 )
4629 {
4630 	md_sp_t		*msp;
4631 	int		rval = -1;
4632 	mdnamelist_t	*spnlp = NULL, *nlp = NULL;
4633 	md_sp_reset_t	reset_params;
4634 	int		num_sp;
4635 
4636 	assert(sp != NULL);
4637 
4638 	/* reset/delete all soft paritions */
4639 	if (np == NULL) {
4640 		/*
4641 		 * meta_reset_all sets MDCMD_RECURSE, but this behavior
4642 		 * is incorrect for soft partitions.  We want to clear
4643 		 * all soft partitions at a particular level in the
4644 		 * metadevice stack before moving to the next level.
4645 		 * Thus, we clear MDCMD_RECURSE from the options.
4646 		 */
4647 		options &= ~MDCMD_RECURSE;
4648 
4649 		/* for each soft partition */
4650 		rval = 0;
4651 		if (meta_get_sp_names(sp, &spnlp, 0, ep) < 0)
4652 			rval = -1;
4653 
4654 		for (nlp = spnlp; (nlp != NULL); nlp = nlp->next) {
4655 			np = nlp->namep;
4656 			if ((msp = meta_get_sp(sp, np, ep)) == NULL) {
4657 				rval = -1;
4658 				break;
4659 			}
4660 			/*
4661 			 * meta_reset_all calls us twice to get soft
4662 			 * partitions at the top and bottom of the stack.
4663 			 * thus, if we have a parent, we'll get deleted
4664 			 * on the next call.
4665 			 */
4666 			if (MD_HAS_PARENT(msp->common.parent))
4667 				continue;
4668 			/*
4669 			 * If this is a multi-node set, we send a series
4670 			 * of individual metaclear commands.
4671 			 */
4672 			if (meta_is_mn_set(sp, ep)) {
4673 				if (meta_mn_send_metaclear_command(sp,
4674 				    np->cname, options, 0, ep) != 0) {
4675 					rval = -1;
4676 					break;
4677 				}
4678 			} else {
4679 				if (meta_sp_reset(sp, np, options, ep) != 0) {
4680 					rval = -1;
4681 					break;
4682 				}
4683 			}
4684 		}
4685 		/* cleanup return status */
4686 		metafreenamelist(spnlp);
4687 		return (rval);
4688 	}
4689 
4690 	/* check the name */
4691 	if (metachkmeta(np, ep) != 0)
4692 		return (-1);
4693 
4694 	/* get the unit structure */
4695 	if ((msp = meta_get_sp(sp, np, ep)) == NULL)
4696 		return (-1);
4697 
4698 	/* clear out reset parameters */
4699 	(void) memset(&reset_params, 0, sizeof (reset_params));
4700 
4701 	/* if our child is a metadevice, we need to deparent/reparent it */
4702 	if (metaismeta(msp->compnamep)) {
4703 		/* get sp's on this component */
4704 		if ((num_sp = meta_sp_get_by_component(sp, msp->compnamep,
4705 		    &spnlp, 1, ep)) <= 0)
4706 			/* no sp's on this device.  error! */
4707 			return (-1);
4708 		else if (num_sp == 1)
4709 			/* last sp on this device, so we deparent */
4710 			reset_params.new_parent = MD_NO_PARENT;
4711 		else {
4712 			/* have to reparent this metadevice */
4713 			for (nlp = spnlp; nlp != NULL; nlp = nlp->next) {
4714 				if (meta_getminor(nlp->namep->dev) ==
4715 				    meta_getminor(np->dev))
4716 					continue;
4717 				/*
4718 				 * this isn't the softpart we are deleting,
4719 				 * so use this device as the new parent.
4720 				 */
4721 				reset_params.new_parent =
4722 				    meta_getminor(nlp->namep->dev);
4723 				break;
4724 			}
4725 		}
4726 		metafreenamelist(spnlp);
4727 	}
4728 
4729 	if (meta_sp_reset_common(sp, np, msp, reset_params, options, ep) != 0)
4730 		return (-1);
4731 
4732 	return (0);
4733 }
4734 
4735 /*
4736  * FUNCTION:	meta_sp_reset_component()
4737  * INPUT:	sp	- the set name of the device to reset
4738  *		name	- the string name of the device to reset
4739  *		options	- metaclear options
4740  * OUTPUT:	ep	- return error pointer
4741  * RETURNS:	int	-  0 success, -1 error
4742  * PURPOSE:	provides the ability to delete all soft partitions on a
4743  *		specified device (metaclear -p).  It first gets all of the
4744  *		soft partitions on the component and then deletes each one
4745  *		individually.
4746  */
4747 int
4748 meta_sp_reset_component(
4749 	mdsetname_t	*sp,
4750 	char		*name,
4751 	mdcmdopts_t	options,
4752 	md_error_t	*ep
4753 )
4754 {
4755 	mdname_t	*compnp, *np;
4756 	mdnamelist_t	*spnlp = NULL;
4757 	mdnamelist_t	*nlp = NULL;
4758 	md_sp_t		*msp;
4759 	int		count;
4760 	md_sp_reset_t	reset_params;
4761 
4762 	if ((compnp = metaname(&sp, name, UNKNOWN, ep)) == NULL)
4763 		return (-1);
4764 
4765 	/* If we're starting out with no soft partitions, it's an error */
4766 	count = meta_sp_get_by_component(sp, compnp, &spnlp, 1, ep);
4767 	if (count == 0)
4768 		return (mdmderror(ep, MDE_SP_NOSP, 0, compnp->cname));
4769 	else if (count < 0)
4770 		return (-1);
4771 
4772 	/*
4773 	 * clear all soft partitions on this component.
4774 	 * NOTE: we reparent underlying metadevices as we go so that
4775 	 * things stay sane.  Also, if we encounter an error, we stop
4776 	 * and go no further in case recovery might be needed.
4777 	 */
4778 	for (nlp = spnlp; nlp != NULL; nlp = nlp->next) {
4779 		/* clear out reset parameters */
4780 		(void) memset(&reset_params, 0, sizeof (reset_params));
4781 
4782 		/* check the name */
4783 		np = nlp->namep;
4784 
4785 		if (metachkmeta(np, ep) != 0) {
4786 			metafreenamelist(spnlp);
4787 			return (-1);
4788 		}
4789 
4790 		/* get the unit structure */
4791 		if ((msp = meta_get_sp(sp, np, ep)) == NULL) {
4792 			metafreenamelist(spnlp);
4793 			return (-1);
4794 		}
4795 
4796 		/* have to deparent/reparent metadevices */
4797 		if (metaismeta(compnp)) {
4798 			if (nlp->next == NULL)
4799 				reset_params.new_parent = MD_NO_PARENT;
4800 			else
4801 				reset_params.new_parent =
4802 				    meta_getminor(spnlp->next->namep->dev);
4803 		}
4804 
4805 		/* clear soft partition */
4806 		if (meta_sp_reset_common(sp, np, msp, reset_params,
4807 		    options, ep) < 0) {
4808 			metafreenamelist(spnlp);
4809 			return (-1);
4810 		}
4811 	}
4812 	metafreenamelist(spnlp);
4813 	return (0);
4814 }
4815 
4816 /*
4817  * **************************************************************************
4818  *                      Grow (metattach) Functions                          *
4819  * **************************************************************************
4820  */
4821 
4822 /*
4823  * FUNCTION:	meta_sp_attach()
4824  * INPUT:	sp	- the set name of the device to attach to
4825  *		np	- the name of the device to attach to
4826  *		addsize	- the unparsed string holding the amount of space to add
4827  *		options	- metattach options
4828  *		alignment - data alignment
4829  * OUTPUT:	ep	- return error pointer
4830  * RETURNS:	int	-  0 success, -1 error
4831  * PURPOSE:	grows a soft partition by reading in the existing unit
4832  *		structure and setting its state to Growing, allocating more
4833  *		space (similar to meta_create_sp()), updating the watermarks,
4834  *		and then writing out the new unit structure in the Okay state.
4835  */
4836 int
4837 meta_sp_attach(
4838 	mdsetname_t	*sp,
4839 	mdname_t	*np,
4840 	char		*addsize,
4841 	mdcmdopts_t	options,
4842 	sp_ext_length_t	alignment,
4843 	md_error_t	*ep
4844 )
4845 {
4846 	md_grow_params_t	grow_params;
4847 	sp_ext_length_t		grow_len;	/* amount to grow */
4848 	mp_unit_t		*mp, *new_un;
4849 	mdname_t		*compnp = NULL;
4850 
4851 	sp_ext_node_t		*extlist = NULL;
4852 	int			numexts;
4853 	mdnamelist_t		*spnlp = NULL;
4854 	int			count;
4855 	md_sp_t			*msp;
4856 	daddr_t			start_block;
4857 
4858 	/* should have the same set */
4859 	assert(sp != NULL);
4860 	assert(sp->setno == MD_MIN2SET(meta_getminor(np->dev)));
4861 
4862 	/* check name */
4863 	if (metachkmeta(np, ep) != 0)
4864 		return (-1);
4865 
4866 	if (meta_sp_parsesize(addsize, &grow_len) == -1) {
4867 		return (mdmderror(ep, MDE_SP_BAD_LENGTH, 0, np->cname));
4868 	}
4869 
4870 	if ((mp = (mp_unit_t *)meta_get_mdunit(sp, np, ep)) == NULL)
4871 		return (-1);
4872 
4873 	/* make sure we don't have a parent */
4874 	if (MD_HAS_PARENT(mp->c.un_parent)) {
4875 		Free(mp);
4876 		return (mdmderror(ep, MDE_INVAL_UNIT, 0, np->cname));
4877 	}
4878 
4879 	if (getenv(META_SP_DEBUG)) {
4880 		meta_sp_debug("meta_sp_attach: Unit structure before new "
4881 		    "space:\n");
4882 		meta_sp_printunit(mp);
4883 	}
4884 
4885 	/*
4886 	 * NOTE: the fast option to metakeyname is 0 as opposed to 1
4887 	 * If this was not the case we would suffer the following
4888 	 * assertion failure:
4889 	 * Assertion failed: type1 != MDT_FAST_META && type1 != MDT_FAST_COMP
4890 	 * file meta_check.x, line 315
4891 	 * I guess this is because we have not "seen" this drive before
4892 	 * and hence hit the failure - this is of course the attach routine
4893 	 */
4894 	if ((compnp = metakeyname(&sp, mp->un_key, 0, ep)) == NULL) {
4895 		Free(mp);
4896 		return (-1);
4897 	}
4898 
4899 	/* metakeyname does not fill in the key. */
4900 	compnp->key = mp->un_key;
4901 
4902 	/* work out the space on the component that we are dealing with */
4903 	count = meta_sp_get_by_component(sp, compnp, &spnlp, 0, ep);
4904 
4905 	/*
4906 	 * see if the component has been soft partitioned yet, or if an
4907 	 * error occurred.
4908 	 */
4909 	if (count == 0) {
4910 		Free(mp);
4911 		return (mdmderror(ep, MDE_NOT_SP, 0, np->cname));
4912 	} else if (count < 0) {
4913 		Free(mp);
4914 		return (-1);
4915 	}
4916 
4917 	/*
4918 	 * seed extlist with reserved space at the beginning of the volume and
4919 	 * enough space for the end watermark.  The end watermark always gets
4920 	 * updated, but if the underlying device changes size it may not be
4921 	 * pointed to until the extent before it is updated.  Since the
4922 	 * end of the reserved space is where the first watermark starts,
4923 	 * the reserved extent should never be marked for updating.
4924 	 */
4925 	if ((start_block = meta_sp_get_start(sp, compnp, ep)) ==
4926 	    MD_DISKADDR_ERROR) {
4927 		Free(mp);
4928 		return (-1);
4929 	}
4930 
4931 	meta_sp_list_insert(NULL, NULL, &extlist, 0ULL, start_block,
4932 	    EXTTYP_RESERVED, 0, 0, meta_sp_cmp_by_offset);
4933 	meta_sp_list_insert(NULL, NULL, &extlist,
4934 	    metagetsize(compnp, ep) - MD_SP_WMSIZE, MD_SP_WMSIZE,
4935 	    EXTTYP_END, 0, EXTFLG_UPDATE, meta_sp_cmp_by_offset);
4936 
4937 	if (meta_sp_extlist_from_namelist(sp, spnlp, &extlist, ep) == -1) {
4938 		Free(mp);
4939 		return (-1);
4940 	}
4941 
4942 	metafreenamelist(spnlp);
4943 
4944 	if (getenv(META_SP_DEBUG)) {
4945 		meta_sp_debug("meta_sp_attach: list of used extents:\n");
4946 		meta_sp_list_dump(extlist);
4947 	}
4948 
4949 	meta_sp_list_freefill(&extlist, metagetsize(compnp, ep));
4950 
4951 	assert(mp->un_numexts >= 1);
4952 	numexts = meta_sp_alloc_by_len(sp, np, &extlist, &grow_len,
4953 	    mp->un_ext[mp->un_numexts - 1].un_poff,
4954 	    (alignment > 0) ? alignment :
4955 	    meta_sp_get_default_alignment(sp, compnp, ep));
4956 
4957 	if (numexts == -1) {
4958 		Free(mp);
4959 		return (mdmderror(ep, MDE_SP_NOSPACE, 0, np->cname));
4960 	}
4961 
4962 	/* allocate new unit structure and copy in old unit */
4963 	if ((new_un = meta_sp_updateunit(np, mp, extlist,
4964 	    grow_len, numexts, ep)) == NULL) {
4965 		Free(mp);
4966 		return (-1);
4967 	}
4968 	Free(mp);
4969 
4970 	/* If running in dryrun mode (-n option), we're done here */
4971 	if ((options & MDCMD_DOIT) == 0) {
4972 		if (options & MDCMD_PRINT) {
4973 			(void) printf(dgettext(TEXT_DOMAIN,
4974 			    "%s: Soft Partition would grow\n"),
4975 			    np->cname);
4976 			(void) fflush(stdout);
4977 		}
4978 		return (0);
4979 	}
4980 
4981 	if (getenv(META_SP_DEBUG)) {
4982 		meta_sp_debug("meta_sp_attach: updated unit structure:\n");
4983 		meta_sp_printunit(new_un);
4984 	}
4985 
4986 	assert(new_un != NULL);
4987 
4988 	(void) memset(&grow_params, 0, sizeof (grow_params));
4989 	if (new_un->c.un_total_blocks > MD_MAX_BLKS_FOR_SMALL_DEVS) {
4990 		grow_params.options = MD_CRO_64BIT;
4991 		new_un->c.un_revision |= MD_64BIT_META_DEV;
4992 	} else {
4993 		grow_params.options = MD_CRO_32BIT;
4994 		new_un->c.un_revision &= ~MD_64BIT_META_DEV;
4995 	}
4996 	grow_params.mnum = MD_SID(new_un);
4997 	grow_params.size = new_un->c.un_size;
4998 	grow_params.mdp = (uintptr_t)new_un;
4999 	MD_SETDRIVERNAME(&grow_params, MD_SP, MD_MIN2SET(grow_params.mnum));
5000 
5001 	if (metaioctl(MD_IOCGROW, &grow_params, &grow_params.mde,
5002 	    np->cname) != 0) {
5003 		(void) mdstealerror(ep, &grow_params.mde);
5004 		return (-1);
5005 	}
5006 
5007 	/* update all watermarks */
5008 
5009 	if ((msp = meta_get_sp(sp, np, ep)) == NULL)
5010 		return (-1);
5011 	if (meta_sp_update_wm(sp, msp, extlist, ep) < 0)
5012 		return (-1);
5013 
5014 
5015 	/* second phase of commit, set status to MD_SP_OK */
5016 	if (meta_sp_setstatus(sp, &(MD_SID(new_un)), 1, MD_SP_OK, ep) < 0)
5017 		return (-1);
5018 
5019 	meta_invalidate_name(np);
5020 
5021 	if (options & MDCMD_PRINT) {
5022 		(void) printf(dgettext(TEXT_DOMAIN,
5023 		    "%s: Soft Partition has been grown\n"),
5024 		    np->cname);
5025 		(void) fflush(stdout);
5026 	}
5027 
5028 	return (0);
5029 }
5030 
5031 /*
5032  * **************************************************************************
5033  *                    Recovery (metarecover) Functions                      *
5034  * **************************************************************************
5035  */
5036 
5037 /*
5038  * FUNCTION:	meta_recover_sp()
5039  * INPUT:	sp	- the name of the set we are recovering on
5040  *		compnp	- name pointer for device we are recovering on
5041  *		argc	- argument count
5042  *		argv	- left over arguments not parsed by metarecover command
5043  *		options	- metarecover options
5044  * OUTPUT:	ep	- return error pointer
5045  * RETURNS:	int	- 0 - success, -1 - error
5046  * PURPOSE:	parse soft partitioning-specific metarecover options and
5047  *		dispatch to the appropriate function to handle recovery.
5048  */
5049 int
5050 meta_recover_sp(
5051 	mdsetname_t	*sp,
5052 	mdname_t	*compnp,
5053 	int		argc,
5054 	char		*argv[],
5055 	mdcmdopts_t	options,
5056 	md_error_t	*ep
5057 )
5058 {
5059 	md_set_desc	*sd;
5060 
5061 	if (argc > 1) {
5062 		(void) meta_cook_syntax(ep, MDE_SYNTAX, compnp->cname,
5063 		    argc, argv);
5064 		return (-1);
5065 	}
5066 
5067 	/*
5068 	 * For a MN set, this operation must be performed on the master
5069 	 * as it is responsible for maintaining the watermarks
5070 	 */
5071 	if (!metaislocalset(sp)) {
5072 		if ((sd = metaget_setdesc(sp, ep)) == NULL)
5073 			return (-1);
5074 		if (MD_MNSET_DESC(sd) && !sd->sd_mn_am_i_master) {
5075 			(void) mddserror(ep, MDE_DS_MASTER_ONLY, sp->setno,
5076 			    sd->sd_mn_master_nodenm, NULL, NULL);
5077 			return (-1);
5078 		}
5079 	}
5080 	if (argc == 0) {
5081 		/*
5082 		 * if no additional arguments are passed, metarecover should
5083 		 * validate both on-disk and metadb structures as well as
5084 		 * checking that both are consistent with each other
5085 		 */
5086 		if (meta_sp_validate_wm(sp, compnp, options, ep) < 0)
5087 			return (-1);
5088 		if (meta_sp_validate_unit(sp, compnp, options, ep) < 0)
5089 			return (-1);
5090 		if (meta_sp_validate_wm_and_unit(sp, compnp, options, ep) < 0)
5091 			return (-1);
5092 	} else if (strcmp(argv[0], "-d") == 0) {
5093 		/*
5094 		 * Ensure that there is no existing valid record for this
5095 		 * soft-partition. If there is we have nothing to do.
5096 		 */
5097 		if (meta_sp_validate_unit(sp, compnp, options, ep) == 0)
5098 			return (-1);
5099 		/* validate and recover from on-disk structures */
5100 		if (meta_sp_validate_wm(sp, compnp, options, ep) < 0)
5101 			return (-1);
5102 		if (meta_sp_recover_from_wm(sp, compnp, options, ep) < 0)
5103 			return (-1);
5104 	} else if (strcmp(argv[0], "-m") == 0) {
5105 		/* validate and recover from metadb structures */
5106 		if (meta_sp_validate_unit(sp, compnp, options, ep) < 0)
5107 			return (-1);
5108 		if (meta_sp_recover_from_unit(sp, compnp, options, ep) < 0)
5109 			return (-1);
5110 	} else {
5111 		/* syntax error */
5112 		(void) meta_cook_syntax(ep, MDE_SYNTAX, compnp->cname,
5113 		    argc, argv);
5114 		return (-1);
5115 	}
5116 
5117 	return (0);
5118 }
5119 
5120 /*
5121  * FUNCTION:	meta_sp_display_exthdr()
5122  * INPUT:	none
5123  * OUTPUT:	none
5124  * RETURNS:	void
5125  * PURPOSE:	print header line for sp_ext_node_t information.  to be used
5126  *		in conjunction with meta_sp_display_ext().
5127  */
5128 static void
5129 meta_sp_display_exthdr(void)
5130 {
5131 	(void) printf("%20s %5s %7s %20s %20s\n",
5132 	    dgettext(TEXT_DOMAIN, "Name"),
5133 	    dgettext(TEXT_DOMAIN, "Seq#"),
5134 	    dgettext(TEXT_DOMAIN, "Type"),
5135 	    dgettext(TEXT_DOMAIN, "Offset"),
5136 	    dgettext(TEXT_DOMAIN, "Length"));
5137 }
5138 
5139 
5140 /*
5141  * FUNCTION:	meta_sp_display_ext()
5142  * INPUT:	ext	- extent to display
5143  * OUTPUT:	none
5144  * RETURNS:	void
5145  * PURPOSE:	print selected fields from sp_ext_node_t.
5146  */
5147 static void
5148 meta_sp_display_ext(sp_ext_node_t *ext)
5149 {
5150 	/* print extent information */
5151 	if (ext->ext_namep != NULL)
5152 		(void) printf("%20s ", ext->ext_namep->cname);
5153 	else
5154 		(void) printf("%20s ", "NONE");
5155 
5156 	(void) printf("%5u ", ext->ext_seq);
5157 
5158 	switch (ext->ext_type) {
5159 	case EXTTYP_ALLOC:
5160 		(void) printf("%7s ", "ALLOC");
5161 		break;
5162 	case EXTTYP_FREE:
5163 		(void) printf("%7s ", "FREE");
5164 		break;
5165 	case EXTTYP_RESERVED:
5166 		(void) printf("%7s ", "RESV");
5167 		break;
5168 	case EXTTYP_END:
5169 		(void) printf("%7s ", "END");
5170 		break;
5171 	default:
5172 		(void) printf("%7s ", "INVLD");
5173 		break;
5174 	}
5175 
5176 	(void) printf("%20llu %20llu\n", ext->ext_offset, ext->ext_length);
5177 }
5178 
5179 
5180 /*
5181  * FUNCTION:	meta_sp_checkseq()
5182  * INPUT:	extlist	- list of extents to be checked
5183  * OUTPUT:	none
5184  * RETURNS:	int	- 0 - success, -1 - error
5185  * PURPOSE:	check soft partition sequence numbers.  this function assumes
5186  *		that a list of extents representing 1 or more soft partitions
5187  *		is passed in sorted in sequence number order.  within a
5188  *		single soft partition, there may not be any missing or
5189  *		duplicate sequence numbers.
5190  */
5191 static int
5192 meta_sp_checkseq(sp_ext_node_t *extlist)
5193 {
5194 	sp_ext_node_t *ext;
5195 
5196 	assert(extlist != NULL);
5197 
5198 	for (ext = extlist;
5199 	    ext->ext_next != NULL && ext->ext_next->ext_type == EXTTYP_ALLOC;
5200 	    ext = ext->ext_next) {
5201 		if (ext->ext_next->ext_namep != NULL &&
5202 		    strcmp(ext->ext_next->ext_namep->cname,
5203 		    ext->ext_namep->cname) != 0)
5204 				continue;
5205 
5206 		if (ext->ext_next->ext_seq != ext->ext_seq + 1) {
5207 			(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5208 			    "%s: sequence numbers are "
5209 			    "incorrect: %d should be %d\n"),
5210 			    ext->ext_next->ext_namep->cname,
5211 			    ext->ext_next->ext_seq, ext->ext_seq + 1);
5212 			return (-1);
5213 		}
5214 	}
5215 	return (0);
5216 }
5217 
5218 
5219 /*
5220  * FUNCTION:	meta_sp_resolve_name_conflict()
5221  * INPUT:	sp	- name of set we're are recovering in.
5222  *		old_np	- name pointer of soft partition we found on disk.
5223  * OUTPUT:	new_np	- name pointer for new soft partition name.
5224  *		ep	- error pointer returned.
5225  * RETURNS:	int	- 0 - name not replace, 1 - name replaced, -1 - error
5226  * PURPOSE:	Check to see if the name of one of the soft partitions we found
5227  *		on disk already exists in the metadb.  If so, prompt for a new
5228  *		name.  In addition, we keep a static array of names that
5229  *		will be recovered from this device since these names don't
5230  *		exist in the configuration at this point but cannot be
5231  *		recovered more than once.
5232  */
5233 static int
5234 meta_sp_resolve_name_conflict(
5235 	mdsetname_t	*sp,
5236 	mdname_t	*old_np,
5237 	mdname_t	**new_np,
5238 	md_error_t	*ep
5239 )
5240 {
5241 	char		yesno[255];
5242 	char		*yes;
5243 	char		newname[MD_SP_MAX_DEVNAME_PLUS_1];
5244 	int		nunits;
5245 	static int	*used_names = NULL;
5246 
5247 	assert(old_np != NULL);
5248 
5249 	if (used_names == NULL) {
5250 		if ((nunits = meta_get_nunits(ep)) < 0)
5251 			return (-1);
5252 		used_names = Zalloc(nunits * sizeof (int));
5253 	}
5254 
5255 	/* see if it exists already */
5256 	if (used_names[MD_MIN2UNIT(meta_getminor(old_np->dev))] == 0 &&
5257 	    metagetmiscname(old_np, ep) == NULL) {
5258 		if (! mdismderror(ep, MDE_UNIT_NOT_SETUP))
5259 			return (-1);
5260 		else {
5261 			used_names[MD_MIN2UNIT(meta_getminor(old_np->dev))] = 1;
5262 			mdclrerror(ep);
5263 			return (0);
5264 		}
5265 	}
5266 
5267 	/* name exists, ask the user for a new one */
5268 	(void) printf(dgettext(TEXT_DOMAIN,
5269 	    "WARNING: A soft partition named %s was found in the extent\n"
5270 	    "headers, but this name already exists in the metadb "
5271 	    "configuration.\n"
5272 	    "In order to continue recovery you must supply\n"
5273 	    "a new name for this soft partition.\n"), old_np->cname);
5274 	(void) printf(dgettext(TEXT_DOMAIN,
5275 	    "Would you like to continue and supply a new name? (yes/no) "));
5276 
5277 	(void) fflush(stdout);
5278 	if ((fgets(yesno, sizeof (yesno), stdin) == NULL) ||
5279 	    (strlen(yesno) == 1))
5280 		(void) snprintf(yesno, sizeof (yesno), "%s\n",
5281 		    dgettext(TEXT_DOMAIN, "no"));
5282 	yes = dgettext(TEXT_DOMAIN, "yes");
5283 	if (strncasecmp(yesno, yes, strlen(yesno) - 1) != 0) {
5284 		return (-1);
5285 	}
5286 
5287 	(void) fflush(stdin);
5288 
5289 	/* get the new name */
5290 	for (;;) {
5291 		(void) printf(dgettext(TEXT_DOMAIN, "Please enter a new name "
5292 		    "for this soft partition (dXXXX) "));
5293 		(void) fflush(stdout);
5294 		if (fgets(newname, MD_SP_MAX_DEVNAME_PLUS_1, stdin) == NULL)
5295 			(void) strcpy(newname, "");
5296 
5297 		/* remove newline character */
5298 		if (newname[strlen(newname) - 1] == '\n')
5299 			newname[strlen(newname) - 1] = '\0';
5300 
5301 		if (!(is_metaname(newname)) ||
5302 		    (meta_init_make_device(&sp, newname, ep) <= 0)) {
5303 			(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5304 			    "Invalid metadevice name\n"));
5305 			(void) fflush(stderr);
5306 			continue;
5307 		}
5308 
5309 		if ((*new_np = metaname(&sp, newname,
5310 		    META_DEVICE, ep)) == NULL) {
5311 			(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5312 			    "Invalid metadevice name\n"));
5313 			(void) fflush(stderr);
5314 			continue;
5315 		}
5316 
5317 		assert(MD_MIN2UNIT(meta_getminor((*new_np)->dev)) < nunits);
5318 		/* make sure the name isn't already being used */
5319 		if (used_names[MD_MIN2UNIT(meta_getminor((*new_np)->dev))] ||
5320 		    metagetmiscname(*new_np, ep) != NULL) {
5321 			(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5322 			    "That name already exists\n"));
5323 			continue;
5324 		} else if (! mdismderror(ep, MDE_UNIT_NOT_SETUP))
5325 			return (-1);
5326 
5327 		break;
5328 	}
5329 
5330 	/* got a new name, place in used array and return */
5331 	used_names[MD_MIN2UNIT(meta_getminor((*new_np)->dev))] = 1;
5332 	mdclrerror(ep);
5333 	return (1);
5334 }
5335 
5336 /*
5337  * FUNCTION:	meta_sp_validate_wm()
5338  * INPUT:	sp	- set name we are recovering in
5339  *		compnp	- name pointer for device we are recovering from
5340  *		options	- metarecover options
5341  * OUTPUT:	ep	- error pointer returned
5342  * RETURNS:	int	- 0 - success, -1 - error
5343  * PURPOSE:	validate and display watermark configuration.  walk the
5344  *		on-disk watermark structures and validate the information
5345  *		found within.  since a watermark configuration is
5346  *		"self-defining", the act of traversing the watermarks
5347  *		is part of the validation process.
5348  */
5349 static int
5350 meta_sp_validate_wm(
5351 	mdsetname_t	*sp,
5352 	mdname_t	*compnp,
5353 	mdcmdopts_t	options,
5354 	md_error_t	*ep
5355 )
5356 {
5357 	sp_ext_node_t	*extlist = NULL;
5358 	sp_ext_node_t	*ext;
5359 	int		num_sps = 0;
5360 	int		rval;
5361 
5362 	if ((options & MDCMD_VERBOSE) != 0)
5363 		(void) printf(dgettext(TEXT_DOMAIN,
5364 		    "Verifying on-disk structures on %s.\n"),
5365 		    compnp->cname);
5366 
5367 	/*
5368 	 * for each watermark, build an ext_node, place on list.
5369 	 */
5370 	rval = meta_sp_extlist_from_wm(sp, compnp, &extlist,
5371 	    meta_sp_cmp_by_nameseq, ep);
5372 
5373 	if ((options & MDCMD_VERBOSE) != 0) {
5374 		/* print out what we found */
5375 		if (extlist == NULL)
5376 			(void) printf(dgettext(TEXT_DOMAIN,
5377 			    "No extent headers found on %s.\n"),
5378 			    compnp->cname);
5379 		else {
5380 			(void) printf(dgettext(TEXT_DOMAIN,
5381 			    "The following extent headers were found on %s.\n"),
5382 			    compnp->cname);
5383 			meta_sp_display_exthdr();
5384 		}
5385 		for (ext = extlist; ext != NULL; ext = ext->ext_next)
5386 			meta_sp_display_ext(ext);
5387 	}
5388 
5389 	if (rval < 0) {
5390 		(void) printf(dgettext(TEXT_DOMAIN,
5391 		    "%s: On-disk structures invalid or "
5392 		    "no soft partitions found.\n"),
5393 		    compnp->cname);
5394 		return (-1);
5395 	}
5396 
5397 	assert(extlist != NULL);
5398 
5399 	/* count number of soft partitions */
5400 	for (ext = extlist;
5401 	    ext != NULL && ext->ext_type == EXTTYP_ALLOC;
5402 	    ext = ext->ext_next) {
5403 		if (ext->ext_next != NULL &&
5404 		    ext->ext_next->ext_namep != NULL &&
5405 		    strcmp(ext->ext_next->ext_namep->cname,
5406 		    ext->ext_namep->cname) == 0)
5407 				continue;
5408 		num_sps++;
5409 	}
5410 
5411 	if ((options & MDCMD_VERBOSE) != 0)
5412 		(void) printf(dgettext(TEXT_DOMAIN,
5413 		    "Found %d soft partition(s) on %s.\n"), num_sps,
5414 		    compnp->cname);
5415 
5416 	if (num_sps == 0) {
5417 		(void) printf(dgettext(TEXT_DOMAIN,
5418 		    "%s: No soft partitions.\n"), compnp->cname);
5419 		return (mdmderror(ep, MDE_RECOVER_FAILED, 0, compnp->cname));
5420 	}
5421 
5422 	/* check sequence numbers */
5423 	if ((options & MDCMD_VERBOSE) != 0)
5424 		(void) printf(dgettext(TEXT_DOMAIN,
5425 		    "Checking sequence numbers.\n"));
5426 
5427 	if (meta_sp_checkseq(extlist) != 0)
5428 		return (mdmderror(ep, MDE_RECOVER_FAILED, 0, compnp->cname));
5429 
5430 	return (0);
5431 }
5432 
5433 /*
5434  * FUNCTION:	meta_sp_validate_unit()
5435  * INPUT:	sp	- name of set we are recovering in
5436  *		compnp	- name of component we are recovering from
5437  *		options	- metarecover options
5438  * OUTPUT:	ep	- error pointer returned
5439  * RETURNS:	int	- 0 - success, -1 - error
5440  * PURPOSE:	validate and display metadb configuration.  begin by getting
5441  *		all soft partitions built on the specified component.  get
5442  *		the unit structure for each one and validate the fields within.
5443  */
5444 static int
5445 meta_sp_validate_unit(
5446 	mdsetname_t	*sp,
5447 	mdname_t	*compnp,
5448 	mdcmdopts_t	options,
5449 	md_error_t	*ep
5450 )
5451 {
5452 	md_sp_t		*msp;
5453 	mdnamelist_t	*spnlp = NULL;
5454 	mdnamelist_t	*namep = NULL;
5455 	int		count;
5456 	uint_t		extn;
5457 	sp_ext_length_t	size;
5458 
5459 	if ((options & MDCMD_VERBOSE) != 0)
5460 		(void) printf(dgettext(TEXT_DOMAIN,
5461 		    "%s: Validating soft partition metadb entries.\n"),
5462 		    compnp->cname);
5463 
5464 	if ((size = metagetsize(compnp, ep)) == MD_DISKADDR_ERROR)
5465 		return (-1);
5466 
5467 	/* get all soft partitions on component */
5468 	count = meta_sp_get_by_component(sp, compnp, &spnlp, 0, ep);
5469 
5470 	if (count == 0) {
5471 		(void) printf(dgettext(TEXT_DOMAIN,
5472 		    "%s: No soft partitions.\n"), compnp->cname);
5473 		return (mdmderror(ep, MDE_RECOVER_FAILED, 0, compnp->cname));
5474 	} else if (count < 0) {
5475 		return (-1);
5476 	}
5477 
5478 	/* Now go through the soft partitions and check each one */
5479 	for (namep = spnlp; namep != NULL; namep = namep->next) {
5480 		mdname_t	*curnp = namep->namep;
5481 		sp_ext_offset_t	curvoff;
5482 
5483 		/* get the unit structure */
5484 		if ((msp = meta_get_sp_common(sp, curnp, 0, ep)) == NULL)
5485 			return (-1);
5486 
5487 		/* verify generic unit structure parameters */
5488 		if ((options & MDCMD_VERBOSE) != 0)
5489 			(void) printf(dgettext(TEXT_DOMAIN,
5490 			    "\nVerifying device %s.\n"),
5491 			    curnp->cname);
5492 
5493 		/*
5494 		 * MD_SP_LAST is an invalid state and is always the
5495 		 * highest numbered.
5496 		 */
5497 		if (msp->status >= MD_SP_LAST) {
5498 			(void) printf(dgettext(TEXT_DOMAIN,
5499 			    "%s: status value %u is out of range.\n"),
5500 			    curnp->cname, msp->status);
5501 			return (mdmderror(ep, MDE_RECOVER_FAILED,
5502 			    0, curnp->cname));
5503 		} else if ((options & MDCMD_VERBOSE) != 0) {
5504 			uint_t	tstate = 0;
5505 
5506 			if (metaismeta(msp->compnamep)) {
5507 				if (meta_get_tstate(msp->common.namep->dev,
5508 				    &tstate, ep) != 0)
5509 					return (-1);
5510 			}
5511 			(void) printf(dgettext(TEXT_DOMAIN,
5512 			    "%s: Status \"%s\" is valid.\n"),
5513 			    curnp->cname, meta_sp_status_to_name(msp->status,
5514 			    tstate & MD_DEV_ERRORED));
5515 		}
5516 
5517 		/* Now verify each extent */
5518 		if ((options & MDCMD_VERBOSE) != 0)
5519 			(void) printf("%14s %21s %21s %21s\n",
5520 			    dgettext(TEXT_DOMAIN, "Extent Number"),
5521 			    dgettext(TEXT_DOMAIN, "Virtual Offset"),
5522 			    dgettext(TEXT_DOMAIN, "Physical Offset"),
5523 			    dgettext(TEXT_DOMAIN, "Length"));
5524 
5525 		curvoff = 0ULL;
5526 		for (extn = 0; extn < msp->ext.ext_len; extn++) {
5527 			md_sp_ext_t	*extp = &msp->ext.ext_val[extn];
5528 
5529 			if ((options & MDCMD_VERBOSE) != 0)
5530 				(void) printf("%14u %21llu %21llu %21llu\n",
5531 				    extn, extp->voff, extp->poff, extp->len);
5532 
5533 			if (extp->voff != curvoff) {
5534 				(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5535 				    "%s: virtual offset for extent %u "
5536 				    "is inconsistent, expected %llu, "
5537 				    "got %llu.\n"), curnp->cname, extn,
5538 				    curvoff, extp->voff);
5539 				return (mdmderror(ep, MDE_RECOVER_FAILED,
5540 				    0, compnp->cname));
5541 			}
5542 
5543 			/* make sure extent does not drop off the end */
5544 			if ((extp->poff + extp->len) == size) {
5545 				(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5546 				    "%s: extent %u at offset %llu, "
5547 				    "length %llu exceeds the size of the "
5548 				    "device, %llu.\n"), curnp->cname,
5549 				    extn, extp->poff, extp->len, size);
5550 				return (mdmderror(ep, MDE_RECOVER_FAILED,
5551 				    0, compnp->cname));
5552 			}
5553 
5554 			curvoff += extp->len;
5555 		}
5556 	}
5557 	if (options & MDCMD_PRINT) {
5558 		(void) printf(dgettext(TEXT_DOMAIN,
5559 		    "%s: Soft Partition metadb configuration is valid\n"),
5560 		    compnp->cname);
5561 	}
5562 	return (0);
5563 }
5564 
5565 /*
5566  * FUNCTION:	meta_sp_validate_wm_and_unit()
5567  * INPUT:	sp	- name of set we are recovering in
5568  *		compnp	- name of device we are recovering from
5569  *		options	- metarecover options
5570  * OUTPUT:	ep	- error pointer returned
5571  * RETURNS:	int	- 0 - success, -1 error
5572  * PURPOSE:	cross-validate and display watermarks and metadb records.
5573  *		get both the unit structures for the soft partitions built
5574  *		on the specified component and the watermarks found on that
5575  *		component and check to make sure they are consistent with
5576  *		each other.
5577  */
5578 static int
5579 meta_sp_validate_wm_and_unit(
5580 	mdsetname_t	*sp,
5581 	mdname_t	*np,
5582 	mdcmdopts_t	options,
5583 	md_error_t	*ep
5584 )
5585 {
5586 	sp_ext_node_t	*wmlist = NULL;
5587 	sp_ext_node_t	*unitlist = NULL;
5588 	sp_ext_node_t	*unitext;
5589 	sp_ext_node_t	*wmext;
5590 	sp_ext_offset_t	tmpunitoff;
5591 	mdnamelist_t	*spnlp = NULL;
5592 	int		count;
5593 	int		rval = 0;
5594 	int		verbose = (options & MDCMD_VERBOSE);
5595 
5596 	/* get unit structure list */
5597 	count = meta_sp_get_by_component(sp, np, &spnlp, 0, ep);
5598 	if (count <= 0)
5599 		return (-1);
5600 
5601 	meta_sp_list_insert(NULL, NULL, &unitlist,
5602 	    metagetsize(np, ep) - MD_SP_WMSIZE, MD_SP_WMSIZE,
5603 	    EXTTYP_END, 0, EXTFLG_UPDATE, meta_sp_cmp_by_offset);
5604 
5605 	if (meta_sp_extlist_from_namelist(sp, spnlp, &unitlist, ep) == -1) {
5606 		metafreenamelist(spnlp);
5607 		return (-1);
5608 	}
5609 
5610 	metafreenamelist(spnlp);
5611 
5612 	meta_sp_list_freefill(&unitlist, metagetsize(np, ep));
5613 
5614 	if (meta_sp_extlist_from_wm(sp, np, &wmlist,
5615 	    meta_sp_cmp_by_offset, ep) < 0) {
5616 		meta_sp_list_free(&unitlist);
5617 		return (-1);
5618 	}
5619 
5620 	if (getenv(META_SP_DEBUG)) {
5621 		meta_sp_debug("meta_sp_validate_wm_and_unit: unit list:\n");
5622 		meta_sp_list_dump(unitlist);
5623 		meta_sp_debug("meta_sp_validate_wm_and_unit: wm list:\n");
5624 		meta_sp_list_dump(wmlist);
5625 	}
5626 
5627 	/*
5628 	 * step through both lists and compare allocated nodes.  Free
5629 	 * nodes and end watermarks may differ between the two but
5630 	 * that's generally ok, and if they're wrong will typically
5631 	 * cause misplaced allocated extents.
5632 	 */
5633 	if (verbose)
5634 		(void) printf(dgettext(TEXT_DOMAIN, "\n%s: Verifying metadb "
5635 		    "allocations match extent headers.\n"), np->cname);
5636 
5637 	unitext = unitlist;
5638 	wmext = wmlist;
5639 	while ((wmext != NULL) && (unitext != NULL)) {
5640 		/* find next allocated extents in each list */
5641 		while (wmext != NULL && wmext->ext_type != EXTTYP_ALLOC)
5642 			wmext = wmext->ext_next;
5643 
5644 		while (unitext != NULL && unitext->ext_type != EXTTYP_ALLOC)
5645 			unitext = unitext->ext_next;
5646 
5647 		if (wmext == NULL || unitext == NULL)
5648 			break;
5649 
5650 		if (verbose) {
5651 			(void) printf(dgettext(TEXT_DOMAIN,
5652 			    "Metadb extent:\n"));
5653 			meta_sp_display_exthdr();
5654 			meta_sp_display_ext(unitext);
5655 			(void) printf(dgettext(TEXT_DOMAIN,
5656 			    "Extent header extent:\n"));
5657 			meta_sp_display_exthdr();
5658 			meta_sp_display_ext(wmext);
5659 			(void) printf("\n");
5660 		}
5661 
5662 		if (meta_sp_validate_exts(np, wmext, unitext, ep) < 0)
5663 			rval = -1;
5664 
5665 		/*
5666 		 * if the offsets aren't equal, only increment the
5667 		 * lowest one in hopes of getting the lists back in sync.
5668 		 */
5669 		tmpunitoff = unitext->ext_offset;
5670 		if (unitext->ext_offset <= wmext->ext_offset)
5671 			unitext = unitext->ext_next;
5672 		if (wmext->ext_offset <= tmpunitoff)
5673 			wmext = wmext->ext_next;
5674 	}
5675 
5676 	/*
5677 	 * if both lists aren't at the end then there are extra
5678 	 * allocated nodes in one of them.
5679 	 */
5680 	if (wmext != NULL) {
5681 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5682 		    "%s: extent headers contain allocations not in "
5683 		    "the metadb\n\n"), np->cname);
5684 		rval = -1;
5685 	}
5686 
5687 	if (unitext != NULL) {
5688 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5689 		    "%s: metadb contains allocations not in the extent "
5690 		    "headers\n\n"), np->cname);
5691 		rval = -1;
5692 	}
5693 
5694 	if (options & MDCMD_PRINT) {
5695 		if (rval == 0) {
5696 			(void) printf(dgettext(TEXT_DOMAIN,
5697 			    "%s: Soft Partition metadb matches extent "
5698 			    "header configuration\n"), np->cname);
5699 		} else {
5700 			(void) printf(dgettext(TEXT_DOMAIN,
5701 			    "%s: Soft Partition metadb does not match extent "
5702 			    "header configuration\n"), np->cname);
5703 		}
5704 	}
5705 
5706 	return (rval);
5707 }
5708 
5709 /*
5710  * FUNCTION:	meta_sp_validate_exts()
5711  * INPUT:	compnp	- name pointer for device we are recovering from
5712  *		wmext	- extent node representing watermark
5713  *		unitext	- extent node from unit structure
5714  * OUTPUT:	ep	- return error pointer
5715  * RETURNS:	int	- 0 - succes, mdmderror return code - error
5716  * PURPOSE:	Takes two extent nodes and checks them against each other.
5717  *		offset, length, sequence number, set, and name are compared.
5718  */
5719 static int
5720 meta_sp_validate_exts(
5721 	mdname_t	*compnp,
5722 	sp_ext_node_t	*wmext,
5723 	sp_ext_node_t	*unitext,
5724 	md_error_t	*ep
5725 )
5726 {
5727 	if (wmext->ext_offset != unitext->ext_offset) {
5728 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5729 		    "%s: unit structure and extent header offsets differ.\n"),
5730 		    compnp->cname);
5731 		return (mdmderror(ep, MDE_RECOVER_FAILED, 0, compnp->cname));
5732 	}
5733 
5734 	if (wmext->ext_length != unitext->ext_length) {
5735 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5736 		    "%s: unit structure and extent header lengths differ.\n"),
5737 		    compnp->cname);
5738 		return (mdmderror(ep, MDE_RECOVER_FAILED, 0, compnp->cname));
5739 	}
5740 
5741 	if (wmext->ext_seq != unitext->ext_seq) {
5742 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5743 		    "%s: unit structure and extent header sequence numbers "
5744 		    "differ.\n"), compnp->cname);
5745 		return (mdmderror(ep, MDE_RECOVER_FAILED, 0, compnp->cname));
5746 	}
5747 
5748 	if (wmext->ext_type != unitext->ext_type) {
5749 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5750 		    "%s: unit structure and extent header types differ.\n"),
5751 		    compnp->cname);
5752 		return (mdmderror(ep, MDE_RECOVER_FAILED, 0, compnp->cname));
5753 	}
5754 
5755 	/*
5756 	 * If one has a set pointer and the other doesn't, error.
5757 	 * If both extents have setnames, then make sure they match
5758 	 * If both are NULL, it's ok, they match.
5759 	 */
5760 	if ((unitext->ext_setp == NULL) ^ (wmext->ext_setp == NULL)) {
5761 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5762 		    "%s: unit structure and extent header set values "
5763 		    "differ.\n"), compnp->cname);
5764 		return (mdmderror(ep, MDE_RECOVER_FAILED, 0, compnp->cname));
5765 	}
5766 
5767 	if (unitext->ext_setp != NULL) {
5768 		if (strcmp(unitext->ext_setp->setname,
5769 		    wmext->ext_setp->setname) != 0) {
5770 			(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5771 			    "%s: unit structure and extent header set names "
5772 			    "differ.\n"), compnp->cname);
5773 			return (mdmderror(ep, MDE_RECOVER_FAILED,
5774 			    0, compnp->cname));
5775 		}
5776 	}
5777 
5778 	/*
5779 	 * If one has a name pointer and the other doesn't, error.
5780 	 * If both extents have names, then make sure they match
5781 	 * If both are NULL, it's ok, they match.
5782 	 */
5783 	if ((unitext->ext_namep == NULL) ^ (wmext->ext_namep == NULL)) {
5784 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5785 		    "%s: unit structure and extent header name values "
5786 		    "differ.\n"), compnp->cname);
5787 		return (mdmderror(ep, MDE_RECOVER_FAILED, 0, compnp->cname));
5788 	}
5789 
5790 	if (unitext->ext_namep != NULL) {
5791 		if (strcmp(wmext->ext_namep->cname,
5792 		    unitext->ext_namep->cname) != 0) {
5793 			(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5794 			    "%s: unit structure and extent header names "
5795 			    "differ.\n"), compnp->cname);
5796 			return (mdmderror(ep, MDE_RECOVER_FAILED,
5797 			    0, compnp->cname));
5798 		}
5799 	}
5800 
5801 	return (0);
5802 }
5803 
5804 /*
5805  * FUNCTION:	update_sp_status()
5806  * INPUT:	sp	- name of set we are recovering in
5807  *		minors	- pointer to an array of soft partition minor numbers
5808  *		num_sps	- number of minor numbers in array
5809  *		status	- new status to be applied to all soft parts in array
5810  *		mn_set	- set if current set is a multi-node set
5811  * OUTPUT:	ep	- return error pointer
5812  * RETURNS:	int	- 0 - success, -1 - error
5813  * PURPOSE:	update  status of soft partitions to new status. minors is an
5814  *		array of minor numbers to apply the new status to.
5815  *		If mn_set is set, a message is sent to all nodes in the
5816  *		cluster to update the status locally.
5817  */
5818 static int
5819 update_sp_status(
5820 	mdsetname_t	*sp,
5821 	minor_t		*minors,
5822 	int		num_sps,
5823 	sp_status_t	status,
5824 	bool_t		mn_set,
5825 	md_error_t	*ep
5826 )
5827 {
5828 	int	i;
5829 	int	err = 0;
5830 
5831 	if (mn_set) {
5832 		md_mn_msg_sp_setstat_t	sp_setstat_params;
5833 		int			result;
5834 		md_mn_result_t		*resp = NULL;
5835 
5836 		for (i = 0; i < num_sps; i++) {
5837 			sp_setstat_params.sp_setstat_mnum = minors[i];
5838 			sp_setstat_params.sp_setstat_status = status;
5839 
5840 			result = mdmn_send_message(sp->setno,
5841 			    MD_MN_MSG_SP_SETSTAT, MD_MSGF_DEFAULT_FLAGS,
5842 			    (char *)&sp_setstat_params,
5843 			    sizeof (sp_setstat_params),
5844 			    &resp, ep);
5845 			if (resp != NULL) {
5846 				if (resp->mmr_exitval != 0)
5847 					err = -1;
5848 				free_result(resp);
5849 			}
5850 			if (result != 0) {
5851 				err = -1;
5852 			}
5853 		}
5854 	} else {
5855 		if (meta_sp_setstatus(sp, minors, num_sps, status, ep) < 0)
5856 			err = -1;
5857 	}
5858 	if (err < 0) {
5859 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5860 		    "Error updating status on recovered soft "
5861 		    "partitions.\n"));
5862 	}
5863 	return (err);
5864 }
5865 
5866 /*
5867  * FUNCTION:	meta_sp_recover_from_wm()
5868  * INPUT:	sp	- name of set we are recovering in
5869  *		compnp	- name pointer for component we are recovering from
5870  *		options	- metarecover options
5871  * OUTPUT:	ep	- return error pointer
5872  * RETURNS:	int	- 0 - success, -1 - error
5873  * PURPOSE:	update metadb records to match watermarks.  begin by getting
5874  *		an extlist representing all soft partitions on the component.
5875  *		then build a unit structure for each soft partition.
5876  *		notify user of changes, then commit each soft partition to
5877  *		the metadb one at a time in the "recovering" state.  update
5878  *		any watermarks that may need it	(to reflect possible name
5879  *		changes), and, finally, set the status of all recovered
5880  *		partitions to the "OK" state at once.
5881  */
5882 static int
5883 meta_sp_recover_from_wm(
5884 	mdsetname_t	*sp,
5885 	mdname_t	*compnp,
5886 	mdcmdopts_t	options,
5887 	md_error_t	*ep
5888 )
5889 {
5890 	sp_ext_node_t		*extlist = NULL;
5891 	sp_ext_node_t		*sp_list = NULL;
5892 	sp_ext_node_t		*update_list = NULL;
5893 	sp_ext_node_t		*ext;
5894 	sp_ext_node_t		*sp_ext;
5895 	mp_unit_t		*mp;
5896 	mp_unit_t		**un_array;
5897 	int			numexts = 0, num_sps = 0, i = 0;
5898 	int			err = 0;
5899 	int			not_recovered = 0;
5900 	int			committed = 0;
5901 	sp_ext_length_t		sp_length = 0LL;
5902 	mdnamelist_t		*keynlp = NULL;
5903 	mdname_t		*np;
5904 	mdname_t		*new_np;
5905 	int			new_name;
5906 	md_set_params_t		set_params;
5907 	minor_t			*minors = NULL;
5908 	char			yesno[255];
5909 	char			*yes;
5910 	bool_t			mn_set = 0;
5911 	md_set_desc		*sd;
5912 	mm_unit_t		*mm;
5913 	md_set_mmown_params_t	*ownpar = NULL;
5914 	int			comp_is_mirror = 0;
5915 
5916 	/*
5917 	 * if this component appears in another metadevice already, do
5918 	 * NOT recover from it.
5919 	 */
5920 	if (meta_check_inmeta(sp, compnp, options, 0, -1, ep) != 0)
5921 		return (-1);
5922 
5923 	/* set flag if dealing with a MN set */
5924 	if (!metaislocalset(sp)) {
5925 		if ((sd = metaget_setdesc(sp, ep)) == NULL) {
5926 			return (-1);
5927 		}
5928 		if (MD_MNSET_DESC(sd))
5929 			mn_set = 1;
5930 	}
5931 	/*
5932 	 * for each watermark, build an ext_node, place on list.
5933 	 */
5934 	if (meta_sp_extlist_from_wm(sp, compnp, &extlist,
5935 	    meta_sp_cmp_by_nameseq, ep) < 0)
5936 		return (mdmderror(ep, MDE_RECOVER_FAILED, 0, compnp->cname));
5937 
5938 	assert(extlist != NULL);
5939 
5940 	/* count number of soft partitions */
5941 	for (ext = extlist;
5942 	    ext != NULL && ext->ext_type == EXTTYP_ALLOC;
5943 	    ext = ext->ext_next) {
5944 		if (ext->ext_next != NULL &&
5945 		    ext->ext_next->ext_namep != NULL &&
5946 		    strcmp(ext->ext_next->ext_namep->cname,
5947 		    ext->ext_namep->cname) == 0)
5948 				continue;
5949 		num_sps++;
5950 	}
5951 
5952 	/* allocate array of unit structure pointers */
5953 	un_array = Zalloc(num_sps * sizeof (mp_unit_t *));
5954 
5955 	/*
5956 	 * build unit structures from list of ext_nodes.
5957 	 */
5958 	for (ext = extlist;
5959 	    ext != NULL && ext->ext_type == EXTTYP_ALLOC;
5960 	    ext = ext->ext_next) {
5961 		meta_sp_list_insert(ext->ext_setp, ext->ext_namep,
5962 		    &sp_list, ext->ext_offset, ext->ext_length,
5963 		    ext->ext_type, ext->ext_seq, ext->ext_flags,
5964 		    meta_sp_cmp_by_nameseq);
5965 
5966 		numexts++;
5967 		sp_length += ext->ext_length - MD_SP_WMSIZE;
5968 
5969 		if (ext->ext_next != NULL &&
5970 		    ext->ext_next->ext_namep != NULL &&
5971 		    strcmp(ext->ext_next->ext_namep->cname,
5972 		    ext->ext_namep->cname) == 0)
5973 				continue;
5974 
5975 		/*
5976 		 * if we made it here, we are at a soft partition
5977 		 * boundary in the list.
5978 		 */
5979 		if (getenv(META_SP_DEBUG)) {
5980 			meta_sp_debug("meta_recover_from_wm: dumping wm "
5981 			    "list:\n");
5982 			meta_sp_list_dump(sp_list);
5983 		}
5984 
5985 		assert(sp_list != NULL);
5986 		assert(sp_list->ext_namep != NULL);
5987 
5988 		if ((new_name = meta_sp_resolve_name_conflict(sp,
5989 		    sp_list->ext_namep, &new_np, ep)) < 0) {
5990 			err = 1;
5991 			goto out;
5992 		} else if (new_name) {
5993 			for (sp_ext = sp_list;
5994 			    sp_ext != NULL;
5995 			    sp_ext = sp_ext->ext_next) {
5996 				/*
5997 				 * insert into the update list for
5998 				 * watermark update.
5999 				 */
6000 				meta_sp_list_insert(sp_ext->ext_setp,
6001 				    new_np, &update_list, sp_ext->ext_offset,
6002 				    sp_ext->ext_length, sp_ext->ext_type,
6003 				    sp_ext->ext_seq, EXTFLG_UPDATE,
6004 				    meta_sp_cmp_by_offset);
6005 			}
6006 
6007 		}
6008 		if (options & MDCMD_DOIT) {
6009 			/* store name in namespace */
6010 			if (mn_set) {
6011 				/* send message to all nodes to return key */
6012 				md_mn_msg_addkeyname_t	*send_params;
6013 				int			result;
6014 				md_mn_result_t		*resp = NULL;
6015 				int			message_size;
6016 
6017 				message_size =  sizeof (*send_params) +
6018 				    strlen(compnp->cname) + 1;
6019 				send_params = Zalloc(message_size);
6020 				send_params->addkeyname_setno = sp->setno;
6021 				(void) strcpy(&send_params->addkeyname_name[0],
6022 				    compnp->cname);
6023 				result = mdmn_send_message(sp->setno,
6024 				    MD_MN_MSG_ADDKEYNAME, MD_MSGF_DEFAULT_FLAGS,
6025 				    (char *)send_params, message_size, &resp,
6026 				    ep);
6027 				Free(send_params);
6028 				if (resp != NULL) {
6029 					if (resp->mmr_exitval >= 0) {
6030 						compnp->key =
6031 						    (mdkey_t)resp->mmr_exitval;
6032 					} else {
6033 						err = 1;
6034 						free_result(resp);
6035 						goto out;
6036 					}
6037 					free_result(resp);
6038 				}
6039 				if (result != 0) {
6040 					err = 1;
6041 					goto out;
6042 				}
6043 				(void) metanamelist_append(&keynlp, compnp);
6044 			} else {
6045 				if (add_key_name(sp, compnp, &keynlp,
6046 				    ep) != 0) {
6047 					err = 1;
6048 					goto out;
6049 				}
6050 			}
6051 		}
6052 
6053 		/* create the unit structure */
6054 		if ((mp = meta_sp_createunit(
6055 		    (new_name) ? new_np : sp_list->ext_namep, compnp,
6056 		    sp_list, numexts, sp_length, MD_SP_RECOVER, ep)) == NULL) {
6057 			err = 1;
6058 			goto out;
6059 		}
6060 
6061 		if (getenv(META_SP_DEBUG)) {
6062 			meta_sp_debug("meta_sp_recover_from_wm: "
6063 			    "printing newly created unit structure");
6064 			meta_sp_printunit(mp);
6065 		}
6066 
6067 		/* place in unit structure array */
6068 		un_array[i++] = mp;
6069 
6070 		/* free sp_list */
6071 		meta_sp_list_free(&sp_list);
6072 		sp_list = NULL;
6073 		numexts = 0;
6074 		sp_length = 0LL;
6075 	}
6076 
6077 	/* display configuration updates */
6078 	(void) printf(dgettext(TEXT_DOMAIN,
6079 	    "The following soft partitions were found and will be added to\n"
6080 	    "your metadevice configuration.\n"));
6081 	(void) printf("%5s %15s %18s\n",
6082 	    dgettext(TEXT_DOMAIN, "Name"),
6083 	    dgettext(TEXT_DOMAIN, "Size"),
6084 	    dgettext(TEXT_DOMAIN, "No. of Extents"));
6085 	for (i = 0; i < num_sps; i++) {
6086 		(void) printf("%5s%lu %15llu %9d\n", "d",
6087 		    MD_MIN2UNIT(MD_SID(un_array[i])),
6088 		    un_array[i]->un_length, un_array[i]->un_numexts);
6089 	}
6090 
6091 	if (!(options & MDCMD_DOIT)) {
6092 		not_recovered = 1;
6093 		goto out;
6094 	}
6095 
6096 	/* ask user for confirmation */
6097 	(void) printf(dgettext(TEXT_DOMAIN,
6098 	    "WARNING: You are about to add one or more soft partition\n"
6099 	    "metadevices to your metadevice configuration.  If there\n"
6100 	    "appears to be an error in the soft partition(s) displayed\n"
6101 	    "above, do NOT proceed with this recovery operation.\n"));
6102 	(void) printf(dgettext(TEXT_DOMAIN,
6103 	    "Are you sure you want to do this (yes/no)? "));
6104 
6105 	(void) fflush(stdout);
6106 	if ((fgets(yesno, sizeof (yesno), stdin) == NULL) ||
6107 	    (strlen(yesno) == 1))
6108 		(void) snprintf(yesno, sizeof (yesno), "%s\n",
6109 		    dgettext(TEXT_DOMAIN, "no"));
6110 	yes = dgettext(TEXT_DOMAIN, "yes");
6111 	if (strncasecmp(yesno, yes, strlen(yesno) - 1) != 0) {
6112 		not_recovered = 1;
6113 		goto out;
6114 	}
6115 
6116 	/* commit records one at a time */
6117 	for (i = 0; i < num_sps; i++) {
6118 		(void) memset(&set_params, 0, sizeof (set_params));
6119 		set_params.mnum = MD_SID(un_array[i]);
6120 		set_params.size = (un_array[i])->c.un_size;
6121 		set_params.mdp = (uintptr_t)(un_array[i]);
6122 		set_params.options =
6123 		    meta_check_devicesize(un_array[i]->un_length);
6124 		if (set_params.options == MD_CRO_64BIT) {
6125 			un_array[i]->c.un_revision |= MD_64BIT_META_DEV;
6126 		} else {
6127 			un_array[i]->c.un_revision &= ~MD_64BIT_META_DEV;
6128 		}
6129 		MD_SETDRIVERNAME(&set_params, MD_SP,
6130 		    MD_MIN2SET(set_params.mnum));
6131 
6132 		np = metamnumname(&sp, MD_SID(un_array[i]), 0, ep);
6133 
6134 		/*
6135 		 * If this is an MN set, send the MD_IOCSET ioctl to all nodes
6136 		 */
6137 		if (mn_set) {
6138 			md_mn_msg_iocset_t	send_params;
6139 			int			result;
6140 			md_mn_result_t		*resp = NULL;
6141 			int			mess_size;
6142 
6143 			/*
6144 			 * Calculate message size. md_mn_msg_iocset_t only
6145 			 * contains one extent, so increment the size to
6146 			 * include all extents
6147 			 */
6148 			mess_size = sizeof (send_params) -
6149 			    sizeof (mp_ext_t) +
6150 			    (un_array[i]->un_numexts * sizeof (mp_ext_t));
6151 
6152 			send_params.iocset_params = set_params;
6153 			(void) memcpy(&send_params.unit, un_array[i],
6154 			    sizeof (*un_array[i]) - sizeof (mp_ext_t) +
6155 			    (un_array[i]->un_numexts * sizeof (mp_ext_t)));
6156 			result = mdmn_send_message(sp->setno,
6157 			    MD_MN_MSG_IOCSET, MD_MSGF_DEFAULT_FLAGS,
6158 			    (char *)&send_params, mess_size, &resp,
6159 			    ep);
6160 			if (resp != NULL) {
6161 				if (resp->mmr_exitval != 0)
6162 					err = 1;
6163 				free_result(resp);
6164 			}
6165 			if (result != 0) {
6166 				err = 1;
6167 			}
6168 		} else {
6169 			if (metaioctl(MD_IOCSET, &set_params, &set_params.mde,
6170 			    np->cname) != 0) {
6171 				err = 1;
6172 			}
6173 		}
6174 
6175 		if (err == 1) {
6176 			(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
6177 			    "%s: Error committing record to metadb.\n"),
6178 			    np->cname);
6179 			goto out;
6180 		}
6181 
6182 		/* note that we've committed a record */
6183 		if (!committed)
6184 			committed = 1;
6185 
6186 		/* update any watermarks that need it */
6187 		if (update_list != NULL) {
6188 			md_sp_t *msp;
6189 
6190 			/*
6191 			 * Check to see if we're trying to create a partition
6192 			 * on a mirror. If so we may have to enforce an
6193 			 * ownership change before writing the watermark out.
6194 			 */
6195 			if (metaismeta(compnp)) {
6196 				char *miscname;
6197 
6198 				miscname = metagetmiscname(compnp, ep);
6199 				if (miscname != NULL)
6200 					comp_is_mirror = (strcmp(miscname,
6201 					    MD_MIRROR) == 0);
6202 				else
6203 					comp_is_mirror = 0;
6204 			}
6205 			/*
6206 			 * If this is a MN set and the component is a mirror,
6207 			 * change ownership to this node in order to write the
6208 			 * watermarks
6209 			 */
6210 			if (mn_set && comp_is_mirror) {
6211 				mm = (mm_unit_t *)meta_get_unit(sp, compnp, ep);
6212 				if (mm == NULL) {
6213 					err = 1;
6214 					goto out;
6215 				} else {
6216 					err = meta_mn_change_owner(&ownpar,
6217 					    sp->setno,
6218 					    meta_getminor(compnp->dev),
6219 					    sd->sd_mn_mynode->nd_nodeid,
6220 					    MD_MN_MM_PREVENT_CHANGE |
6221 					    MD_MN_MM_SPAWN_THREAD);
6222 					if (err != 0)
6223 						goto out;
6224 				}
6225 			}
6226 
6227 			if ((msp = meta_get_sp(sp, np, ep)) == NULL) {
6228 				err = 1;
6229 				(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
6230 				    "%s: Error updating extent headers.\n"),
6231 				    np->cname);
6232 				goto out;
6233 			}
6234 			if (meta_sp_update_wm(sp, msp, update_list, ep) < 0) {
6235 				err = 1;
6236 				(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
6237 				    "%s: Error updating extent headers "
6238 				    "on disk.\n"), np->cname);
6239 				goto out;
6240 			}
6241 		}
6242 		/*
6243 		 * If we have changed ownership earlier and prevented any
6244 		 * ownership changes, we can now allow ownership changes
6245 		 * again.
6246 		 */
6247 		if (ownpar) {
6248 			(void) meta_mn_change_owner(&ownpar, sp->setno,
6249 			    ownpar->d.mnum,
6250 			    ownpar->d.owner,
6251 			    MD_MN_MM_ALLOW_CHANGE | MD_MN_MM_SPAWN_THREAD);
6252 		}
6253 	}
6254 
6255 	/* update status of all soft partitions to OK */
6256 	minors = Zalloc(num_sps * sizeof (minor_t));
6257 	for (i = 0; i < num_sps; i++)
6258 		minors[i] = MD_SID(un_array[i]);
6259 
6260 	err = update_sp_status(sp, minors, num_sps, MD_SP_OK, mn_set, ep);
6261 	if (err != 0)
6262 		goto out;
6263 
6264 	if (options & MDCMD_PRINT)
6265 		(void) printf(dgettext(TEXT_DOMAIN, "%s: "
6266 		    "Soft Partitions recovered from device.\n"),
6267 		    compnp->cname);
6268 out:
6269 	/* free memory */
6270 	if (extlist != NULL)
6271 		meta_sp_list_free(&extlist);
6272 	if (sp_list != NULL)
6273 		meta_sp_list_free(&sp_list);
6274 	if (update_list != NULL)
6275 		meta_sp_list_free(&update_list);
6276 	if (un_array != NULL)	{
6277 		for (i = 0; i < num_sps; i++)
6278 			Free(un_array[i]);
6279 		Free(un_array);
6280 	}
6281 	if (minors != NULL)
6282 		Free(minors);
6283 	if (ownpar != NULL)
6284 		Free(ownpar);
6285 	(void) fflush(stdout);
6286 
6287 	if ((keynlp != NULL) && (committed != 1)) {
6288 		/*
6289 		 * if we haven't committed any softparts, either because of an
6290 		 * error or because the user decided not to proceed, delete
6291 		 * namelist key for the component
6292 		 */
6293 		if (mn_set) {
6294 			mdnamelist_t	*p;
6295 
6296 			for (p = keynlp; (p != NULL); p = p->next) {
6297 				mdname_t		*np = p->namep;
6298 				md_mn_msg_delkeyname_t	send_params;
6299 				md_mn_result_t		*resp = NULL;
6300 
6301 				send_params.delkeyname_dev = np->dev;
6302 				send_params.delkeyname_setno = sp->setno;
6303 				send_params.delkeyname_key = np->key;
6304 				(void) mdmn_send_message(sp->setno,
6305 				    MD_MN_MSG_DELKEYNAME, MD_MSGF_DEFAULT_FLAGS,
6306 				    (char *)&send_params, sizeof (send_params),
6307 				    &resp, ep);
6308 				if (resp != NULL) {
6309 					free_result(resp);
6310 				}
6311 			}
6312 		} else {
6313 			(void) del_key_names(sp, keynlp, NULL);
6314 		}
6315 	}
6316 
6317 	metafreenamelist(keynlp);
6318 
6319 	if (err)
6320 		return (mdmderror(ep, MDE_RECOVER_FAILED, 0, compnp->cname));
6321 
6322 	if (not_recovered)
6323 		if (options & MDCMD_PRINT)
6324 			(void) printf(dgettext(TEXT_DOMAIN, "%s: "
6325 			    "Soft Partitions NOT recovered from device.\n"),
6326 			    compnp->cname);
6327 	return (0);
6328 }
6329 
6330 /*
6331  * FUNCTION:	meta_sp_recover_from_unit()
6332  * INPUT:	sp	- name of set we are recovering in
6333  *		compnp	- name of component we are recovering from
6334  *		options	- metarecover options
6335  * OUTPUT:	ep	- return error pointer
6336  * RETURNS:	int	- 0 - success, -1 - error
6337  * PURPOSE:	update watermarks to match metadb records.  begin by getting
6338  *		a namelist representing all soft partitions on the specified
6339  *		component.  then, build an extlist representing the soft
6340  *		partitions, filling in the freespace extents.  notify user
6341  *		of changes, place all soft partitions into the "recovering"
6342  *		state and update the watermarks.  finally, return all soft
6343  *		partitions to the "OK" state.
6344  */
6345 static int
6346 meta_sp_recover_from_unit(
6347 	mdsetname_t	*sp,
6348 	mdname_t	*compnp,
6349 	mdcmdopts_t	options,
6350 	md_error_t	*ep
6351 )
6352 {
6353 	mdnamelist_t	*spnlp = NULL;
6354 	mdnamelist_t	*nlp = NULL;
6355 	sp_ext_node_t	*ext = NULL;
6356 	sp_ext_node_t	*extlist = NULL;
6357 	int		count;
6358 	char		yesno[255];
6359 	char		*yes;
6360 	int		rval = 0;
6361 	minor_t		*minors = NULL;
6362 	int		i;
6363 	md_sp_t		*msp;
6364 	md_set_desc	*sd;
6365 	bool_t		mn_set = 0;
6366 	daddr_t		start_block;
6367 
6368 	count = meta_sp_get_by_component(sp, compnp, &spnlp, 0, ep);
6369 	if (count <= 0)
6370 		return (-1);
6371 
6372 	/* set flag if dealing with a MN set */
6373 	if (!metaislocalset(sp)) {
6374 		if ((sd = metaget_setdesc(sp, ep)) == NULL) {
6375 			return (-1);
6376 		}
6377 		if (MD_MNSET_DESC(sd))
6378 			mn_set = 1;
6379 	}
6380 	/*
6381 	 * Save the XDR unit structure for one of the soft partitions;
6382 	 * we'll use this later to provide metadevice context to
6383 	 * update the watermarks so the device can be resolved by
6384 	 * devid instead of dev_t.
6385 	 */
6386 	if ((msp = meta_get_sp(sp, spnlp->namep, ep)) == NULL) {
6387 		metafreenamelist(spnlp);
6388 		return (-1);
6389 	}
6390 
6391 	if ((start_block = meta_sp_get_start(sp, compnp, ep)) ==
6392 	    MD_DISKADDR_ERROR) {
6393 		return (-1);
6394 	}
6395 
6396 	meta_sp_list_insert(NULL, NULL, &extlist, 0ULL, start_block,
6397 	    EXTTYP_RESERVED, 0, 0, meta_sp_cmp_by_offset);
6398 	meta_sp_list_insert(NULL, NULL, &extlist,
6399 	    metagetsize(compnp, ep) - MD_SP_WMSIZE, MD_SP_WMSIZE,
6400 	    EXTTYP_END, 0, EXTFLG_UPDATE, meta_sp_cmp_by_offset);
6401 
6402 	if (meta_sp_extlist_from_namelist(sp, spnlp, &extlist, ep) == -1) {
6403 		metafreenamelist(spnlp);
6404 		return (-1);
6405 	}
6406 
6407 	assert(extlist != NULL);
6408 	if ((options & MDCMD_VERBOSE) != 0) {
6409 		(void) printf(dgettext(TEXT_DOMAIN,
6410 		    "Updating extent headers on device %s from metadb.\n\n"),
6411 		    compnp->cname);
6412 		(void) printf(dgettext(TEXT_DOMAIN,
6413 		    "The following extent headers will be written:\n"));
6414 		meta_sp_display_exthdr();
6415 	}
6416 
6417 	meta_sp_list_freefill(&extlist, metagetsize(compnp, ep));
6418 
6419 	for (ext = extlist; ext != NULL; ext = ext->ext_next) {
6420 
6421 		/* mark every node for updating except the reserved space */
6422 		if (ext->ext_type != EXTTYP_RESERVED) {
6423 			ext->ext_flags |= EXTFLG_UPDATE;
6424 
6425 			/* print extent information */
6426 			if ((options & MDCMD_VERBOSE) != 0)
6427 				meta_sp_display_ext(ext);
6428 		}
6429 	}
6430 
6431 	/* request verification and then update all watermarks */
6432 	if ((options & MDCMD_DOIT) != 0) {
6433 
6434 		(void) printf(dgettext(TEXT_DOMAIN,
6435 		    "\nWARNING: You are about to overwrite portions of %s\n"
6436 		    "with soft partition metadata. The extent headers will be\n"
6437 		    "written to match the existing metadb configuration.  If\n"
6438 		    "the device was not previously setup with this\n"
6439 		    "configuration, data loss may result.\n\n"),
6440 		    compnp->cname);
6441 		(void) printf(dgettext(TEXT_DOMAIN,
6442 		    "Are you sure you want to do this (yes/no)? "));
6443 
6444 		(void) fflush(stdout);
6445 		if ((fgets(yesno, sizeof (yesno), stdin) == NULL) ||
6446 		    (strlen(yesno) == 1))
6447 			(void) snprintf(yesno, sizeof (yesno),
6448 			    "%s\n", dgettext(TEXT_DOMAIN, "no"));
6449 		yes = dgettext(TEXT_DOMAIN, "yes");
6450 		if (strncasecmp(yesno, yes, strlen(yesno) - 1) == 0) {
6451 			/* place soft partitions into recovering state */
6452 			minors = Zalloc(count * sizeof (minor_t));
6453 			for (nlp = spnlp, i = 0;
6454 			    nlp != NULL && i < count;
6455 			    nlp = nlp->next, i++) {
6456 				assert(nlp->namep != NULL);
6457 				minors[i] = meta_getminor(nlp->namep->dev);
6458 			}
6459 			if (update_sp_status(sp, minors, count,
6460 			    MD_SP_RECOVER, mn_set, ep) != 0) {
6461 				rval = -1;
6462 				goto out;
6463 			}
6464 
6465 			/* update the watermarks */
6466 			if (meta_sp_update_wm(sp, msp, extlist, ep) < 0) {
6467 				rval = -1;
6468 				goto out;
6469 			}
6470 
6471 			if (options & MDCMD_PRINT) {
6472 				(void) printf(dgettext(TEXT_DOMAIN, "%s: "
6473 				    "Soft Partitions recovered from metadb\n"),
6474 				    compnp->cname);
6475 			}
6476 
6477 			/* return soft partitions to the OK state */
6478 			if (update_sp_status(sp, minors, count,
6479 			    MD_SP_OK, mn_set, ep) != 0) {
6480 				rval = -1;
6481 				goto out;
6482 			}
6483 
6484 			rval = 0;
6485 			goto out;
6486 		}
6487 	}
6488 
6489 	if (options & MDCMD_PRINT) {
6490 		(void) printf(dgettext(TEXT_DOMAIN,
6491 		    "%s: Soft Partitions NOT recovered from metadb\n"),
6492 		    compnp->cname);
6493 	}
6494 
6495 out:
6496 	if (minors != NULL)
6497 		Free(minors);
6498 	metafreenamelist(spnlp);
6499 	meta_sp_list_free(&extlist);
6500 	(void) fflush(stdout);
6501 	return (rval);
6502 }
6503 
6504 
6505 /*
6506  * FUNCTION:	meta_sp_update_abr()
6507  * INPUT:	sp	- name of set we are recovering in
6508  * OUTPUT:	ep	- return error pointer
6509  * RETURNS:	int	- 0 - success, -1 - error
6510  * PURPOSE:	update the ABR state for all soft partitions in the set. This
6511  *		is called when joining a set. It sends a message to the master
6512  *		node for each soft partition to get the value of tstate and
6513  *		then sets ABR ,if required, by opening the sp, setting ABR
6514  *		and then closing the sp. This approach is taken rather that
6515  *		just issuing the MD_MN_SET_CAP ioctl, in order to deal with
6516  *		the case when we have another node simultaneously unsetting ABR.
6517  */
6518 int
6519 meta_sp_update_abr(
6520 	mdsetname_t	*sp,
6521 	md_error_t	*ep
6522 )
6523 {
6524 	mdnamelist_t	*devnlp = NULL;
6525 	mdnamelist_t	*p;
6526 	mdname_t	*devnp = NULL;
6527 	md_unit_t	*un;
6528 	char		fname[MAXPATHLEN];
6529 	int		mnum, fd;
6530 	volcap_t	vc;
6531 	uint_t		tstate;
6532 
6533 
6534 	if (meta_get_sp_names(sp, &devnlp, 0, ep) < 0) {
6535 		return (-1);
6536 	}
6537 
6538 	/* Exit if no soft partitions in this set */
6539 	if (devnlp == NULL)
6540 		return (0);
6541 
6542 	/* For each soft partition */
6543 	for (p = devnlp; (p != NULL); p = p->next) {
6544 		devnp = p->namep;
6545 
6546 		/* check if this is a top level metadevice */
6547 		if ((un = meta_get_mdunit(sp, devnp, ep)) == NULL)
6548 			goto out;
6549 		if (MD_HAS_PARENT(MD_PARENT(un))) {
6550 			Free(un);
6551 			continue;
6552 		}
6553 		Free(un);
6554 
6555 		/* Get tstate from Master */
6556 		if (meta_mn_send_get_tstate(devnp->dev, &tstate, ep) != 0) {
6557 			mdname_t	*np;
6558 			np = metamnumname(&sp, meta_getminor(devnp->dev), 0,
6559 			    ep);
6560 			if (np) {
6561 				md_perror(dgettext(TEXT_DOMAIN,
6562 				    "Unable to get tstate for %s"), np->cname);
6563 			}
6564 			continue;
6565 		}
6566 		/* If not set on the master, nothing to do */
6567 		if (!(tstate & MD_ABR_CAP))
6568 			continue;
6569 
6570 		mnum = meta_getminor(devnp->dev);
6571 		(void) snprintf(fname, MAXPATHLEN, "/dev/md/%s/rdsk/d%u",
6572 		    sp->setname, (unsigned)MD_MIN2UNIT(mnum));
6573 		if ((fd = open(fname, O_RDWR, 0)) < 0) {
6574 			md_perror(dgettext(TEXT_DOMAIN,
6575 			    "Could not open device %s"), fname);
6576 			continue;
6577 		}
6578 
6579 		/* Set ABR state */
6580 		vc.vc_info = 0;
6581 		vc.vc_set = 0;
6582 		if (ioctl(fd, DKIOCGETVOLCAP, &vc) < 0) {
6583 			(void) close(fd);
6584 			continue;
6585 		}
6586 
6587 		vc.vc_set = DKV_ABR_CAP;
6588 		if (ioctl(fd, DKIOCSETVOLCAP, &vc) < 0) {
6589 			(void) close(fd);
6590 			goto out;
6591 		}
6592 
6593 		(void) close(fd);
6594 	}
6595 	metafreenamelist(devnlp);
6596 	return (0);
6597 out:
6598 	metafreenamelist(devnlp);
6599 	return (-1);
6600 }
6601 
6602 /*
6603  * FUNCTION:	meta_mn_sp_update_abr()
6604  * INPUT:	arg	- Given set.
6605  * PURPOSE:	update the ABR state for all soft partitions in the set by
6606  *		forking a process to call meta_sp_update_abr()
6607  *		This function is only called via rpc.metad when adding a node
6608  *		to a set, ie this node is beong joined to the set by another
6609  *		node.
6610  */
6611 void *
6612 meta_mn_sp_update_abr(void *arg)
6613 {
6614 	set_t		setno = *((set_t *)arg);
6615 	mdsetname_t	*sp;
6616 	md_error_t	mde = mdnullerror;
6617 	int		fval;
6618 
6619 	/* should have a set */
6620 	assert(setno != NULL);
6621 
6622 	if ((sp = metasetnosetname(setno, &mde)) == NULL) {
6623 		mde_perror(&mde, "");
6624 		return (NULL);
6625 	}
6626 
6627 	if (!(meta_is_mn_set(sp, &mde))) {
6628 		mde_perror(&mde, "");
6629 		return (NULL);
6630 	}
6631 
6632 	/* fork a process */
6633 	if ((fval = md_daemonize(sp, &mde)) != 0) {
6634 		/*
6635 		 * md_daemonize will fork off a process.  The is the
6636 		 * parent or error.
6637 		 */
6638 		if (fval > 0) {
6639 			return (NULL);
6640 		}
6641 		mde_perror(&mde, "");
6642 		return (NULL);
6643 	}
6644 	/*
6645 	 * Child process should never return back to rpc.metad, but
6646 	 * should exit.
6647 	 * Flush all internally cached data inherited from parent process
6648 	 * since cached data will be cleared when parent process RPC request
6649 	 * has completed (which is possibly before this child process
6650 	 * can complete).
6651 	 * Child process can retrieve and cache its own copy of data from
6652 	 * rpc.metad that won't be changed by the parent process.
6653 	 *
6654 	 * Reset md_in_daemon since this child will be a client of rpc.metad
6655 	 * not part of the rpc.metad daemon itself.
6656 	 * md_in_daemon is used by rpc.metad so that libmeta can tell if
6657 	 * this thread is rpc.metad or any other thread.  (If this thread
6658 	 * was rpc.metad it could use some short circuit code to get data
6659 	 * directly from rpc.metad instead of doing an RPC call to rpc.metad).
6660 	 */
6661 	md_in_daemon = 0;
6662 	metaflushsetname(sp);
6663 	sr_cache_flush_setno(setno);
6664 	if ((sp = metasetnosetname(setno, &mde)) == NULL) {
6665 		mde_perror(&mde, "");
6666 		md_exit(sp, 1);
6667 	}
6668 
6669 
6670 	/*
6671 	 * Closing stdin/out/err here.
6672 	 */
6673 	(void) close(0);
6674 	(void) close(1);
6675 	(void) close(2);
6676 	assert(fval == 0);
6677 
6678 	(void) meta_sp_update_abr(sp, &mde);
6679 
6680 	md_exit(sp, 0);
6681 	/*NOTREACHED*/
6682 	return (NULL);
6683 }
6684 
6685 int
6686 meta_sp_check_component(
6687 	mdsetname_t	*sp,
6688 	mdname_t	*np,
6689 	md_error_t	*ep
6690 )
6691 {
6692 	md_sp_t	*msp;
6693 	minor_t	mnum = 0;
6694 	md_dev64_t	dev = 0;
6695 	mdnm_params_t	nm;
6696 	md_getdevs_params_t	mgd;
6697 	side_t	sideno;
6698 	char	*miscname;
6699 	md_dev64_t	*mydev = NULL;
6700 	char	*pname = NULL, *t;
6701 	char	*ctd_name = NULL;
6702 	char	*devname = NULL;
6703 	int	len;
6704 	int	rval = -1;
6705 
6706 	(void) memset(&nm, '\0', sizeof (nm));
6707 	if ((msp = meta_get_sp_common(sp, np, 0, ep)) == NULL)
6708 		return (-1);
6709 
6710 	if ((miscname = metagetmiscname(np, ep)) == NULL)
6711 		return (-1);
6712 
6713 	sideno = getmyside(sp, ep);
6714 
6715 	meta_sp_debug("meta_sp_check_component: %s is on %s key: %d"
6716 	    " dev: %llu\n",
6717 	    np->cname, msp->compnamep->cname, msp->compnamep->key,
6718 	    msp->compnamep->dev);
6719 
6720 	/*
6721 	 * Now get the data from the unit structure. The compnamep stuff
6722 	 * contains the data from the namespace and we need the un_dev
6723 	 * from the unit structure.
6724 	 */
6725 	(void) memset(&mgd, '\0', sizeof (mgd));
6726 	MD_SETDRIVERNAME(&mgd, miscname, sp->setno);
6727 	mgd.cnt = 1;		    /* sp's only have one subdevice */
6728 	mgd.mnum = meta_getminor(np->dev);
6729 
6730 	mydev = Zalloc(sizeof (*mydev));
6731 	mgd.devs = (uintptr_t)mydev;
6732 
6733 	if (metaioctl(MD_IOCGET_DEVS, &mgd, &mgd.mde, np->cname) != 0) {
6734 		meta_sp_debug("meta_sp_check_component: ioctl failed\n");
6735 		(void) mdstealerror(ep, &mgd.mde);
6736 		rval = 0;
6737 		goto out;
6738 	} else if (mgd.cnt <= 0) {
6739 		assert(mgd.cnt >= 0);
6740 		rval = 0;
6741 		goto out;
6742 	}
6743 
6744 	/* Get the devname from the name space. */
6745 	if ((devname = meta_getnmentbykey(sp->setno, sideno,
6746 	    msp->compnamep->key, NULL, &mnum, &dev, ep)) == NULL) {
6747 		meta_sp_debug("meta_sp_check_component: key %d not"
6748 		    "found\n", msp->compnamep->key);
6749 		goto out;
6750 	}
6751 
6752 	meta_sp_debug("dev %s from component: (%lu, %lu)\n",
6753 	    devname,
6754 	    meta_getmajor(*mydev),
6755 	    meta_getminor(*mydev));
6756 	meta_sp_debug("minor from the namespace: %lu\n", mnum);
6757 
6758 	if (mnum != meta_getminor(*mydev)) {
6759 		/*
6760 		 * The minor numbers are different. Update the namespace
6761 		 * with the information from the component.
6762 		 */
6763 
6764 		t = strrchr(devname, '/');
6765 		t++;
6766 		ctd_name = Strdup(t);
6767 
6768 		meta_sp_debug("meta_sp_check_component: ctd_name: %s\n",
6769 		    ctd_name);
6770 
6771 		len = strlen(devname);
6772 		t = strrchr(devname, '/');
6773 		t++;
6774 		pname = Zalloc((len - strlen(t)) + 1);
6775 		(void) strncpy(pname, devname, (len - strlen(t)));
6776 		meta_sp_debug("pathname: %s\n", pname);
6777 
6778 		meta_sp_debug("updating the minor number to %lu\n", nm.mnum);
6779 
6780 		if (meta_update_namespace(sp->setno, sideno,
6781 		    ctd_name, *mydev, msp->compnamep->key, pname,
6782 		    ep) != 0) {
6783 			goto out;
6784 		}
6785 	}
6786 out:
6787 	if (pname != NULL)
6788 		Free(pname);
6789 	if (ctd_name != NULL)
6790 		Free(ctd_name);
6791 	if (devname != NULL)
6792 		Free(devname);
6793 	if (mydev != NULL)
6794 		Free(mydev);
6795 	return (rval);
6796 }
6797