xref: /titanic_41/usr/src/lib/lvm/libmeta/common/meta_sp.c (revision 23a276b1252962c987a613be470dde26561247b8)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 /*
29  * Just in case we're not in a build environment, make sure that
30  * TEXT_DOMAIN gets set to something.
31  */
32 #if !defined(TEXT_DOMAIN)
33 #define	TEXT_DOMAIN "SYS_TEST"
34 #endif
35 
36 /*
37  * soft partition operations
38  *
39  * Soft Partitions provide a virtual disk mechanism which is used to
40  * divide a large volume into many small pieces, each appearing as a
41  * separate device.  A soft partition consists of a series of extents,
42  * each having an offset and a length.  The extents are logically
43  * contiguous, so where the first extent leaves off the second extent
44  * picks up.  Which extent a given "virtual offset" belongs to is
45  * dependent on the size of all the previous extents in the soft
46  * partition.
47  *
48  * Soft partitions are represented in memory by an extent node
49  * (sp_ext_node_t) which contains all of the information necessary to
50  * create a unit structure and update the on-disk format, called
51  * "watermarks".  These extent nodes are typically kept in a doubly
52  * linked list and are manipulated by list manipulation routines.  A
53  * list of extents may represent all of the soft partitions on a volume,
54  * a single soft partition, or perhaps just a set of extents that need
 * to be updated.  Extent lists may be sorted by offset or by name/seq#,
56  * depending on which compare function is used.  Most of the routines
57  * require the list be sorted by offset to work, and that's the typical
58  * configuration.
59  *
60  * In order to do an allocation, knowledge of all soft partitions on the
61  * volume is required.  Then free space is determined from the space
62  * that is not allocated, and new allocations can be made from the free
63  * space.  Once the new allocations are made, a unit structure is created
64  * and the watermarks are updated.  The status is then changed to "okay"
65  * on the unit structure to commit the transaction.  If updating the
66  * watermarks fails, the unit structure is in an intermediate state and
67  * the driver will not allow access to the device.
68  *
69  * A typical sequence of events is:
70  *     1. Fetch the list of names for all soft partitions on a volume
71  *         meta_sp_get_by_component()
72  *     2. Construct an extent list from the name list
73  *         meta_sp_extlist_from_namelist()
74  *     3. Fill the gaps in the extent list with free extents
75  *         meta_sp_list_freefill()
76  *     4. Allocate from the free extents
77  *         meta_sp_alloc_by_len()
78  *         meta_sp_alloc_by_list()
79  *     5. Create the unit structure from the extent list
80  *         meta_sp_createunit()
81  *         meta_sp_updateunit()
82  *     6. Write out the watermarks
83  *         meta_sp_update_wm()
84  *     7. Set the status to "Okay"
85  *         meta_sp_setstatus()
86  *
87  */
88 
89 #include <stdio.h>
90 #include <meta.h>
91 #include "meta_repartition.h"
92 #include <sys/lvm/md_sp.h>
93 #include <sys/lvm/md_crc.h>
94 #include <strings.h>
95 #include <sys/lvm/md_mirror.h>
96 #include <sys/bitmap.h>
97 
/* Flag owned by another translation unit; only declared here. */
extern int	md_in_daemon;
99 
/*
 * In-memory description of a single extent.  Nodes are chained into a
 * doubly linked list (see meta_sp_list_insert()) whose ordering is set
 * by the compare function supplied at insert time.  The name and set
 * pointers are stored as given by the caller; meta_sp_list_free() only
 * releases the nodes themselves.
 */
typedef struct sp_ext_node {
	struct sp_ext_node	*ext_next;	/* next element, NULL at tail */
	struct sp_ext_node	*ext_prev;	/* previous element, NULL at head */
	sp_ext_type_t		ext_type;	/* type of extent */
	sp_ext_offset_t		ext_offset;	/* starting offset, in sectors */
	sp_ext_length_t		ext_length;	/* length of this node, in sectors */
	uint_t			ext_flags;	/* extent flags (EXTFLG_*) */
	uint32_t		ext_seq;	/* watermark seq no */
	mdname_t		*ext_namep;	/* name pointer (may be NULL) */
	mdsetname_t		*ext_setp;	/* set pointer */
} sp_ext_node_t;
111 
/* extent flags, stored in sp_ext_node_t.ext_flags */
#define	EXTFLG_UPDATE	(1)	/* extent needs to be updated */

/*
 * Extent node compare function for list sorting.  Returns <0 when the
 * first node sorts before the second, >0 when it sorts after, and 0 when
 * the two are considered equal (see meta_sp_cmp_by_offset() and
 * meta_sp_cmp_by_nameseq()).
 */
typedef int (*ext_cmpfunc_t)(sp_ext_node_t *, sp_ext_node_t *);
117 
118 
/*
 * Function Prototypes
 *
 * Forward declarations for the routines in this file, grouped by the
 * area they serve.  Everything here is file-local (static) except
 * meta_sp_parsesize(), which is declared without static.
 */

/* Debugging Functions */
static void meta_sp_debug(char *format, ...);
static void meta_sp_printunit(mp_unit_t *mp);

/* Misc Support Functions */
int meta_sp_parsesize(char *s, sp_ext_length_t *szp);
static int meta_sp_parsesizestring(char *s, sp_ext_length_t *szp);
static int meta_sp_setgeom(mdname_t *np, mdname_t *compnp, mp_unit_t *mp,
	md_error_t *ep);
static int meta_sp_get_by_component(mdsetname_t *sp, mdname_t *compnp,
    mdnamelist_t **nlpp, int force, md_error_t *ep);
static sp_ext_length_t meta_sp_get_default_alignment(mdsetname_t *sp,
    mdname_t *compnp, md_error_t *ep);

/* Extent List Manipulation Functions */
static int meta_sp_cmp_by_nameseq(sp_ext_node_t *e1, sp_ext_node_t *e2);
static int meta_sp_cmp_by_offset(sp_ext_node_t *e1, sp_ext_node_t *e2);
static void meta_sp_list_insert(mdsetname_t *sp, mdname_t *np,
    sp_ext_node_t **head, sp_ext_offset_t offset, sp_ext_length_t length,
    sp_ext_type_t type, uint_t seq, uint_t flags, ext_cmpfunc_t compare);
static void meta_sp_list_free(sp_ext_node_t **head);
static void meta_sp_list_remove(sp_ext_node_t **head, sp_ext_node_t *ext);
static sp_ext_length_t meta_sp_list_size(sp_ext_node_t *head,
    sp_ext_type_t exttype, int exclude_wm);
static sp_ext_node_t *meta_sp_list_find(sp_ext_node_t *head,
    sp_ext_offset_t offset);
static void meta_sp_list_freefill(sp_ext_node_t **extlist,
    sp_ext_length_t size);
static void meta_sp_list_dump(sp_ext_node_t *head);
static int meta_sp_list_overlaps(sp_ext_node_t *head);

/* Extent List Query Functions */
static boolean_t meta_sp_enough_space(int desired_number_of_sps,
	blkcnt_t desired_sp_size, sp_ext_node_t **extent_listpp,
	sp_ext_length_t alignment);
static boolean_t meta_sp_get_extent_list(mdsetname_t *mdsetnamep,
	mdname_t *device_mdnamep, sp_ext_node_t **extent_listpp,
	md_error_t *ep);
static boolean_t meta_sp_get_extent_list_for_drive(mdsetname_t *mdsetnamep,
	mddrivename_t *mddrivenamep, sp_ext_node_t **extent_listpp);


/* Extent Allocation Functions */
static void meta_sp_alloc_by_ext(mdsetname_t *sp, mdname_t *np,
    sp_ext_node_t **extlist, sp_ext_node_t *free_ext,
    sp_ext_offset_t alloc_offset, sp_ext_length_t alloc_length, uint_t seq);
static int meta_sp_alloc_by_len(mdsetname_t *sp, mdname_t *np,
    sp_ext_node_t **extlist, sp_ext_length_t *lp,
    sp_ext_offset_t last_off, sp_ext_length_t alignment);
static int meta_sp_alloc_by_list(mdsetname_t *sp, mdname_t *np,
    sp_ext_node_t **extlist, sp_ext_node_t *oblist);

/* Extent List Population Functions */
static int meta_sp_extlist_from_namelist(mdsetname_t *sp, mdnamelist_t *spnlp,
    sp_ext_node_t **extlist, md_error_t *ep);
static int meta_sp_extlist_from_wm(mdsetname_t *sp, mdname_t *compnp,
    sp_ext_node_t **extlist, ext_cmpfunc_t compare, md_error_t *ep);

/* Print (metastat) Functions */
static int meta_sp_short_print(md_sp_t *msp, char *fname, FILE *fp,
    mdprtopts_t options, md_error_t *ep);
static char *meta_sp_status_to_name(xsp_status_t xsp_status, uint_t tstate);
static int meta_sp_report(mdsetname_t *sp, md_sp_t *msp, mdnamelist_t **nlpp,
    char *fname, FILE *fp, mdprtopts_t options, md_error_t *ep);

/* Watermark Manipulation Functions */
static int meta_sp_update_wm(mdsetname_t *sp, md_sp_t *msp,
    sp_ext_node_t *extlist, md_error_t *ep);
static int meta_sp_clear_wm(mdsetname_t *sp, md_sp_t *msp, md_error_t *ep);
static int meta_sp_read_wm(mdsetname_t *sp, mdname_t *compnp,
    mp_watermark_t *wm, sp_ext_offset_t offset,  md_error_t *ep);
static diskaddr_t meta_sp_get_start(mdsetname_t *sp, mdname_t *compnp,
    md_error_t *ep);

/* Unit Structure Manipulation Functions */
static void meta_sp_fillextarray(mp_unit_t *mp, sp_ext_node_t *extlist);
static mp_unit_t *meta_sp_createunit(mdname_t *np, mdname_t *compnp,
    sp_ext_node_t *extlist, int numexts, sp_ext_length_t len,
    sp_status_t status, md_error_t *ep);
static mp_unit_t *meta_sp_updateunit(mdname_t *np,  mp_unit_t *old_un,
    sp_ext_node_t *extlist, sp_ext_length_t grow_len, int numexts,
    md_error_t *ep);
static int meta_create_sp(mdsetname_t *sp, md_sp_t *msp, sp_ext_node_t *oblist,
    mdcmdopts_t options, sp_ext_length_t alignment, md_error_t *ep);
static int meta_check_sp(mdsetname_t *sp, md_sp_t *msp, mdcmdopts_t options,
    int *repart_options, md_error_t *ep);

/* Reset (metaclear) Functions */
static int meta_sp_reset_common(mdsetname_t *sp, mdname_t *np, md_sp_t *msp,
    md_sp_reset_t reset_params, mdcmdopts_t options, md_error_t *ep);

/* Recovery (metarecover) Functions */
static void meta_sp_display_exthdr(void);
static void meta_sp_display_ext(sp_ext_node_t *ext);
static int meta_sp_checkseq(sp_ext_node_t *extlist);
static int meta_sp_resolve_name_conflict(mdsetname_t *, mdname_t *,
    mdname_t **, md_error_t *);
static int meta_sp_validate_wm(mdsetname_t *sp, mdname_t *np,
    mdcmdopts_t options, md_error_t *ep);
static int meta_sp_validate_unit(mdsetname_t *sp, mdname_t *compnp,
    mdcmdopts_t options, md_error_t *ep);
static int meta_sp_validate_wm_and_unit(mdsetname_t *sp, mdname_t *np,
    mdcmdopts_t options, md_error_t *ep);
static int meta_sp_validate_exts(mdname_t *np, sp_ext_node_t *wmext,
    sp_ext_node_t *unitext, md_error_t *ep);
static int meta_sp_recover_from_wm(mdsetname_t *sp, mdname_t *compnp,
    mdcmdopts_t options, md_error_t *ep);
static int meta_sp_recover_from_unit(mdsetname_t *sp, mdname_t *np,
    mdcmdopts_t options, md_error_t *ep);
230 
/*
 * Private Constants
 *
 * Named stand-ins for literal arguments, so that call sites read as
 * intent rather than as bare 0/1/NULL values.
 */

/* "force" value asking meta_sp_get_by_component() to rebuild its cache */
static const int FORCE_RELOAD_CACHE = 1;
static const uint_t NO_FLAGS = 0;
static const sp_ext_offset_t NO_OFFSET = 0ULL;
static const uint_t NO_SEQUENCE_NUMBER = 0;
static const int ONE_SOFT_PARTITION = 1;

/*
 * Bitmap sized from MD_MAXUNITS via BT_BITOUL(); presumably one bit per
 * unit recording whether a parent was already printed -- TODO confirm
 * against the users of this array, which are not visible here.
 */
static unsigned long sp_parent_printed[BT_BITOUL(MD_MAXUNITS)];

#define	TEST_SOFT_PARTITION_NAMEP NULL
#define	TEST_SETNAMEP NULL

/* presumably the values for meta_sp_list_size()'s exclude_wm argument */
#define	EXCLUDE_WM	(1)
#define	INCLUDE_WM	(0)

/* alignment value meaning "no default alignment" */
#define	SP_UNALIGNED	(0LL)
250 
251 /*
252  * **************************************************************************
253  *                          Debugging Functions                             *
254  * **************************************************************************
255  */
256 
257 /*PRINTFLIKE1*/
258 static void
259 meta_sp_debug(char *format, ...)
260 {
261 	static int debug;
262 	static int debug_set = 0;
263 	va_list ap;
264 
265 	if (!debug_set) {
266 		debug = getenv(META_SP_DEBUG) ? 1 : 0;
267 		debug_set = 1;
268 	}
269 
270 	if (debug) {
271 		va_start(ap, format);
272 		(void) vfprintf(stderr, format, ap);
273 		va_end(ap);
274 	}
275 }
276 
277 static void
278 meta_sp_printunit(mp_unit_t *mp)
279 {
280 	int i;
281 
282 	if (mp == NULL)
283 		return;
284 
285 	/* print the common fields we know about */
286 	(void) fprintf(stderr, "\tmp->c.un_type: %d\n", mp->c.un_type);
287 	(void) fprintf(stderr, "\tmp->c.un_size: %u\n", mp->c.un_size);
288 	(void) fprintf(stderr, "\tmp->c.un_self_id: %lu\n", MD_SID(mp));
289 
290 	/* sp-specific fields */
291 	(void) fprintf(stderr, "\tmp->un_status: %u\n", mp->un_status);
292 	(void) fprintf(stderr, "\tmp->un_numexts: %u\n", mp->un_numexts);
293 	(void) fprintf(stderr, "\tmp->un_length: %llu\n", mp->un_length);
294 	(void) fprintf(stderr, "\tmp->un_dev(32): 0x%llx\n", mp->un_dev);
295 	(void) fprintf(stderr, "\tmp->un_dev(64): 0x%llx\n", mp->un_dev);
296 	(void) fprintf(stderr, "\tmp->un_key: %d\n", mp->un_key);
297 
298 	/* print extent information */
299 	(void) fprintf(stderr, "\tExt#\tvoff\t\tpoff\t\tLen\n");
300 	for (i = 0; i < mp->un_numexts; i++) {
301 		(void) fprintf(stderr, "\t%d\t%llu\t\t%llu\t\t%llu\n", i,
302 		    mp->un_ext[i].un_voff, mp->un_ext[i].un_poff,
303 		    mp->un_ext[i].un_len);
304 	}
305 }
306 
307 /*
308  * FUNCTION:    meta_sp_parsesize()
309  * INPUT:       s       - the string to parse
310  * OUTPUT:      *szp    - disk block count (0 for "all")
311  * RETURNS:     -1 for error, 0 for success
312  * PURPOSE:     parses the command line parameter that specifies the
313  *              requested size of a soft partition.  The input string
314  *              is either the literal "all" or a numeric value
315  *              followed by a single character, b for disk blocks, k
316  *              for kilobytes, m for megabytes, g for gigabytes, or t
317  *              for terabytes.  p for petabytes and e for exabytes
318  *              have been added as undocumented features for future
319  *              expansion.  For example, 100m is 100 megabytes, while
320  *              50g is 50 gigabytes.  All values are rounded up to the
321  *              nearest block size.
322  */
323 int
324 meta_sp_parsesize(char *s, sp_ext_length_t *szp)
325 {
326 	if (s == NULL || szp == NULL) {
327 		return (-1);
328 	}
329 
330 	/* Check for literal "all" */
331 	if (strcasecmp(s, "all") == 0) {
332 		*szp = 0;
333 		return (0);
334 	}
335 
336 	return (meta_sp_parsesizestring(s, szp));
337 }
338 
339 /*
340  * FUNCTION:	meta_sp_parsesizestring()
341  * INPUT:	s	- the string to parse
342  * OUTPUT:	*szp	- disk block count
343  * RETURNS:	-1 for error, 0 for success
344  * PURPOSE:	parses a string that specifies size. The input string is a
345  *		numeric value followed by a single character, b for disk blocks,
346  *		k for kilobytes, m for megabytes, g for gigabytes, or t for
347  *		terabytes.  p for petabytes and e for exabytes have been added
348  *		as undocumented features for future expansion.  For example,
349  *		100m is 100 megabytes, while 50g is 50 gigabytes.  All values
350  *		are rounded up to the nearest block size.
351  */
352 static int
353 meta_sp_parsesizestring(char *s, sp_ext_length_t *szp)
354 {
355 	sp_ext_length_t	len = 0;
356 	char		len_type[2];
357 
358 	if (s == NULL || szp == NULL) {
359 		return (-1);
360 	}
361 
362 	/*
363 	 * make sure block offset does not overflow 2^64 bytes.
364 	 */
365 	if ((sscanf(s, "%llu%1[BbKkMmGgTt]", &len, len_type) != 2) ||
366 	    (len == 0LL) ||
367 	    (len > (1LL << (64 - DEV_BSHIFT))))
368 		return (-1);
369 
370 	switch (len_type[0]) {
371 	case 'B':
372 	case 'b':
373 		len = lbtodb(roundup(len * DEV_BSIZE, DEV_BSIZE));
374 		break;
375 	case 'K':
376 	case 'k':
377 		len = lbtodb(roundup(len * 1024ULL, DEV_BSIZE));
378 		break;
379 	case 'M':
380 	case 'm':
381 		len = lbtodb(roundup(len * 1024ULL*1024ULL, DEV_BSIZE));
382 		break;
383 	case 'g':
384 	case 'G':
385 		len = lbtodb(roundup(len * 1024ULL*1024ULL*1024ULL, DEV_BSIZE));
386 		break;
387 	case 't':
388 	case 'T':
389 		len = lbtodb(roundup(len * 1024ULL*1024ULL*1024ULL*1024ULL,
390 		    DEV_BSIZE));
391 		break;
392 	case 'p':
393 	case 'P':
394 		len = lbtodb(roundup(
395 		    len * 1024ULL*1024ULL*1024ULL*1024ULL*1024ULL,
396 		    DEV_BSIZE));
397 		break;
398 	case 'e':
399 	case 'E':
400 		len = lbtodb(roundup(
401 		    len * 1024ULL*1024ULL*1024ULL*1024ULL*1024ULL*1024ULL,
402 		    DEV_BSIZE));
403 		break;
404 	default:
405 		/* error */
406 		return (-1);
407 	}
408 
409 	*szp = len;
410 	return (0);
411 }
412 
413 /*
414  * FUNCTION:	meta_sp_setgeom()
415  * INPUT:	np      - the underlying device to setup geometry for
416  *		compnp	- the underlying device to setup geometry for
417  *		mp	- the unit structure to set the geometry for
418  * OUTPUT:	ep	- return error pointer
419  * RETURNS:	int	- -1 if error, 0 otherwise
420  * PURPOSE:	establishes geometry information for a device
421  */
422 static int
423 meta_sp_setgeom(
424 	mdname_t	*np,
425 	mdname_t	*compnp,
426 	mp_unit_t	*mp,
427 	md_error_t	*ep
428 )
429 {
430 	mdgeom_t	*geomp;
431 	uint_t		round_cyl = 0;
432 
433 	if ((geomp = metagetgeom(compnp, ep)) == NULL)
434 		return (-1);
435 	if (meta_setup_geom((md_unit_t *)mp, np, geomp, geomp->write_reinstruct,
436 	    geomp->read_reinstruct, round_cyl, ep) != 0)
437 		return (-1);
438 
439 	return (0);
440 }
441 
442 /*
443  * FUNCTION:	meta_sp_setstatus()
444  * INPUT:	sp	- the set name for the devices to set the status on
445  *		minors	- an array of minor numbers of devices to set status on
446  *		num_units - number of entries in the array
447  *		status	- status value to set all units to
448  * OUTPUT:	ep	- return error pointer
449  * RETURNS:	int	- -1 if error, 0 success
450  * PURPOSE:	sets the status of one or more soft partitions to the
451  *		requested value
452  */
453 int
454 meta_sp_setstatus(
455 	mdsetname_t	*sp,
456 	minor_t		*minors,
457 	int		num_units,
458 	sp_status_t	status,
459 	md_error_t	*ep
460 )
461 {
462 	md_sp_statusset_t	status_params;
463 
464 	assert(minors != NULL);
465 
466 	/* update status of all soft partitions to the status passed in */
467 	(void) memset(&status_params, 0, sizeof (status_params));
468 	status_params.num_units = num_units;
469 	status_params.new_status = status;
470 	status_params.size = num_units * sizeof (minor_t);
471 	status_params.minors = (uintptr_t)minors;
472 	MD_SETDRIVERNAME(&status_params, MD_SP, sp->setno);
473 	if (metaioctl(MD_IOC_SPSTATUS, &status_params, &status_params.mde,
474 	    NULL) != 0) {
475 		(void) mdstealerror(ep, &status_params.mde);
476 		return (-1);
477 	}
478 	return (0);
479 }
480 
481 /*
482  * FUNCTION:	meta_get_sp_names()
483  * INPUT:	sp	- the set name to get soft partitions from
484  *		options	- options from the command line
485  * OUTPUT:	nlpp	- list of all soft partition names
486  *		ep	- return error pointer
487  * RETURNS:	int	- -1 if error, 0 success
488  * PURPOSE:	returns a list of all soft partitions in the metadb
489  *		for all devices in the specified set
490  */
491 int
492 meta_get_sp_names(
493 	mdsetname_t	*sp,
494 	mdnamelist_t	**nlpp,
495 	int		options,
496 	md_error_t	*ep
497 )
498 {
499 	return (meta_get_names(MD_SP, sp, nlpp, options, ep));
500 }
501 
502 /*
503  * FUNCTION:	meta_get_by_component()
504  * INPUT:	sp	- the set name to get soft partitions from
505  *		compnp	- the name of the device containing the soft
506  *			  partitions that will be returned
507  *		force	- 0 - reads cached namelist if available,
508  *			  1 - reloads cached namelist, frees old namelist
509  * OUTPUT:	nlpp	- list of all soft partition names
510  *		ep	- return error pointer
511  * RETURNS:	int	- -1 error, otherwise the number of soft partitions
512  *			  found on the component (0 = none found).
513  * PURPOSE:	returns a list of all soft partitions on a given device
514  *		from the metadb information
515  */
516 static int
517 meta_sp_get_by_component(
518 	mdsetname_t	*sp,
519 	mdname_t	*compnp,
520 	mdnamelist_t	**nlpp,
521 	int		force,
522 	md_error_t	*ep
523 )
524 {
525 	static mdnamelist_t	*cached_list = NULL;	/* cached namelist */
526 	static int		cached_count = 0;	/* cached count */
527 	mdnamelist_t		*spnlp = NULL;		/* all sp names */
528 	mdnamelist_t		*namep;			/* list iterator */
529 	mdnamelist_t		**tailpp = nlpp;	/* namelist tail */
530 	mdnamelist_t		**cachetailpp;		/* cache tail */
531 	md_sp_t			*msp;			/* unit structure */
532 	int			count = 0;		/* count of sp's */
533 	int			err;
534 	mdname_t		*curnp;
535 
536 	if ((cached_list != NULL) && (!force)) {
537 		/* return a copy of the cached list */
538 		for (namep = cached_list; namep != NULL; namep = namep->next)
539 			tailpp = meta_namelist_append_wrapper(tailpp,
540 			    namep->namep);
541 		return (cached_count);
542 	}
543 
544 	/* free the cache and reset values to zeros to prepare for a new list */
545 	metafreenamelist(cached_list);
546 	cached_count = 0;
547 	cached_list = NULL;
548 	cachetailpp = &cached_list;
549 	*nlpp = NULL;
550 
551 	/* get all the softpartitions first of all */
552 	if (meta_get_sp_names(sp, &spnlp, 0, ep) < 0)
553 		return (-1);
554 
555 	/*
556 	 * Now for each sp, see if it resides on the component we
557 	 * are interested in, if so then add it to our list
558 	 */
559 	for (namep = spnlp; namep != NULL; namep = namep->next) {
560 		curnp = namep->namep;
561 
562 		/* get the unit structure */
563 		if ((msp = meta_get_sp_common(sp, curnp, 0, ep)) == NULL)
564 			continue;
565 
566 		/*
567 		 * If the current soft partition is not on the same
568 		 * component, continue the search.  If it is on the same
569 		 * component, add it to our namelist.
570 		 */
571 		err = meta_check_samedrive(compnp, msp->compnamep, ep);
572 		if (err <= 0) {
573 			/* not on the same device, check the next one */
574 			continue;
575 		}
576 
577 		/* it's on the same drive */
578 
579 		/*
580 		 * Check for overlapping partitions if the component is not
581 		 * a metadevice.
582 		 */
583 		if (!metaismeta(msp->compnamep)) {
584 			/*
585 			 * if they're on the same drive, neither
586 			 * should be a metadevice if one isn't
587 			 */
588 			assert(!metaismeta(compnp));
589 
590 			if (meta_check_overlap(msp->compnamep->cname,
591 			    compnp, 0, -1, msp->compnamep, 0, -1, ep) == 0)
592 				continue;
593 
594 			/* in this case it's not an error for them to overlap */
595 			mdclrerror(ep);
596 		}
597 
598 		/* Component is on the same device, add to the used list */
599 		tailpp = meta_namelist_append_wrapper(tailpp, curnp);
600 		cachetailpp = meta_namelist_append_wrapper(cachetailpp,
601 		    curnp);
602 
603 		++count;
604 		++cached_count;
605 	}
606 
607 	assert(count == cached_count);
608 	return (count);
609 
610 out:
611 	metafreenamelist(*nlpp);
612 	*nlpp = NULL;
613 	return (-1);
614 }
615 
616 /*
617  * FUNCTION:    meta_sp_get_default_alignment()
618  * INPUT:       sp      - the pertinent set name
619  *              compnp  - the name of the underlying component
620  * OUTPUT:      ep      - return error pointer
621  * RETURNS:     sp_ext_length_t =0: no default alignment
622  *                              >0: default alignment
623  * PURPOSE:     returns the default alignment for soft partitions to
624  *              be built on top of the specified component or
625  *              metadevice
626  */
627 static sp_ext_length_t
628 meta_sp_get_default_alignment(
629 	mdsetname_t	*sp,
630 	mdname_t	*compnp,
631 	md_error_t	*ep
632 )
633 {
634 	sp_ext_length_t	a = SP_UNALIGNED;
635 	char		*mname;
636 
637 	assert(compnp != NULL);
638 
639 	/*
640 	 * We treat raw devices as opaque, and assume nothing about
641 	 * their alignment requirements.
642 	 */
643 	if (!metaismeta(compnp))
644 		return (SP_UNALIGNED);
645 
646 	/*
647 	 * We already know it's a metadevice from the previous test;
648 	 * metagetmiscname() will tell us which metadevice type we
649 	 * have
650 	 */
651 	mname = metagetmiscname(compnp, ep);
652 	if (mname == NULL)
653 		goto out;
654 
655 	/*
656 	 * For a mirror, we want to deal with the stripe that is the
657 	 * primary side.  If it happens to be asymmetrically
658 	 * configured, there is no simple way to fake a universal
659 	 * alignment.  There's a chance that the least common
660 	 * denominator of the set of interlaces from all stripes of
661 	 * all submirrors would do it, but nobody that really cared
662 	 * that much about this issue would create an asymmetric
663 	 * config to start with.
664 	 *
665 	 * If the component underlying the soft partition is a mirror,
666 	 * then at the exit of this loop, compnp will have been
667 	 * updated to describe the first active submirror.
668 	 */
669 	if (strcmp(mname, MD_MIRROR) == 0) {
670 		md_mirror_t	*mp;
671 		int		smi;
672 		md_submirror_t	*smp;
673 
674 		mp = meta_get_mirror(sp, compnp, ep);
675 		if (mp == NULL)
676 			goto out;
677 
678 		for (smi = 0; smi < NMIRROR; smi++) {
679 
680 			smp = &mp->submirrors[smi];
681 			if (smp->state == SMS_UNUSED)
682 				continue;
683 
684 			compnp = smp->submirnamep;
685 			assert(compnp != NULL);
686 
687 			mname = metagetmiscname(compnp, ep);
688 			if (mname == NULL)
689 				goto out;
690 
691 			break;
692 		}
693 
694 		if (smi == NMIRROR)
695 			goto out;
696 	}
697 
698 	/*
699 	 * Handle stripes and submirrors identically; just return the
700 	 * interlace of the first row.
701 	 */
702 	if (strcmp(mname, MD_STRIPE) == 0) {
703 		md_stripe_t	*stp;
704 
705 		stp = meta_get_stripe(sp, compnp, ep);
706 		if (stp == NULL)
707 			goto out;
708 
709 		a = stp->rows.rows_val[0].interlace;
710 		goto out;
711 	}
712 
713 	/*
714 	 * Raid is even more straightforward; the interlace applies to
715 	 * the entire device.
716 	 */
717 	if (strcmp(mname, MD_RAID) == 0) {
718 		md_raid_t	*rp;
719 
720 		rp = meta_get_raid(sp, compnp, ep);
721 		if (rp == NULL)
722 			goto out;
723 
724 		a = rp->interlace;
725 		goto out;
726 	}
727 
728 	/*
729 	 * If we have arrived here with the alignment still not set,
730 	 * then we expect the error to have been set by one of the
731 	 * routines we called.  If neither is the case, something has
732 	 * really gone wrong above.  (Probably the submirror walk
733 	 * failed to produce a valid submirror, but that would be
734 	 * really bad...)
735 	 */
736 out:
737 	meta_sp_debug("meta_sp_get_default_alignment: miscname %s, "
738 	    "alignment %lld\n", (mname == NULL) ? "NULL" : mname, a);
739 
740 	if (getenv(META_SP_DEBUG) && !mdisok(ep)) {
741 		mde_perror(ep, NULL);
742 	}
743 
744 	assert((a > 0) || (!mdisok(ep)));
745 
746 	return (a);
747 }
748 
749 
750 
751 /*
752  * FUNCTION:	meta_check_insp()
753  * INPUT:	sp	- the set name for the device to check
754  *		np	- the name of the device to check
755  *		slblk	- the starting offset of the device to check
756  *		nblks	- the number of blocks in the device to check
757  * OUTPUT:	ep	- return error pointer
758  * RETURNS:	int	-  0 - device contains soft partitions
759  *			  -1 - device does not contain soft partitions
760  * PURPOSE:	determines whether a device contains any soft partitions
761  */
762 /* ARGSUSED */
763 int
764 meta_check_insp(
765 	mdsetname_t	*sp,
766 	mdname_t	*np,
767 	diskaddr_t	slblk,
768 	diskaddr_t	nblks,
769 	md_error_t	*ep
770 )
771 {
772 	mdnamelist_t	*spnlp = NULL;	/* soft partition name list */
773 	int		count;
774 	int		rval;
775 
776 	/* check set pointer */
777 	assert(sp != NULL);
778 
779 	/*
780 	 * Get a list of the soft partitions that currently reside on
781 	 * the component.  We should ALWAYS force reload the cache,
782 	 * because if we're using the md.tab, we must rebuild
783 	 * the list because it won't contain the previous (if any)
784 	 * soft partition.
785 	 */
786 	/* find all soft partitions on the component */
787 	count = meta_sp_get_by_component(sp, np, &spnlp, 1, ep);
788 
789 	if (count == -1) {
790 		rval = -1;
791 	} else if (count > 0) {
792 		rval = mduseerror(ep, MDE_ALREADY, np->dev,
793 		    spnlp->namep->cname, np->cname);
794 	} else {
795 		rval = 0;
796 	}
797 
798 	metafreenamelist(spnlp);
799 	return (rval);
800 }
801 
802 /*
803  * **************************************************************************
804  *                    Extent List Manipulation Functions                    *
805  * **************************************************************************
806  */
807 
808 /*
809  * FUNCTION:	meta_sp_cmp_by_nameseq()
810  * INPUT:	e1	- first node to compare
811  *		e2	- second node to compare
812  * OUTPUT:	none
813  * RETURNS:	int	- =0 - nodes are equal
814  *			  <0 - e1 should go before e2
815  *			  >0 - e1 should go after e2
816  * PURPOSE:	used for sorted list inserts to build a list sorted by
817  *		name first and sequence number second.
818  */
819 static int
820 meta_sp_cmp_by_nameseq(sp_ext_node_t *e1, sp_ext_node_t *e2)
821 {
822 	int rval;
823 
824 	if (e1->ext_namep == NULL)
825 		return (1);
826 	if (e2->ext_namep == NULL)
827 		return (-1);
828 	if ((rval = strcmp(e1->ext_namep->cname, e2->ext_namep->cname)) != 0)
829 		return (rval);
830 
831 	/* the names are equal, compare sequence numbers */
832 	if (e1->ext_seq > e2->ext_seq)
833 		return (1);
834 	if (e1->ext_seq < e2->ext_seq)
835 		return (-1);
836 	/* sequence numbers are also equal */
837 	return (0);
838 }
839 
840 /*
841  * FUNCTION:	meta_sp_cmp_by_offset()
842  * INPUT:	e1	- first node to compare
843  *		e2	- second node to compare
844  * OUTPUT:	none
845  * RETURNS:	int	- =0 - nodes are equal
846  *			  <0 - e1 should go before e2
847  *			  >0 - e1 should go after e2
848  * PURPOSE:	used for sorted list inserts to build a list sorted by offset
849  */
850 static int
851 meta_sp_cmp_by_offset(sp_ext_node_t *e1, sp_ext_node_t *e2)
852 {
853 	if (e1->ext_offset > e2->ext_offset)
854 		return (1);
855 	if (e1->ext_offset < e2->ext_offset)
856 		return (-1);
857 	/* offsets are equal */
858 	return (0);
859 }
860 
861 /*
862  * FUNCTION:	meta_sp_list_insert()
863  * INPUT:	sp	- the set name for the device the node belongs to
864  *		np	- the name of the device the node belongs to
865  *		head	- the head of the list, must be NULL for empty list
866  *		offset	- the physical offset of this extent in sectors
867  *		length	- the length of this extent in sectors
868  *		type	- the type of the extent being inserted
869  *		seq	- the sequence number of the extent being inserted
870  *		flags	- extent flags (eg. whether it needs to be updated)
871  *		compare	- the compare function to use
872  * OUTPUT:	head	- points to the new head if a node was inserted
873  *			  at the beginning
874  * RETURNS:	void
875  * PURPOSE:	inserts an extent node into a sorted doubly linked list.
876  *		The sort order is determined by the compare function.
877  *		Memory is allocated for the node in this function and it
878  *		is up to the caller to free it, possibly using
879  *		meta_sp_list_free().  If a node is inserted at the
880  *		beginning of the list, the head pointer is updated to
881  *		point to the new first node.
882  */
883 static void
884 meta_sp_list_insert(
885 	mdsetname_t	*sp,
886 	mdname_t	*np,
887 	sp_ext_node_t	**head,
888 	sp_ext_offset_t	offset,
889 	sp_ext_length_t	length,
890 	sp_ext_type_t	type,
891 	uint_t		seq,
892 	uint_t		flags,
893 	ext_cmpfunc_t	compare
894 )
895 {
896 	sp_ext_node_t	*newext;
897 	sp_ext_node_t	*curext;
898 
899 	assert(head != NULL);
900 
901 	/* Don't bother adding zero length nodes */
902 	if (length == 0ULL)
903 		return;
904 
905 	/* allocate and fill in new ext_node */
906 	newext = Zalloc(sizeof (sp_ext_node_t));
907 
908 	newext->ext_offset = offset;
909 	newext->ext_length = length;
910 	newext->ext_flags = flags;
911 	newext->ext_type = type;
912 	newext->ext_seq = seq;
913 	newext->ext_setp = sp;
914 	newext->ext_namep = np;
915 
916 	/* first node in the list */
917 	if (*head == NULL) {
918 		newext->ext_next = newext->ext_prev = NULL;
919 		*head = newext;
920 	} else if ((*compare)(*head, newext) >= 0) {
921 		/* the first node has a bigger offset, so insert before it */
922 		assert((*head)->ext_prev == NULL);
923 
924 		newext->ext_prev = NULL;
925 		newext->ext_next = *head;
926 		(*head)->ext_prev = newext;
927 		*head = newext;
928 	} else {
929 		/*
930 		 * find the next node whose offset is greater than
931 		 * the one we want to insert, or the end of the list.
932 		 */
933 		for (curext = *head;
934 		    (curext->ext_next != NULL) &&
935 		    ((*compare)(curext->ext_next, newext) < 0);
936 		    (curext = curext->ext_next))
937 			;
938 
939 		/* link the new node in after the current node */
940 		newext->ext_next = curext->ext_next;
941 		newext->ext_prev = curext;
942 
943 		if (curext->ext_next != NULL)
944 			curext->ext_next->ext_prev = newext;
945 
946 		curext->ext_next = newext;
947 	}
948 }
949 
950 /*
951  * FUNCTION:	meta_sp_list_free()
952  * INPUT:	head	- the head of the list, must be NULL for empty list
953  * OUTPUT:	head	- points to NULL on return
954  * RETURNS:	void
955  * PURPOSE:	walks a double linked extent list and frees each node
956  */
957 static void
958 meta_sp_list_free(sp_ext_node_t **head)
959 {
960 	sp_ext_node_t	*ext;
961 	sp_ext_node_t	*next;
962 
963 	assert(head != NULL);
964 
965 	ext = *head;
966 	while (ext) {
967 		next = ext->ext_next;
968 		Free(ext);
969 		ext = next;
970 	}
971 	*head = NULL;
972 }
973 
974 /*
975  * FUNCTION:	meta_sp_list_remove()
976  * INPUT:	head	- the head of the list, must be NULL for empty list
977  *		ext	- the extent to remove, must be a member of the list
978  * OUTPUT:	head	- points to the new head of the list
979  * RETURNS:	void
980  * PURPOSE:	unlinks the node specified by ext from the list and
981  *		frees it, possibly moving the head pointer forward if
982  *		the head is the node being removed.
983  */
984 static void
985 meta_sp_list_remove(sp_ext_node_t **head, sp_ext_node_t *ext)
986 {
987 	assert(head != NULL);
988 	assert(*head != NULL);
989 
990 	if (*head == ext)
991 		*head = ext->ext_next;
992 
993 	if (ext->ext_prev != NULL)
994 		ext->ext_prev->ext_next = ext->ext_next;
995 	if (ext->ext_next != NULL)
996 		ext->ext_next->ext_prev = ext->ext_prev;
997 	Free(ext);
998 }
999 
1000 /*
1001  * FUNCTION:	meta_sp_list_size()
1002  * INPUT:	head	- the head of the list, must be NULL for empty list
1003  *		exttype	- the type of the extents to sum
1004  *		exclude_wm - subtract space for extent headers from total
1005  * OUTPUT:	none
1006  * RETURNS:	sp_ext_length_t	- the sum of all of the lengths
1007  * PURPOSE:	sums the lengths of all extents in the list matching the
1008  *		specified type.  This could be used for computing the
1009  *		amount of free or used space, for example.
1010  */
1011 static sp_ext_length_t
1012 meta_sp_list_size(sp_ext_node_t *head, sp_ext_type_t exttype, int exclude_wm)
1013 {
1014 	sp_ext_node_t	*ext;
1015 	sp_ext_length_t	size = 0LL;
1016 
1017 	for (ext = head; ext != NULL; ext = ext->ext_next)
1018 		if (ext->ext_type == exttype)
1019 			size += ext->ext_length -
1020 			    ((exclude_wm) ? MD_SP_WMSIZE : 0);
1021 
1022 	return (size);
1023 }
1024 
1025 /*
1026  * FUNCTION:	meta_sp_list_find()
1027  * INPUT:	head	- the head of the list, must be NULL for empty list
1028  *		offset	- the offset contained by the node to find
1029  * OUTPUT:	none
1030  * RETURNS:	sp_ext_node_t *	- the node containing the requested offset
1031  *				  or NULL if no such nodes were found.
1032  * PURPOSE:	finds a node in a list containing the requested offset
1033  *		(inclusive).  If multiple nodes contain this offset then
1034  *		only the first will be returned, though typically these
1035  *		lists are managed with non-overlapping nodes.
1036  *
1037  *		*The list MUST be sorted by offset for this function to work.*
1038  */
1039 static sp_ext_node_t *
1040 meta_sp_list_find(
1041 	sp_ext_node_t	*head,
1042 	sp_ext_offset_t	offset
1043 )
1044 {
1045 	sp_ext_node_t	*ext;
1046 
1047 	for (ext = head; ext != NULL; ext = ext->ext_next) {
1048 		/* check if the offset lies within this extent */
1049 		if ((offset >= ext->ext_offset) &&
1050 		    (offset < ext->ext_offset + ext->ext_length)) {
1051 			/*
1052 			 * the requested extent should always be a
1053 			 * subset of an extent in the list.
1054 			 */
1055 			return (ext);
1056 		}
1057 	}
1058 	return (NULL);
1059 }
1060 
1061 /*
1062  * FUNCTION:	meta_sp_list_freefill()
1063  * INPUT:	head	- the head of the list, must be NULL for empty list
1064  *		size	- the size of the volume this extent list is
1065  *			  representing
1066  * OUTPUT:	head	- the new head of the list
1067  * RETURNS:	void
1068  * PURPOSE:	finds gaps in the extent list and fills them with a free
1069  *		node.  If there is a gap at the beginning the head
1070  *		pointer will be changed to point to the new free node.
1071  *		If there is free space at the end, the last free extent
1072  *		will extend all the way out to the size specified.
1073  *
1074  *		*The list MUST be sorted by offset for this function to work.*
1075  */
1076 static void
1077 meta_sp_list_freefill(
1078 	sp_ext_node_t	**head,
1079 	sp_ext_length_t	size
1080 )
1081 {
1082 	sp_ext_node_t	*ext;
1083 	sp_ext_offset_t	curoff = 0LL;
1084 
1085 	for (ext = *head; ext != NULL; ext = ext->ext_next) {
1086 		if (curoff < ext->ext_offset)
1087 			meta_sp_list_insert(NULL, NULL, head,
1088 			    curoff, ext->ext_offset - curoff,
1089 			    EXTTYP_FREE, 0, 0, meta_sp_cmp_by_offset);
1090 		curoff = ext->ext_offset + ext->ext_length;
1091 	}
1092 
1093 	/* pad inverse list out to the end */
1094 	if (curoff < size)
1095 		meta_sp_list_insert(NULL, NULL, head, curoff, size - curoff,
1096 		    EXTTYP_FREE, 0, 0, meta_sp_cmp_by_offset);
1097 
1098 	if (getenv(META_SP_DEBUG)) {
1099 		meta_sp_debug("meta_sp_list_freefill: Extent list with "
1100 		    "holes freefilled:\n");
1101 		meta_sp_list_dump(*head);
1102 	}
1103 }
1104 
1105 /*
1106  * FUNCTION:	meta_sp_list_dump()
1107  * INPUT:	head	- the head of the list, must be NULL for empty list
1108  * OUTPUT:	none
1109  * RETURNS:	void
1110  * PURPOSE:	dumps the entire extent list to stdout for easy debugging
1111  */
1112 static void
1113 meta_sp_list_dump(sp_ext_node_t *head)
1114 {
1115 	sp_ext_node_t	*ext;
1116 
1117 	meta_sp_debug("meta_sp_list_dump: dumping extent list:\n");
1118 	meta_sp_debug("%5s %10s %5s %7s %10s %10s %5s %10s %10s\n", "Name",
1119 	    "Addr", "Seq#", "Type", "Offset", "Length", "Flags", "Prev",
1120 	    "Next");
1121 	for (ext = head; ext != NULL; ext = ext->ext_next) {
1122 		if (ext->ext_namep != NULL)
1123 			meta_sp_debug("%5s", ext->ext_namep->cname);
1124 		else
1125 			meta_sp_debug("%5s", "NONE");
1126 
1127 		meta_sp_debug("%10p %5u ", (void *) ext, ext->ext_seq);
1128 		switch (ext->ext_type) {
1129 		case EXTTYP_ALLOC:
1130 			meta_sp_debug("%7s ", "ALLOC");
1131 			break;
1132 		case EXTTYP_FREE:
1133 			meta_sp_debug("%7s ", "FREE");
1134 			break;
1135 		case EXTTYP_END:
1136 			meta_sp_debug("%7s ", "END");
1137 			break;
1138 		case EXTTYP_RESERVED:
1139 			meta_sp_debug("%7s ", "RESV");
1140 			break;
1141 		default:
1142 			meta_sp_debug("%7s ", "INVLD");
1143 			break;
1144 		}
1145 
1146 		meta_sp_debug("%10llu %10llu %5u %10p %10p\n",
1147 		    ext->ext_offset, ext->ext_length,
1148 		    ext->ext_flags, (void *) ext->ext_prev,
1149 		    (void *) ext->ext_next);
1150 	}
1151 	meta_sp_debug("\n");
1152 }
1153 
1154 /*
1155  * FUNCTION:	meta_sp_list_overlaps()
1156  * INPUT:	head	- the head of the list, must be NULL for empty list
1157  * OUTPUT:	none
1158  * RETURNS:	int	- 1 if extents overlap, 0 if ok
1159  * PURPOSE:	checks a list for overlaps.  The list MUST be sorted by
1160  *		offset for this function to work properly.
1161  */
1162 static int
1163 meta_sp_list_overlaps(sp_ext_node_t *head)
1164 {
1165 	sp_ext_node_t	*ext;
1166 
1167 	for (ext = head; ext->ext_next != NULL; ext = ext->ext_next) {
1168 		if (ext->ext_offset + ext->ext_length >
1169 		    ext->ext_next->ext_offset)
1170 			return (1);
1171 	}
1172 	return (0);
1173 }
1174 
1175 /*
1176  * **************************************************************************
1177  *                        Extent Allocation Functions                       *
1178  * **************************************************************************
1179  */
1180 
1181 /*
1182  * FUNCTION:	meta_sp_alloc_by_ext()
1183  * INPUT:	sp	- the set name for the device the node belongs to
1184  *		np	- the name of the device the node belongs to
1185  *		head	- the head of the list, must be NULL for empty list
1186  *		free_ext	- the free extent being allocated from
1187  *		alloc_offset	- the offset of the allocation
1188  *		alloc_len	- the length of the allocation
1189  *		seq		- the sequence number of the allocation
1190  * OUTPUT:	head	- the new head pointer
1191  * RETURNS:	void
1192  * PURPOSE:	allocates a portion of the free extent free_ext.  The
1193  *		allocated portion starts at alloc_offset and is
1194  *		alloc_length long.  Both (alloc_offset) and (alloc_offset +
1195  *		alloc_length) must be contained within the free extent.
1196  *
1197  *		The free extent is split into as many as 3 pieces - a
1198  *		free extent containing [ free_offset .. alloc_offset ), an
1199  *		allocated extent containing the range [ alloc_offset ..
1200  *		alloc_end ], and another free extent containing the
1201  *		range ( alloc_end .. free_end ].  If either of the two
1202  *		new free extents would be zero length, they are not created.
1203  *
1204  *		Finally, the original free extent is removed.  All newly
1205  *		created extents have the EXTFLG_UPDATE flag set.
1206  */
1207 static void
1208 meta_sp_alloc_by_ext(
1209 	mdsetname_t	*sp,
1210 	mdname_t	*np,
1211 	sp_ext_node_t	**head,
1212 	sp_ext_node_t	*free_ext,
1213 	sp_ext_offset_t	alloc_offset,
1214 	sp_ext_length_t	alloc_length,
1215 	uint_t		seq
1216 )
1217 {
1218 	sp_ext_offset_t	free_offset = free_ext->ext_offset;
1219 	sp_ext_length_t	free_length = free_ext->ext_length;
1220 
1221 	sp_ext_offset_t	alloc_end = alloc_offset + alloc_length;
1222 	sp_ext_offset_t	free_end  = free_offset  + free_length;
1223 
1224 	/* allocated extent must be a subset of the free extent */
1225 	assert(free_offset <= alloc_offset);
1226 	assert(free_end >= alloc_end);
1227 
1228 	meta_sp_list_remove(head, free_ext);
1229 
1230 	if (free_offset < alloc_offset) {
1231 		meta_sp_list_insert(NULL, NULL, head, free_offset,
1232 		    (alloc_offset - free_offset), EXTTYP_FREE, 0,
1233 		    EXTFLG_UPDATE, meta_sp_cmp_by_offset);
1234 	}
1235 
1236 	if (free_end > alloc_end) {
1237 		meta_sp_list_insert(NULL, NULL, head, alloc_end,
1238 		    (free_end - alloc_end), EXTTYP_FREE, 0, EXTFLG_UPDATE,
1239 		    meta_sp_cmp_by_offset);
1240 	}
1241 
1242 	meta_sp_list_insert(sp, np, head, alloc_offset, alloc_length,
1243 	    EXTTYP_ALLOC, seq, EXTFLG_UPDATE, meta_sp_cmp_by_offset);
1244 
1245 	if (getenv(META_SP_DEBUG)) {
1246 		meta_sp_debug("meta_sp_alloc_by_ext: extent list:\n");
1247 		meta_sp_list_dump(*head);
1248 	}
1249 }
1250 
1251 /*
1252  * FUNCTION:	meta_sp_alloc_by_len()
1253  * INPUT:	sp	- the set name for the device the node belongs to
1254  *		np	- the name of the device the node belongs to
1255  *		head	- the head of the list, must be NULL for empty list
1256  *		*lp	- the requested length to allocate
1257  *		last_off	- the last offset already allocated.
1258  *		alignment	- the desired extent alignmeent
1259  * OUTPUT:	head	- the new head pointer
1260  *		*lp	- the length allocated
1261  * RETURNS:	int	- -1 if error, the number of new extents on success
1262  * PURPOSE:	allocates extents from free space to satisfy the requested
1263  *		length.  If requested length is zero, allocates all
1264  *		remaining free space.  This function provides the meat
1265  *		of the extent allocation algorithm.  Allocation is a
1266  *		three tier process:
1267  *
1268  *		1. If last_off is nonzero and there is free space following
1269  *		   that node, then it is extended to allocate as much of that
1270  *		   free space as possible.  This is useful for metattach.
1271  *		2. If a free extent can be found to satisfy the remaining
1272  *		   requested space, then satisfy the rest of the request
1273  *		   from that extent.
1274  *		3. Start allocating space from any remaining free extents until
1275  *		   the remainder of the request is satisified.
1276  *
1277  *              If alignment is non-zero, then every extent modified
1278  *              or newly allocated will be aligned modulo alignment,
1279  *              with a length that is an integer multiple of
1280  *              alignment.
1281  *
1282  *		The EXTFLG_UPDATE flag is set for all nodes (free and
1283  *		allocated) that require updated watermarks.
1284  *
1285  *		This algorithm may have a negative impact on fragmentation
1286  *		in pathological cases and may be improved if it turns out
1287  *		to be a problem.  This may be exacerbated by particularly
1288  *		large alignments.
1289  *
1290  * NOTE:	It's confusing, so it demands an explanation:
1291  *		- len is used to represent requested data space; it
1292  *		  does not include room for a watermark.  On each full
1293  *		  or partial allocation, len will be decremented by
1294  *		  alloc_len (see next paragraph) until it reaches
1295  *		  zero.
1296  *		- alloc_len is used to represent data space allocated
1297  *		  from a particular extent; it does not include space
1298  *		  for a watermark.  In the rare event that a_length
1299  *		  (see next paragraph) is equal to MD_SP_WMSIZE,
1300  *		  alloc_len will be zero and the resulting MD_SP_WMSIZE
1301  *		  fragment of space will be utterly unusable.
1302  *		- a_length is used to represent all space to be
1303  *		  allocated from a particular extent; it DOES include
1304  *		  space for a watermark.
1305  */
1306 static int
1307 meta_sp_alloc_by_len(
1308 	mdsetname_t	*sp,
1309 	mdname_t	*np,
1310 	sp_ext_node_t	**head,
1311 	sp_ext_length_t	*lp,
1312 	sp_ext_offset_t	last_off,
1313 	sp_ext_offset_t	alignment
1314 )
1315 {
1316 	sp_ext_node_t	*free_ext;
1317 	sp_ext_node_t	*alloc_ext;
1318 	uint_t		last_seq = 0;
1319 	uint_t		numexts = 0;
1320 	sp_ext_length_t	freespace;
1321 	sp_ext_length_t	alloc_len;
1322 	sp_ext_length_t	len;
1323 
1324 	/* We're DOA if we can't read *lp */
1325 	assert(lp != NULL);
1326 	len = *lp;
1327 
1328 	/*
1329 	 * Process the nominal case first: we've been given an actual
1330 	 * size argument, rather than the literal "all"
1331 	 */
1332 
1333 	if (len != 0) {
1334 
1335 		/*
1336 		 * Short circuit the check for free space.  This may
1337 		 * tell us we have enough space when we really don't
1338 		 * because each extent loses space to a watermark, but
1339 		 * it will always tell us there isn't enough space
1340 		 * correctly.  Worst case we do some extra work.
1341 		 */
1342 		freespace = meta_sp_list_size(*head, EXTTYP_FREE,
1343 		    INCLUDE_WM);
1344 
1345 		if (freespace < len)
1346 			return (-1);
1347 
1348 		/*
1349 		 * First see if we can extend the last extent for an
1350 		 * attach.
1351 		 */
1352 		if (last_off != 0LL) {
1353 			int align = 0;
1354 
1355 			alloc_ext =
1356 			    meta_sp_list_find(*head, last_off);
1357 			assert(alloc_ext != NULL);
1358 
1359 			/*
1360 			 * The offset test reflects the
1361 			 * inclusion of the watermark in the extent
1362 			 */
1363 			align = (alignment > 0) &&
1364 			    (((alloc_ext->ext_offset + MD_SP_WMSIZE) %
1365 				alignment) == 0);
1366 
1367 			/*
1368 			 * If we decided not to align here, we should
1369 			 * also reset "alignment" so we don't bother
1370 			 * later, either.
1371 			 */
1372 			if (!align) {
1373 				alignment = 0;
1374 			}
1375 
1376 			last_seq = alloc_ext->ext_seq;
1377 
1378 			free_ext = meta_sp_list_find(*head,
1379 			    alloc_ext->ext_offset +
1380 			    alloc_ext->ext_length);
1381 
1382 			/*
1383 			 * If a free extent follows our last allocated
1384 			 * extent, then remove the last allocated
1385 			 * extent and increase the size of the free
1386 			 * extent to overlap it, then allocate the
1387 			 * total space from the new free extent.
1388 			 */
1389 			if (free_ext != NULL &&
1390 			    free_ext->ext_type == EXTTYP_FREE) {
1391 				assert(free_ext->ext_offset ==
1392 				    alloc_ext->ext_offset +
1393 				    alloc_ext->ext_length);
1394 
1395 				alloc_len =
1396 				    MIN(len, free_ext->ext_length);
1397 
1398 				if (align && (alloc_len < len)) {
1399 					/* No watermark space needed */
1400 					alloc_len -= alloc_len % alignment;
1401 				}
1402 
1403 				if (alloc_len > 0) {
1404 					free_ext->ext_offset -=
1405 					    alloc_ext->ext_length;
1406 					free_ext->ext_length +=
1407 					    alloc_ext->ext_length;
1408 
1409 					meta_sp_alloc_by_ext(sp, np, head,
1410 					    free_ext, free_ext->ext_offset,
1411 					    alloc_ext->ext_length + alloc_len,
1412 					    last_seq);
1413 
1414 					/*
1415 					 * now remove the original allocated
1416 					 * node.  We may have overlapping
1417 					 * extents for a short time before
1418 					 * this node is removed.
1419 					 */
1420 					meta_sp_list_remove(head, alloc_ext);
1421 					len -= alloc_len;
1422 				}
1423 			}
1424 			last_seq++;
1425 		}
1426 
1427 		if (len == 0LL)
1428 			goto out;
1429 
1430 		/*
1431 		 * Next, see if we can find a single allocation for
1432 		 * the remainder.  This may make fragmentation worse
1433 		 * in some cases, but there's no good way to allocate
1434 		 * that doesn't have a highly fragmented corner case.
1435 		 */
1436 		for (free_ext = *head; free_ext != NULL;
1437 			free_ext = free_ext->ext_next) {
1438 			sp_ext_offset_t	a_offset;
1439 			sp_ext_offset_t	a_length;
1440 
1441 			if (free_ext->ext_type != EXTTYP_FREE)
1442 				continue;
1443 
1444 			/*
1445 			 * The length test should include space for
1446 			 * the watermark
1447 			 */
1448 
1449 			a_offset = free_ext->ext_offset;
1450 			a_length = free_ext->ext_length;
1451 
1452 			if (alignment > 0) {
1453 
1454 				/*
1455 				 * Shortcut for extents that have been
1456 				 * previously added to pad out the
1457 				 * data space
1458 				 */
1459 				if (a_length < alignment) {
1460 					continue;
1461 				}
1462 
1463 				/*
1464 				 * Round up so the data space begins
1465 				 * on a properly aligned boundary.
1466 				 */
1467 				a_offset += alignment -
1468 				    (a_offset % alignment) - MD_SP_WMSIZE;
1469 
1470 				/*
1471 				 * This is only necessary in case the
1472 				 * watermark size is ever greater than
1473 				 * one.  It'll never happen, of
1474 				 * course; we'll get rid of watermarks
1475 				 * before we make 'em bigger.
1476 				 */
1477 				if (a_offset < free_ext->ext_offset) {
1478 					a_offset += alignment;
1479 				}
1480 
1481 				/*
1482 				 * Adjust the length to account for
1483 				 * the space lost above (if any)
1484 				 */
1485 				a_length -=
1486 					(a_offset - free_ext->ext_offset);
1487 			}
1488 
1489 			if (a_length >= len + MD_SP_WMSIZE) {
1490 				meta_sp_alloc_by_ext(sp, np, head,
1491 					free_ext, a_offset,
1492 					len + MD_SP_WMSIZE, last_seq);
1493 
1494 				len = 0LL;
1495 				numexts++;
1496 				break;
1497 			}
1498 		}
1499 
1500 		if (len == 0LL)
1501 			goto out;
1502 
1503 
1504 		/*
1505 		 * If the request could not be satisfied by extending
1506 		 * the last extent or by a single extent, then put
1507 		 * multiple smaller extents together until the request
1508 		 * is satisfied.
1509 		 */
1510 		for (free_ext = *head; (free_ext != NULL) && (len > 0);
1511 			free_ext = free_ext->ext_next) {
1512 			sp_ext_offset_t a_offset;
1513 			sp_ext_length_t a_length;
1514 
1515 			if (free_ext->ext_type != EXTTYP_FREE)
1516 				continue;
1517 
1518 			a_offset = free_ext->ext_offset;
1519 			a_length = free_ext->ext_length;
1520 
1521 			if (alignment > 0) {
1522 
1523 				/*
1524 				 * Shortcut for extents that have been
1525 				 * previously added to pad out the
1526 				 * data space
1527 				 */
1528 				if (a_length < alignment) {
1529 					continue;
1530 				}
1531 
1532 				/*
1533 				 * Round up so the data space begins
1534 				 * on a properly aligned boundary.
1535 				 */
1536 				a_offset += alignment -
1537 					(a_offset % alignment) - MD_SP_WMSIZE;
1538 
1539 				/*
1540 				 * This is only necessary in case the
1541 				 * watermark size is ever greater than
1542 				 * one.  It'll never happen, of
1543 				 * course; we'll get rid of watermarks
1544 				 * before we make 'em bigger.
1545 				 */
1546 				if (a_offset < free_ext->ext_offset) {
1547 					a_offset += alignment;
1548 				}
1549 
1550 				/*
1551 				 * Adjust the length to account for
1552 				 * the space lost above (if any)
1553 				 */
1554 				a_length -=
1555 					(a_offset - free_ext->ext_offset);
1556 
1557 				/*
1558 				 * Adjust the length to be properly
1559 				 * aligned if it is NOT to be the
1560 				 * last extent in the soft partition.
1561 				 */
1562 				if ((a_length - MD_SP_WMSIZE) < len)
1563 					a_length -=
1564 						(a_length - MD_SP_WMSIZE)
1565 						% alignment;
1566 			}
1567 
1568 			alloc_len = MIN(len, a_length - MD_SP_WMSIZE);
1569 			if (alloc_len == 0)
1570 				continue;
1571 
1572 			/*
1573 			 * meta_sp_alloc_by_ext() expects the
1574 			 * allocation length to include the watermark
1575 			 * size, which is why we don't simply pass in
1576 			 * alloc_len here.
1577 			 */
1578 			meta_sp_alloc_by_ext(sp, np, head, free_ext,
1579 				a_offset, MIN(len + MD_SP_WMSIZE, a_length),
1580 				last_seq);
1581 
1582 			len -= alloc_len;
1583 			numexts++;
1584 			last_seq++;
1585 		}
1586 
1587 
1588 		/*
1589 		 * If there was not enough space we can throw it all
1590 		 * away since no real work has been done yet.
1591 		 */
1592 		if (len != 0) {
1593 			meta_sp_list_free(head);
1594 			return (-1);
1595 		}
1596 	}
1597 
1598 	/*
1599 	 * Otherwise, the literal "all" was specified: allocate all
1600 	 * available free space.  Don't bother with alignment.
1601 	 */
1602 	else {
1603 		/* First, extend the last extent if this is a grow */
1604 		if (last_off != 0LL) {
1605 			alloc_ext =
1606 				meta_sp_list_find(*head, last_off);
1607 			assert(alloc_ext != NULL);
1608 
1609 			last_seq = alloc_ext->ext_seq;
1610 
1611 			free_ext = meta_sp_list_find(*head,
1612 				alloc_ext->ext_offset +
1613 				alloc_ext->ext_length);
1614 
1615 			/*
1616 			 * If a free extent follows our last allocated
1617 			 * extent, then remove the last allocated
1618 			 * extent and increase the size of the free
1619 			 * extent to overlap it, then allocate the
1620 			 * total space from the new free extent.
1621 			 */
1622 			if (free_ext != NULL &&
1623 			    free_ext->ext_type == EXTTYP_FREE) {
1624 				assert(free_ext->ext_offset ==
1625 				    alloc_ext->ext_offset +
1626 				    alloc_ext->ext_length);
1627 
1628 				len = alloc_len =
1629 				    free_ext->ext_length;
1630 
1631 				free_ext->ext_offset -=
1632 				    alloc_ext->ext_length;
1633 				free_ext->ext_length +=
1634 				    alloc_ext->ext_length;
1635 
1636 				meta_sp_alloc_by_ext(sp, np, head,
1637 				    free_ext, free_ext->ext_offset,
1638 				    alloc_ext->ext_length + alloc_len,
1639 				    last_seq);
1640 
1641 				/*
1642 				 * now remove the original allocated
1643 				 * node.  We may have overlapping
1644 				 * extents for a short time before
1645 				 * this node is removed.
1646 				 */
1647 				meta_sp_list_remove(head, alloc_ext);
1648 			}
1649 
1650 			last_seq++;
1651 		}
1652 
1653 		/* Next, grab all remaining free space */
1654 		for (free_ext = *head; free_ext != NULL;
1655 			free_ext = free_ext->ext_next) {
1656 
1657 			if (free_ext->ext_type == EXTTYP_FREE) {
1658 				alloc_len =
1659 				    free_ext->ext_length - MD_SP_WMSIZE;
1660 				if (alloc_len == 0)
1661 					continue;
1662 
1663 				/*
1664 				 * meta_sp_alloc_by_ext() expects the
1665 				 * allocation length to include the
1666 				 * watermark size, which is why we
1667 				 * don't simply pass in alloc_len
1668 				 * here.
1669 				 */
1670 				meta_sp_alloc_by_ext(sp, np, head,
1671 				    free_ext, free_ext->ext_offset,
1672 				    free_ext->ext_length,
1673 				    last_seq);
1674 
1675 				len += alloc_len;
1676 				numexts++;
1677 				last_seq++;
1678 			}
1679 		}
1680 	}
1681 
1682 out:
1683 	if (getenv(META_SP_DEBUG)) {
1684 		meta_sp_debug("meta_sp_alloc_by_len: Extent list after "
1685 		    "allocation:\n");
1686 		meta_sp_list_dump(*head);
1687 	}
1688 
1689 	if (*lp == 0) {
1690 		*lp = len;
1691 
1692 		/*
1693 		 * Make sure the callers hit a no space error if we
1694 		 * didn't actually find anything.
1695 		 */
1696 		if (len == 0) {
1697 			return (-1);
1698 		}
1699 	}
1700 
1701 	return (numexts);
1702 }
1703 
1704 /*
1705  * FUNCTION:	meta_sp_alloc_by_list()
1706  * INPUT:	sp	- the set name for the device the node belongs to
1707  *		np	- the name of the device the node belongs to
1708  *		head	- the head of the list, must be NULL for empty list
1709  *		oblist	- an extent list containing requested nodes to allocate
1710  * OUTPUT:	head	- the new head pointer
1711  * RETURNS:	int	- -1 if error, the number of new extents on success
1712  * PURPOSE:	allocates extents from free space to satisfy the requested
1713  *		extent list.  This is primarily used for the -o/-b options
1714  *		where the user may specifically request extents to allocate.
1715  *		Each extent in the oblist must be a subset (inclusive) of a
1716  *		free extent and may not overlap each other.  This
1717  *		function sets the EXTFLG_UPDATE flag for each node that
1718  *		requires a watermark update after allocating.
1719  */
1720 static int
1721 meta_sp_alloc_by_list(
1722 	mdsetname_t	*sp,
1723 	mdname_t	*np,
1724 	sp_ext_node_t	**head,
1725 	sp_ext_node_t	*oblist
1726 )
1727 {
1728 	sp_ext_node_t	*ext;
1729 	sp_ext_node_t	*free_ext;
1730 	uint_t		numexts = 0;
1731 
1732 	for (ext = oblist; ext != NULL; ext = ext->ext_next) {
1733 
1734 		free_ext = meta_sp_list_find(*head,
1735 		    ext->ext_offset - MD_SP_WMSIZE);
1736 
1737 		/* Make sure the allocation is within the free extent */
1738 		if ((free_ext == NULL) ||
1739 		    (ext->ext_offset + ext->ext_length >
1740 		    free_ext->ext_offset + free_ext->ext_length) ||
1741 		    (free_ext->ext_type != EXTTYP_FREE))
1742 			return (-1);
1743 
1744 		meta_sp_alloc_by_ext(sp, np, head, free_ext,
1745 		    ext->ext_offset - MD_SP_WMSIZE,
1746 		    ext->ext_length + MD_SP_WMSIZE, ext->ext_seq);
1747 
1748 		numexts++;
1749 	}
1750 
1751 	assert(meta_sp_list_overlaps(*head) == 0);
1752 
1753 	if (getenv(META_SP_DEBUG)) {
1754 		meta_sp_debug("meta_sp_alloc_by_list: Extent list after "
1755 		    "allocation:\n");
1756 		meta_sp_list_dump(*head);
1757 	}
1758 
1759 	return (numexts);
1760 }
1761 
1762 /*
1763  * **************************************************************************
1764  *                     Extent List Population Functions                     *
1765  * **************************************************************************
1766  */
1767 
1768 /*
1769  * FUNCTION:	meta_sp_extlist_from_namelist()
1770  * INPUT:	sp	- the set name for the device the node belongs to
1771  *		spnplp	- the namelist of soft partitions to build a list from
1772  * OUTPUT:	extlist	- the extent list built from the SPs in the namelist
1773  *		ep	- return error pointer
1774  * RETURNS:	int	- -1 if error, 0 on success
1775  * PURPOSE:	builds an extent list representing the soft partitions
1776  *		specified in the namelist.  Each extent in each soft
1777  *		partition is added to the list with the type EXTTYP_ALLOC.
1778  *		The EXTFLG_UPDATE flag is not set on any nodes.  Each
1779  *		extent in the list includes the space occupied by the
1780  *		watermark, which is not included in the unit structures.
1781  */
1782 static int
1783 meta_sp_extlist_from_namelist(
1784 	mdsetname_t	*sp,
1785 	mdnamelist_t	*spnlp,
1786 	sp_ext_node_t	**extlist,
1787 	md_error_t	*ep
1788 )
1789 {
1790 	int		extn;
1791 	md_sp_t		*msp;		/* unit structure of the sp's */
1792 	mdnamelist_t	*namep;
1793 
1794 	assert(sp != NULL);
1795 
1796 	/*
1797 	 * Now go through the soft partitions and add a node to the used
1798 	 * list for each allocated extent.
1799 	 */
1800 	for (namep = spnlp; namep != NULL; namep = namep->next) {
1801 		mdname_t	*curnp = namep->namep;
1802 
1803 		/* get the unit structure */
1804 		if ((msp = meta_get_sp_common(sp, curnp, 0, ep)) == NULL)
1805 			return (-1);
1806 
1807 		for (extn = 0; (extn < msp->ext.ext_len); extn++) {
1808 			md_sp_ext_t	*extp = &msp->ext.ext_val[extn];
1809 
1810 			/*
1811 			 * subtract from offset and add to the length
1812 			 * to account for the watermark, which is not
1813 			 * contained in the extents in the unit structure.
1814 			 */
1815 			meta_sp_list_insert(sp, curnp, extlist,
1816 			    extp->poff - MD_SP_WMSIZE, extp->len + MD_SP_WMSIZE,
1817 			    EXTTYP_ALLOC, extn, 0, meta_sp_cmp_by_offset);
1818 		}
1819 	}
1820 	return (0);
1821 }
1822 
1823 /*
1824  * FUNCTION:	meta_sp_extlist_from_wm()
1825  * INPUT:	sp	- the set name for the device the node belongs to
1826  *		compnp	- the name of the device to scan watermarks on
1827  * OUTPUT:	extlist	- the extent list built from the SPs in the namelist
1828  *		ep	- return error pointer
1829  * RETURNS:	int	- -1 if error, 0 on success
1830  * PURPOSE:	builds an extent list representing the soft partitions
1831  *		specified in the namelist.  Each extent in each soft
1832  *		partition is added to the list with the type EXTTYP_ALLOC.
1833  *		The EXTFLG_UPDATE flag is not set on any nodes.  Each
1834  *		extent in the list includes the space occupied by the
1835  *		watermark, which is not included in the unit structures.
1836  */
1837 static int
1838 meta_sp_extlist_from_wm(
1839 	mdsetname_t	*sp,
1840 	mdname_t	*compnp,
1841 	sp_ext_node_t	**extlist,
1842 	ext_cmpfunc_t	compare,
1843 	md_error_t	*ep
1844 )
1845 {
1846 	mp_watermark_t	wm;
1847 	mdname_t	*np = NULL;
1848 	mdsetname_t	*spsetp = NULL;
1849 	sp_ext_offset_t	cur_off;
1850 	md_set_desc	*sd;
1851 	int		init = 0;
1852 	mdkey_t		key;
1853 	minor_t		mnum;
1854 
1855 	if (!metaislocalset(sp)) {
1856 		if ((sd = metaget_setdesc(sp, ep)) == NULL)
1857 			return (-1);
1858 	}
1859 
1860 	if ((cur_off = meta_sp_get_start(sp, compnp, ep)) == MD_DISKADDR_ERROR)
1861 		return (-1);
1862 
1863 	for (;;) {
1864 		if (meta_sp_read_wm(sp, compnp, &wm, cur_off, ep) != 0) {
1865 			return (-1);
1866 		}
1867 
1868 		/* get the set and name pointers */
1869 		if (strcmp(wm.wm_setname, MD_SP_LOCALSETNAME) != 0) {
1870 			if ((spsetp = metasetname(wm.wm_setname, ep)) == NULL) {
1871 				return (-1);
1872 			}
1873 		}
1874 
1875 		/*
1876 		 * For the MN set, meta_init_make_device needs to
1877 		 * be run on all the nodes so the entries for the
1878 		 * softpart device name and its comp can be created
1879 		 * in the same order in the replica namespace.  If
1880 		 * we have it run on mdmn_do_iocset then the mddbs
1881 		 * will be out of sync between master node and slave
1882 		 * nodes.
1883 		 */
1884 		if (strcmp(wm.wm_mdname, MD_SP_FREEWMNAME) != 0) {
1885 
1886 		    if (!metaislocalset(sp) && MD_MNSET_DESC(sd)) {
1887 			md_mn_msg_addmdname_t	*send_params;
1888 			int			result;
1889 			md_mn_result_t		*resp = NULL;
1890 			int			message_size;
1891 
1892 			message_size =  sizeof (*send_params) +
1893 			    strlen(wm.wm_mdname) + 1;
1894 			send_params = Zalloc(message_size);
1895 			send_params->addmdname_setno = sp->setno;
1896 			(void) strcpy(&send_params->addmdname_name[0],
1897 			    wm.wm_mdname);
1898 			result = mdmn_send_message(sp->setno,
1899 			    MD_MN_MSG_ADDMDNAME,
1900 			    MD_MSGF_PANIC_WHEN_INCONSISTENT,
1901 			    (char *)send_params, message_size, &resp,
1902 			    ep);
1903 			Free(send_params);
1904 			if (resp != NULL) {
1905 				if (resp->mmr_exitval != 0) {
1906 					free_result(resp);
1907 					return (-1);
1908 				}
1909 				free_result(resp);
1910 			}
1911 			if (result != 0)
1912 				return (-1);
1913 		    } else {
1914 
1915 			if (!is_existing_meta_hsp(sp, wm.wm_mdname)) {
1916 			    if ((key = meta_init_make_device(&sp,
1917 				wm.wm_mdname, ep)) <= 0) {
1918 					return (-1);
1919 				}
1920 				init = 1;
1921 			}
1922 		    }
1923 
1924 		    np = metaname(&spsetp, wm.wm_mdname, META_DEVICE, ep);
1925 		    if (np == NULL) {
1926 			if (init) {
1927 			    if (meta_getnmentbykey(sp->setno, MD_SIDEWILD,
1928 				key, NULL, &mnum, NULL, ep) != NULL) {
1929 				    (void) metaioctl(MD_IOCREM_DEV, &mnum,
1930 						ep, NULL);
1931 			    }
1932 			    (void) del_self_name(sp, key, ep);
1933 			}
1934 			return (-1);
1935 		    }
1936 		}
1937 
1938 		/* insert watermark into extent list */
1939 		meta_sp_list_insert(spsetp, np, extlist, cur_off,
1940 		    wm.wm_length + MD_SP_WMSIZE, wm.wm_type, wm.wm_seq,
1941 		    EXTFLG_UPDATE, compare);
1942 
1943 		/* if we see the end watermark, we're done */
1944 		if (wm.wm_type == EXTTYP_END)
1945 			break;
1946 
1947 		cur_off += wm.wm_length + 1;
1948 
1949 		/* clear out set and name pointers for next iteration */
1950 		np = NULL;
1951 		spsetp = NULL;
1952 	}
1953 
1954 	return (0);
1955 }
1956 
1957 /*
1958  * **************************************************************************
1959  *                        Print (metastat) Functions                        *
1960  * **************************************************************************
1961  */
1962 
1963 /*
1964  * FUNCTION:	meta_sp_short_print()
1965  * INPUT:	msp	- the unit structure to display
1966  *		fp	- the file pointer to send output to
1967  *		options	- print options from the command line processor
1968  * OUTPUT:	ep	- return error pointer
1969  * RETURNS:	int	- -1 if error, 0 on success
1970  * PURPOSE:	display a short report of the soft partition in md.tab
1971  *		form, primarily used for metastat -p.
1972  */
1973 static int
1974 meta_sp_short_print(
1975 	md_sp_t		*msp,
1976 	char		*fname,
1977 	FILE		*fp,
1978 	mdprtopts_t	options,
1979 	md_error_t	*ep
1980 )
1981 {
1982 	int	extn;
1983 
1984 	if (options & PRINT_LARGEDEVICES) {
1985 		if ((msp->common.revision & MD_64BIT_META_DEV) == 0)
1986 			return (0);
1987 	}
1988 
1989 	if (options & PRINT_FN) {
1990 		if ((msp->common.revision & MD_FN_META_DEV) == 0)
1991 			return (0);
1992 	}
1993 
1994 	/* print name and -p */
1995 	if (fprintf(fp, "%s -p", msp->common.namep->cname) == EOF)
1996 		return (mdsyserror(ep, errno, fname));
1997 
1998 	/* print the component */
1999 	/*
2000 	 * Always print the full path name
2001 	 */
2002 	if (fprintf(fp, " %s", msp->compnamep->rname) == EOF)
2003 		return (mdsyserror(ep, errno, fname));
2004 
2005 	/* print out each extent */
2006 	for (extn = 0; (extn < msp->ext.ext_len); extn++) {
2007 		md_sp_ext_t	*extp = &msp->ext.ext_val[extn];
2008 		if (fprintf(fp, " -o %llu -b %llu ", extp->poff,
2009 		    extp->len) == EOF)
2010 			return (mdsyserror(ep, errno, fname));
2011 	}
2012 
2013 	if (fprintf(fp, "\n") == EOF)
2014 		return (mdsyserror(ep, errno, fname));
2015 
2016 	/* success */
2017 	return (0);
2018 }
2019 
2020 /*
2021  * FUNCTION:	meta_sp_status_to_name()
2022  * INPUT:	xsp_status	- the status value to convert to a string
2023  *		tstate		- transient errored device state. If set the
2024  *				  device is Unavailable
2025  * OUTPUT:	none
2026  * RETURNS:	char *	- a pointer to the string representing the status value
2027  * PURPOSE:	return an internationalized string representing the
2028  *		status value for a soft partition.  The strings are
2029  *		strdup'd and must be freed by the caller.
2030  */
2031 static char *
2032 meta_sp_status_to_name(
2033 	xsp_status_t	xsp_status,
2034 	uint_t		tstate
2035 )
2036 {
2037 	char *rval = NULL;
2038 
2039 	/*
2040 	 * Check to see if we have MD_INACCESSIBLE set. This is the only valid
2041 	 * value for an 'Unavailable' return. tstate can be set because of
2042 	 * other multi-node reasons (e.g. ABR being set)
2043 	 */
2044 	if (tstate & MD_INACCESSIBLE) {
2045 		return (Strdup(dgettext(TEXT_DOMAIN, "Unavailable")));
2046 	}
2047 
2048 	switch (xsp_status) {
2049 	case MD_SP_CREATEPEND:
2050 		rval = Strdup(dgettext(TEXT_DOMAIN, "Creating"));
2051 		break;
2052 	case MD_SP_GROWPEND:
2053 		rval = Strdup(dgettext(TEXT_DOMAIN, "Growing"));
2054 		break;
2055 	case MD_SP_DELPEND:
2056 		rval = Strdup(dgettext(TEXT_DOMAIN, "Deleting"));
2057 		break;
2058 	case MD_SP_OK:
2059 		rval = Strdup(dgettext(TEXT_DOMAIN, "Okay"));
2060 		break;
2061 	case MD_SP_ERR:
2062 		rval = Strdup(dgettext(TEXT_DOMAIN, "Errored"));
2063 		break;
2064 	case MD_SP_RECOVER:
2065 		rval = Strdup(dgettext(TEXT_DOMAIN, "Recovering"));
2066 		break;
2067 	}
2068 
2069 	if (rval == NULL)
2070 		rval = Strdup(dgettext(TEXT_DOMAIN, "Invalid"));
2071 
2072 	return (rval);
2073 }
2074 
2075 /*
2076  * FUNCTION:	meta_sp_report()
2077  * INPUT:	sp	- the set name for the unit being displayed
2078  *		msp	- the unit structure to display
2079  *		nlpp	- pass back the large devs
2080  *		fp	- the file pointer to send output to
2081  *		options	- print options from the command line processor
2082  * OUTPUT:	ep	- return error pointer
2083  * RETURNS:	int	- -1 if error, 0 on success
2084  * PURPOSE:	print a full report of the device specified
2085  */
2086 static int
2087 meta_sp_report(
2088 	mdsetname_t	*sp,
2089 	md_sp_t		*msp,
2090 	mdnamelist_t	**nlpp,
2091 	char		*fname,
2092 	FILE		*fp,
2093 	mdprtopts_t	options,
2094 	md_error_t	*ep
2095 )
2096 {
2097 	uint_t		extn;
2098 	char		*status;
2099 	char		*devid = "";
2100 	mdname_t	*didnp = NULL;
2101 	ddi_devid_t	dtp;
2102 	int		len;
2103 	uint_t		tstate = 0;
2104 
2105 	if (options & PRINT_LARGEDEVICES) {
2106 		if ((msp->common.revision & MD_64BIT_META_DEV) == 0) {
2107 			return (0);
2108 		} else {
2109 			if (meta_getdevs(sp, msp->common.namep, nlpp, ep) != 0)
2110 				return (-1);
2111 		}
2112 	}
2113 
2114 	if (options & PRINT_FN) {
2115 		if ((msp->common.revision & MD_FN_META_DEV) == 0) {
2116 			return (0);
2117 		} else {
2118 			if (meta_getdevs(sp, msp->common.namep, nlpp, ep) != 0)
2119 				return (-1);
2120 		}
2121 	}
2122 
2123 	if (options & PRINT_HEADER) {
2124 		if (fprintf(fp, dgettext(TEXT_DOMAIN, "%s: Soft Partition\n"),
2125 		    msp->common.namep->cname) == EOF)
2126 			return (mdsyserror(ep, errno, fname));
2127 	}
2128 
2129 	if (fprintf(fp, dgettext(TEXT_DOMAIN, "    Device: %s\n"),
2130 	    msp->compnamep->cname) == EOF)
2131 		return (mdsyserror(ep, errno, fname));
2132 
2133 	/* Determine if device is available before displaying status */
2134 	if (metaismeta(msp->common.namep)) {
2135 		if (meta_get_tstate(msp->common.namep->dev, &tstate, ep) != 0)
2136 			return (-1);
2137 	}
2138 	status = meta_sp_status_to_name(msp->status, tstate & MD_DEV_ERRORED);
2139 
2140 	/* print out "State" to be consistent with other metadevices */
2141 	if (tstate & MD_ABR_CAP) {
2142 		if (fprintf(fp, dgettext(TEXT_DOMAIN,
2143 		    "    State: %s - Application Based Recovery (ABR)\n"),
2144 		    status) == EOF) {
2145 			Free(status);
2146 			return (mdsyserror(ep, errno, fname));
2147 		}
2148 	} else {
2149 		if (fprintf(fp, dgettext(TEXT_DOMAIN,
2150 		    "    State: %s\n"), status) == EOF) {
2151 			Free(status);
2152 			return (mdsyserror(ep, errno, fname));
2153 		}
2154 	}
2155 	free(status);
2156 
2157 	if (fprintf(fp, dgettext(TEXT_DOMAIN, "    Size: %llu blocks (%s)\n"),
2158 	    msp->common.size,
2159 	    meta_number_to_string(msp->common.size, DEV_BSIZE)) == EOF)
2160 		return (mdsyserror(ep, errno, fname));
2161 
2162 	/* print component details */
2163 	if (! metaismeta(msp->compnamep)) {
2164 		diskaddr_t	start_blk;
2165 		int		has_mddb;
2166 		char		*has_mddb_str;
2167 
2168 		/* print header */
2169 		/*
2170 		 * Building a format string on the fly that will
2171 		 * be used in (f)printf. This allows the length
2172 		 * of the ctd to vary from small to large without
2173 		 * looking horrible.
2174 		 */
2175 		len = strlen(msp->compnamep->cname);
2176 		len = max(len, strlen(dgettext(TEXT_DOMAIN, "Device")));
2177 		len += 2;
2178 		if (fprintf(fp,
2179 		    "\t%-*.*s %-12.12s %-5.5s %s\n",
2180 		    len, len,
2181 		    dgettext(TEXT_DOMAIN, "Device"),
2182 		    dgettext(TEXT_DOMAIN, "Start Block"),
2183 		    dgettext(TEXT_DOMAIN, "Dbase"),
2184 		    dgettext(TEXT_DOMAIN, "Reloc")) == EOF) {
2185 			return (mdsyserror(ep, errno, fname));
2186 		}
2187 
2188 
2189 		/* get info */
2190 		if ((start_blk = meta_sp_get_start(sp, msp->compnamep, ep)) ==
2191 		    MD_DISKADDR_ERROR)
2192 			return (-1);
2193 
2194 		if ((has_mddb = metahasmddb(sp, msp->compnamep, ep)) < 0)
2195 			return (-1);
2196 
2197 		if (has_mddb)
2198 			has_mddb_str = dgettext(TEXT_DOMAIN, "Yes");
2199 		else
2200 			has_mddb_str = dgettext(TEXT_DOMAIN, "No");
2201 
2202 		/* populate the key in the name_p structure */
2203 		didnp = metadevname(&sp, msp->compnamep->dev, ep);
2204 		if (didnp == NULL) {
2205 			return (-1);
2206 		}
2207 
2208 		/* determine if devid does NOT exist */
2209 		if (options & PRINT_DEVID) {
2210 		    if ((dtp = meta_getdidbykey(sp->setno, getmyside(sp, ep),
2211 					didnp->key, ep)) == NULL)
2212 				devid = dgettext(TEXT_DOMAIN, "No ");
2213 			else {
2214 				devid = dgettext(TEXT_DOMAIN, "Yes");
2215 				free(dtp);
2216 			}
2217 		}
2218 
2219 		/* print info */
2220 		/*
2221 		 * This allows the length
2222 		 * of the ctd to vary from small to large without
2223 		 * looking horrible.
2224 		 */
2225 		if (fprintf(fp, "\t%-*s %8lld     %-5.5s %s\n",
2226 		    len, msp->compnamep->cname,
2227 		    start_blk, has_mddb_str, devid) == EOF) {
2228 			return (mdsyserror(ep, errno, fname));
2229 		}
2230 		(void) fprintf(fp, "\n");
2231 	}
2232 
2233 
2234 	/* print the headers */
2235 	if (fprintf(fp, "\t%6.6s %24.24s %24.24s\n",
2236 	    dgettext(TEXT_DOMAIN, "Extent"),
2237 	    dgettext(TEXT_DOMAIN, "Start Block"),
2238 	    dgettext(TEXT_DOMAIN, "Block count")) == EOF)
2239 		return (mdsyserror(ep, errno, fname));
2240 
2241 	/* print out each extent */
2242 	for (extn = 0; (extn < msp->ext.ext_len); extn++) {
2243 		md_sp_ext_t	*extp = &msp->ext.ext_val[extn];
2244 
2245 		/* If PRINT_TIMES option is ever supported, add output here */
2246 		if (fprintf(fp, "\t%6u %24llu %24llu\n",
2247 		    extn, extp->poff, extp->len) == EOF)
2248 			return (mdsyserror(ep, errno, fname));
2249 	}
2250 
2251 	/* separate records with a newline */
2252 	(void) fprintf(fp, "\n");
2253 	return (0);
2254 }
2255 
2256 /*
2257  * FUNCTION:	meta_sp_print()
2258  * INPUT:	sp	- the set name for the unit being displayed
2259  *		np	- the name of the device to print
2260  *		fname	- ??? not used
2261  *		fp	- the file pointer to send output to
2262  *		options	- print options from the command line processor
2263  * OUTPUT:	ep	- return error pointer
2264  * RETURNS:	int	- -1 if error, 0 on success
2265  * PURPOSE:	print a full report of the device specified by metastat.
2266  *		This is the main entry point for printing.
2267  */
2268 int
2269 meta_sp_print(
2270 	mdsetname_t	*sp,
2271 	mdname_t	*np,
2272 	mdnamelist_t	**nlpp,
2273 	char		*fname,
2274 	FILE		*fp,
2275 	mdprtopts_t	options,
2276 	md_error_t	*ep
2277 )
2278 {
2279 	md_sp_t		*msp;
2280 	md_unit_t	*mdp;
2281 	int		rval = 0;
2282 
2283 	/* should always have the same set */
2284 	assert(sp != NULL);
2285 
2286 	/* print all the soft partitions */
2287 	if (np == NULL) {
2288 		mdnamelist_t	*nlp = NULL;
2289 		mdnamelist_t	*p;
2290 		int		cnt;
2291 
2292 		if ((cnt = meta_get_sp_names(sp, &nlp, options, ep)) < 0)
2293 			return (-1);
2294 		else if (cnt == 0)
2295 			return (0);
2296 
2297 		/* recusively print them out */
2298 		for (p = nlp; (p != NULL); p = p->next) {
2299 			mdname_t	*curnp = p->namep;
2300 
2301 			/*
2302 			 * one problem with the rval of -1 here is that
2303 			 * the error gets "lost" when the next device is
2304 			 * printed, but we want to print them all anyway.
2305 			 */
2306 			rval = meta_sp_print(sp, curnp, nlpp, fname, fp,
2307 			    options, ep);
2308 		}
2309 
2310 		/* clean up, return success */
2311 		metafreenamelist(nlp);
2312 		return (rval);
2313 	}
2314 
2315 	/* get the unit structure */
2316 	if ((msp = meta_get_sp_common(sp, np,
2317 	    ((options & PRINT_FAST) ? 1 : 0), ep)) == NULL)
2318 		return (-1);
2319 
2320 	/* check for parented */
2321 	if ((! (options & PRINT_SUBDEVS)) &&
2322 	    (MD_HAS_PARENT(msp->common.parent))) {
2323 		return (0);
2324 	}
2325 
2326 	/* print appropriate detail */
2327 	if (options & PRINT_SHORT) {
2328 		if (meta_sp_short_print(msp, fname, fp, options, ep) != 0)
2329 			return (-1);
2330 	} else {
2331 		if (meta_sp_report(sp, msp, nlpp, fname, fp, options, ep) != 0)
2332 			return (-1);
2333 	}
2334 
2335 	/*
2336 	 * Print underlying metadevices if they are parented to us and
2337 	 * if the info for the underlying metadevice has not been printed.
2338 	 */
2339 	if (metaismeta(msp->compnamep)) {
2340 		/* get the unit structure for the subdevice */
2341 		if ((mdp = meta_get_mdunit(sp, msp->compnamep, ep)) == NULL)
2342 			return (-1);
2343 
2344 		/* If info not already printed, recurse */
2345 		if (!BT_TEST(sp_parent_printed, MD_MIN2UNIT(MD_SID(mdp)))) {
2346 			if (meta_print_name(sp, msp->compnamep, nlpp, fname, fp,
2347 			    (options | PRINT_HEADER | PRINT_SUBDEVS),
2348 			    NULL, ep) != 0) {
2349 				return (-1);
2350 			}
2351 			BT_SET(sp_parent_printed, MD_MIN2UNIT(MD_SID(mdp)));
2352 		}
2353 	}
2354 	return (0);
2355 }
2356 
2357 /*
2358  * **************************************************************************
2359  *                     Watermark Manipulation Functions                     *
2360  * **************************************************************************
2361  */
2362 
2363 /*
2364  * FUNCTION:	meta_sp_get_start()
2365  * INPUT:	sp	- the operating set
2366  *		np 	- device upon which the sp is being built
2367  * OUTPUT:	ep	- return error pointer
2368  * RETURNS:	daddr_t	- -1 if error, otherwise the start block
2369  * PURPOSE:	Encapsulate the determination of the start block of the
2370  *		device upon which the sp is built or being built.
2371  */
2372 static diskaddr_t
2373 meta_sp_get_start(
2374 	mdsetname_t	*sp,
2375 	mdname_t	*np,
2376 	md_error_t	*ep
2377 )
2378 {
2379 	daddr_t		start_block;
2380 
2381 	if ((start_block = metagetstart(sp, np, ep)) != MD_DISKADDR_ERROR)
2382 		start_block += MD_SP_START;
2383 
2384 	return (start_block);
2385 }
2386 
2387 /*
2388  * FUNCTION:	meta_sp_update_wm()
2389  * INPUT:	sp	- the operating set
2390  *		msp	- a pointer to the XDR unit structure
2391  *		extlist	- the extent list specifying watermarks to update
2392  * OUTPUT:	ep	- return error pointer
2393  * RETURNS:	int	- -1 if error, 0 on success
2394  * PURPOSE:	steps backwards through the extent list updating
2395  *		watermarks for all extents with the EXTFLG_UPDATE flag
2396  *		set.  Writing the watermarks guarantees consistency when
2397  *		extents must be broken into pieces since the original
2398  *		watermark will be the last to be updated, and will be
2399  *		changed to point to a new watermark that is already
2400  *		known to be consistent.  If one of the writes fails, the
2401  *		original watermark stays intact and none of the changes
2402  *		are realized.
2403  */
2404 static int
2405 meta_sp_update_wm(
2406 	mdsetname_t	*sp,
2407 	md_sp_t		*msp,
2408 	sp_ext_node_t	*extlist,
2409 	md_error_t	*ep
2410 )
2411 {
2412 	sp_ext_node_t	*ext;
2413 	sp_ext_node_t	*tail;
2414 	mp_watermark_t	*wmp, *watermarks;
2415 	xsp_offset_t	*osp, *offsets;
2416 	int		update_count = 0;
2417 	int		rval = 0;
2418 	md_unit_t	*mdp;
2419 	md_sp_update_wm_t	update_params;
2420 
2421 	if (getenv(META_SP_DEBUG)) {
2422 		meta_sp_debug("meta_sp_update_wm: Updating watermarks:\n");
2423 		meta_sp_list_dump(extlist);
2424 	}
2425 
2426 	/*
2427 	 * find the last node so we can write the watermarks backwards
2428 	 * and count watermarks to update so we can allocate space
2429 	 */
2430 	for (ext = extlist; ext != NULL; ext = ext->ext_next) {
2431 		if ((ext->ext_flags & EXTFLG_UPDATE) != 0) {
2432 			update_count++;
2433 		}
2434 
2435 		if (ext->ext_next == NULL) {
2436 			tail = ext;
2437 		}
2438 	}
2439 	ext = tail;
2440 
2441 	wmp = watermarks =
2442 	    Zalloc(update_count * sizeof (mp_watermark_t));
2443 	osp = offsets =
2444 	    Zalloc(update_count * sizeof (sp_ext_offset_t));
2445 
2446 	while (ext != NULL) {
2447 		if ((ext->ext_flags & EXTFLG_UPDATE) != 0) {
2448 			/* update watermark */
2449 			wmp->wm_magic = MD_SP_MAGIC;
2450 			wmp->wm_version = MD_SP_VERSION;
2451 			wmp->wm_type = ext->ext_type;
2452 			wmp->wm_seq = ext->ext_seq;
2453 			wmp->wm_length = ext->ext_length - MD_SP_WMSIZE;
2454 
2455 			/* fill in the volume name and set name */
2456 			if (ext->ext_namep != NULL)
2457 				(void) strcpy(wmp->wm_mdname,
2458 				    ext->ext_namep->cname);
2459 			else
2460 				(void) strcpy(wmp->wm_mdname, MD_SP_FREEWMNAME);
2461 			if (ext->ext_setp != NULL &&
2462 			    ext->ext_setp->setno != MD_LOCAL_SET)
2463 				(void) strcpy(wmp->wm_setname,
2464 				    ext->ext_setp->setname);
2465 			else
2466 				(void) strcpy(wmp->wm_setname,
2467 				    MD_SP_LOCALSETNAME);
2468 
2469 			/* Generate the checksum */
2470 			wmp->wm_checksum = 0;
2471 			crcgen((uchar_t *)wmp, (uint_t *)&wmp->wm_checksum,
2472 			    sizeof (*wmp), NULL);
2473 
2474 			/* record the extent offset */
2475 			*osp = ext->ext_offset;
2476 
2477 			/* Advance the placeholders */
2478 			osp++; wmp++;
2479 		}
2480 		ext = ext->ext_prev;
2481 	}
2482 
2483 	mdp = meta_get_mdunit(sp, msp->common.namep, ep);
2484 	if (mdp == NULL) {
2485 		rval = -1;
2486 		goto out;
2487 	}
2488 
2489 	(void) memset(&update_params, 0, sizeof (update_params));
2490 	update_params.mnum = MD_SID(mdp);
2491 	update_params.count = update_count;
2492 	update_params.wmp = (uintptr_t)watermarks;
2493 	update_params.osp = (uintptr_t)offsets;
2494 	MD_SETDRIVERNAME(&update_params, MD_SP,
2495 	    MD_MIN2SET(update_params.mnum));
2496 
2497 	if (metaioctl(MD_IOC_SPUPDATEWM, &update_params,
2498 	    &update_params.mde, msp->common.namep->cname) != 0) {
2499 		(void) mdstealerror(ep, &update_params.mde);
2500 		rval = -1;
2501 		goto out;
2502 	}
2503 
2504 out:
2505 	Free(watermarks);
2506 	Free(offsets);
2507 
2508 	return (rval);
2509 }
2510 
2511 /*
2512  * FUNCTION:	meta_sp_clear_wm()
2513  * INPUT:	sp	- the operating set
2514  *		msp	- the unit structure for the soft partition to clear
2515  * OUTPUT:	ep	- return error pointer
2516  * RETURNS:	int	- -1 if error, 0 on success
2517  * PURPOSE:	steps through the extents for a soft partition unit and
2518  *		creates an extent list designed to mark all of the
2519  *		watermarks for those extents as free.  The extent list
2520  *		is then passed to meta_sp_update_wm() to actually write
2521  *		the watermarks out.
2522  */
2523 static int
2524 meta_sp_clear_wm(
2525 	mdsetname_t	*sp,
2526 	md_sp_t		*msp,
2527 	md_error_t	*ep
2528 )
2529 {
2530 	sp_ext_node_t	*extlist = NULL;
2531 	int		numexts = msp->ext.ext_len;
2532 	uint_t		i;
2533 	int		rval = 0;
2534 
2535 	/* for each watermark must set the flag to SP_FREE */
2536 	for (i = 0; i < numexts; i++) {
2537 		md_sp_ext_t	*extp = &msp->ext.ext_val[i];
2538 
2539 		meta_sp_list_insert(NULL, NULL, &extlist,
2540 		    extp->poff - MD_SP_WMSIZE, extp->len + MD_SP_WMSIZE,
2541 		    EXTTYP_FREE, 0, EXTFLG_UPDATE, meta_sp_cmp_by_offset);
2542 	}
2543 
2544 	/* update watermarks */
2545 	rval = meta_sp_update_wm(sp, msp, extlist, ep);
2546 
2547 	meta_sp_list_free(&extlist);
2548 	return (rval);
2549 }
2550 
2551 /*
2552  * FUNCTION:	meta_sp_read_wm()
2553  * INPUT:	sp	- setname for component
2554  *		compnp	- mdname_t for component
2555  *		offset	- the offset of the watermark to read (sectors)
2556  * OUTPUT:	wm	- the watermark structure to read into
2557  *		ep	- return error pointer
2558  * RETURNS:	int	- -1 if error, 0 on success
2559  * PURPOSE:	seeks out to the requested offset and reads a watermark.
2560  *		It then verifies that the magic number is correct and
2561  *		that the checksum is valid, returning an error if either
2562  *		is wrong.
2563  */
2564 static int
2565 meta_sp_read_wm(
2566 	mdsetname_t	*sp,
2567 	mdname_t	*compnp,
2568 	mp_watermark_t	*wm,
2569 	sp_ext_offset_t	offset,
2570 	md_error_t	*ep
2571 )
2572 {
2573 	md_sp_read_wm_t	read_params;
2574 
2575 	/*
2576 	 * make sure block offset does not overflow 2^64 bytes and it's a
2577 	 * multiple of the block size.
2578 	 */
2579 	assert(offset <= (1LL << (64 - DEV_BSHIFT)));
2580 	/* LINTED */
2581 	assert((sizeof (*wm) % DEV_BSIZE) == 0);
2582 
2583 	(void) memset(wm, 0, sizeof (*wm));
2584 
2585 	(void) memset(&read_params, 0, sizeof (read_params));
2586 	read_params.rdev = compnp->dev;
2587 	read_params.wmp = (uintptr_t)wm;
2588 	read_params.offset = offset;
2589 	MD_SETDRIVERNAME(&read_params, MD_SP, sp->setno);
2590 
2591 	if (metaioctl(MD_IOC_SPREADWM, &read_params,
2592 	    &read_params.mde, compnp->cname) != 0) {
2593 
2594 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
2595 		    "Extent header read failed, block %llu.\n"), offset);
2596 		return (mdstealerror(ep, &read_params.mde));
2597 	}
2598 
2599 	/* make sure magic number is correct */
2600 	if (wm->wm_magic != MD_SP_MAGIC) {
2601 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
2602 		    "found incorrect magic number %x, expected %x.\n"),
2603 		    wm->wm_magic, MD_SP_MAGIC);
2604 		/*
2605 		 * Pass NULL for the device name as we don't have
2606 		 * valid watermark contents.
2607 		 */
2608 		return (mdmderror(ep, MDE_SP_BADWMMAGIC, 0, NULL));
2609 	}
2610 
2611 	if (crcchk((uchar_t *)wm, (uint_t *)&wm->wm_checksum,
2612 	    sizeof (*wm), NULL)) {
2613 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
2614 		    "found incorrect checksum %x.\n"),
2615 		    wm->wm_checksum);
2616 		return (mdmderror(ep, MDE_SP_BADWMCRC, 0, wm->wm_mdname));
2617 	}
2618 
2619 	return (0);
2620 }
2621 
2622 /*
2623  * **************************************************************************
2624  *                  Query Functions
2625  * **************************************************************************
2626  */
2627 
2628 /*
2629  * IMPORTANT NOTE: This is a static function that assumes that
2630  *		   its input parameters have been checked and
2631  *		   have valid values that lie within acceptable
2632  *		   ranges.
2633  *
2634  * FUNCTION:	meta_sp_enough_space()
2635  * INPUT:	desired_number_of_sps - the number of soft partitions desired;
2636  *					must be > 0
2637  *		desired_sp_size - the desired soft partition size in blocks;
2638  *				  must be > 0
2639  *		extent_listpp - a reference to a reference to an extent
2640  *				list that lists the extents on a device;
2641  *				must be a reference to a reference to a
2642  *				valid extent list
2643  *		alignment - the desired data space alignment for the sp's
2644  * OUTPUT:	boolean_t return value
2645  * RETURNS:	boolean_t - B_TRUE if there's enough space in the extent
2646  *			    list to create the desired soft partitions,
2647  *			    B_FALSE if there's not enough space
2648  * PURPOSE:	determines whether there's enough free space in an extent
2649  *		list to allow creation of a set of soft partitions
2650  */
2651 static boolean_t
2652 meta_sp_enough_space(
2653 	int		desired_number_of_sps,
2654 	blkcnt_t	desired_sp_size,
2655 	sp_ext_node_t	**extent_listpp,
2656 	sp_ext_length_t	alignment
2657 )
2658 {
2659 	boolean_t		enough_space;
2660 	int			number_of_sps;
2661 	int			number_of_extents_used;
2662 	sp_ext_length_t		desired_ext_length = desired_sp_size;
2663 
2664 	enough_space = B_TRUE;
2665 	number_of_sps = 0;
2666 	while ((enough_space == B_TRUE) &&
2667 		(number_of_sps < desired_number_of_sps)) {
2668 		/*
2669 		 * Use the extent allocation algorithm implemented by
2670 		 * meta_sp_alloc_by_len() to test whether the free
2671 		 * extents in the extent list referenced by *extent_listpp
2672 		 * contain enough space to accomodate a soft partition
2673 		 * of size desired_ext_length.
2674 		 *
2675 		 * Repeat the test <desired_number_of_sps> times
2676 		 * or until it fails, whichever comes first,
2677 		 * each time allocating the extents required to
2678 		 * create the soft partition without actually
2679 		 * creating the soft partition.
2680 		 */
2681 		number_of_extents_used = meta_sp_alloc_by_len(
2682 						TEST_SETNAMEP,
2683 						TEST_SOFT_PARTITION_NAMEP,
2684 						extent_listpp,
2685 						&desired_ext_length,
2686 						NO_OFFSET,
2687 						alignment);
2688 		if (number_of_extents_used == -1) {
2689 			enough_space = B_FALSE;
2690 		} else {
2691 			number_of_sps++;
2692 		}
2693 	}
2694 	return (enough_space);
2695 }
2696 
2697 /*
2698  * IMPORTANT NOTE: This is a static function that calls other functions
2699  *		   that check its mdsetnamep and device_mdnamep
2700  *		   input parameters, but expects extent_listpp to
2701  *		   be a initialized to a valid address to which
2702  *		   it can write a reference to the extent list that
2703  *		   it creates.
2704  *
2705  * FUNCTION:	meta_sp_get_extent_list()
2706  * INPUT:	mdsetnamep - a reference to the mdsetname_t structure
2707  *			     for the set containing the device for
2708  *			     which the extents are to be listed
2709  *		device_mdnamep - a reference to the mdname_t structure
2710  *				 for the device for which the extents
2711  *				 are to be listed
2712  * OUTPUT:	*extent_listpp - a reference to the extent list for
2713  *				 the device; NULL if the function fails
2714  *		*ep - the libmeta error encountered, if any
2715  * RETURNS:	boolean_t - B_TRUE if the function call was successful,
2716  *			    B_FALSE if not
2717  * PURPOSE:	gets the extent list for a device
2718  */
2719 static boolean_t
2720 meta_sp_get_extent_list(
2721 	mdsetname_t	*mdsetnamep,
2722 	mdname_t	*device_mdnamep,
2723 	sp_ext_node_t	**extent_listpp,
2724 	md_error_t	*ep
2725 )
2726 {
2727 	diskaddr_t		device_size_in_blocks;
2728 	mdnamelist_t		*sp_name_listp;
2729 	diskaddr_t		start_block_address_in_blocks;
2730 
2731 	*extent_listpp = NULL;
2732 	sp_name_listp = NULL;
2733 
2734 	start_block_address_in_blocks = meta_sp_get_start(mdsetnamep,
2735 						device_mdnamep,
2736 						ep);
2737 	if (start_block_address_in_blocks == MD_DISKADDR_ERROR) {
2738 	    if (getenv(META_SP_DEBUG)) {
2739 		mde_perror(ep, "meta_sp_get_extent_list:meta_sp_get_start");
2740 	    }
2741 	    return (B_FALSE);
2742 	}
2743 
2744 	device_size_in_blocks = metagetsize(device_mdnamep, ep);
2745 	if (device_size_in_blocks == MD_DISKADDR_ERROR) {
2746 	    if (getenv(META_SP_DEBUG)) {
2747 		mde_perror(ep,
2748 		    "meta_sp_get_extent_list:metagetsize");
2749 	    }
2750 	    return (B_FALSE);
2751 	}
2752 
2753 	/*
2754 	 * Sanity check: the start block will have skipped an integer
2755 	 * number of cylinders, C.  C will usually be zero.  If (C > 0),
2756 	 * and the disk slice happens to only be C cylinders in total
2757 	 * size, we'll fail this check.
2758 	 */
2759 	if (device_size_in_blocks <=
2760 	    (start_block_address_in_blocks + MD_SP_WMSIZE)) {
2761 	    (void) mdmderror(ep, MDE_SP_NOSPACE, 0, device_mdnamep->cname);
2762 	    return (B_FALSE);
2763 	}
2764 
2765 	/*
2766 	 * After this point, we will have allocated resources, so any
2767 	 * failure returns must be through the supplied "fail" label
2768 	 * to properly deallocate things.
2769 	 */
2770 
2771 	/*
2772 	 * Create an empty extent list that starts one watermark past
2773 	 * the start block of the device and ends one watermark before
2774 	 * the end of the device.
2775 	 */
2776 	meta_sp_list_insert(TEST_SETNAMEP,
2777 			    TEST_SOFT_PARTITION_NAMEP,
2778 			    extent_listpp,
2779 			    NO_OFFSET,
2780 			    (sp_ext_length_t)start_block_address_in_blocks,
2781 			    EXTTYP_RESERVED,
2782 			    NO_SEQUENCE_NUMBER,
2783 			    NO_FLAGS,
2784 			    meta_sp_cmp_by_offset);
2785 	meta_sp_list_insert(TEST_SETNAMEP,
2786 			    TEST_SOFT_PARTITION_NAMEP,
2787 			    extent_listpp,
2788 			    (sp_ext_offset_t)(device_size_in_blocks -
2789 				MD_SP_WMSIZE),
2790 			    MD_SP_WMSIZE,
2791 			    EXTTYP_END,
2792 			    NO_SEQUENCE_NUMBER,
2793 			    NO_FLAGS,
2794 			    meta_sp_cmp_by_offset);
2795 
2796 	/*
2797 	 * Get the list of soft partitions that are already on the
2798 	 * device.
2799 	 */
2800 	if (meta_sp_get_by_component(mdsetnamep, device_mdnamep,
2801 	    &sp_name_listp, FORCE_RELOAD_CACHE, ep) < 1) {
2802 		if (getenv(META_SP_DEBUG)) {
2803 			mde_perror(ep,
2804 			    "meta_sp_get_extent_list:meta_sp_get_by_component");
2805 		}
2806 		goto fail;
2807 	}
2808 
2809 	if (sp_name_listp != NULL) {
2810 		/*
2811 		 * If there are soft partitions on the device, add the
2812 		 * extents used in them to the extent list.
2813 		 */
2814 		if (meta_sp_extlist_from_namelist(mdsetnamep, sp_name_listp,
2815 		    extent_listpp, ep) == -1) {
2816 			if (getenv(META_SP_DEBUG)) {
2817 				mde_perror(ep, "meta_sp_get_extent_list:"
2818 				    "meta_sp_extlist_from_namelist");
2819 			}
2820 			goto fail;
2821 		}
2822 		metafreenamelist(sp_name_listp);
2823 	}
2824 
2825 	/*
2826 	 * Add free extents to the extent list to represent
2827 	 * the remaining regions of free space on the
2828 	 * device.
2829 	 */
2830 	meta_sp_list_freefill(extent_listpp, device_size_in_blocks);
2831 	return (B_TRUE);
2832 
2833 fail:
2834 	if (sp_name_listp != NULL) {
2835 		metafreenamelist(sp_name_listp);
2836 	}
2837 
2838 	if (*extent_listpp != NULL) {
2839 		/*
2840 		 * meta_sp_list_free sets *extent_listpp to NULL.
2841 		 */
2842 		meta_sp_list_free(extent_listpp);
2843 	}
2844 	return (B_FALSE);
2845 }
2846 
2847 /*
2848  * IMPORTANT NOTE: This is a static function that calls other functions
2849  *		   that check its mdsetnamep and mddrivenamep
2850  *		   input parameters, but expects extent_listpp to
 *		   be initialized to a valid address to which
2852  *		   it can write a reference to the extent list that
2853  *		   it creates.
2854  *
2855  * FUNCTION:	meta_sp_get_extent_list_for_drive()
2856  * INPUT:	mdsetnamep - a reference to the mdsetname_t structure
2857  *			     for the set containing the drive for
2858  *			     which the extents are to be listed
2859  *		mddrivenamep   - a reference to the mddrivename_t structure
2860  *				 for the drive for which the extents
2861  *				 are to be listed
2862  * OUTPUT:	*extent_listpp - a reference to the extent list for
2863  *				 the drive; NULL if the function fails
2864  * RETURNS:	boolean_t - B_TRUE if the function call was successful,
2865  *			    B_FALSE if not
2866  * PURPOSE:	gets the extent list for a drive when the entire drive
2867  *		is to be soft partitioned
2868  */
2869 static boolean_t
2870 meta_sp_get_extent_list_for_drive(
2871 	mdsetname_t	*mdsetnamep,
2872 	mddrivename_t	*mddrivenamep,
2873 	sp_ext_node_t	**extent_listpp
2874 )
2875 {
2876 	boolean_t		can_use;
2877 	diskaddr_t		free_space;
2878 	md_error_t		mderror;
2879 	mdvtoc_t		proposed_vtoc;
2880 	int			repartition_options;
2881 	int			return_value;
2882 	md_sp_t			test_sp_struct;
2883 
2884 	can_use = B_TRUE;
2885 	*extent_listpp = NULL;
2886 	mderror = mdnullerror;
2887 	test_sp_struct.compnamep = metaslicename(mddrivenamep, MD_SLICE0,
2888 					&mderror);
2889 	if (test_sp_struct.compnamep == NULL) {
2890 		can_use = B_FALSE;
2891 	}
2892 
2893 	if (can_use == B_TRUE) {
2894 		mderror = mdnullerror;
2895 		repartition_options = 0;
2896 		return_value = meta_check_sp(mdsetnamep, &test_sp_struct,
2897 				MDCMD_USE_WHOLE_DISK, &repartition_options,
2898 				&mderror);
2899 		if (return_value != 0) {
2900 			can_use = B_FALSE;
2901 		}
2902 	}
2903 
2904 	if (can_use == B_TRUE) {
2905 		mderror = mdnullerror;
2906 		repartition_options = repartition_options |
2907 			(MD_REPART_FORCE | MD_REPART_DONT_LABEL);
2908 		return_value = meta_repartition_drive(mdsetnamep, mddrivenamep,
2909 				repartition_options, &proposed_vtoc, &mderror);
2910 		if (return_value != 0) {
2911 			can_use = B_FALSE;
2912 		}
2913 	}
2914 
2915 	if (can_use == B_TRUE) {
2916 		free_space = proposed_vtoc.parts[MD_SLICE0].size;
2917 		if (free_space <= (MD_SP_START + MD_SP_WMSIZE)) {
2918 			can_use = B_FALSE;
2919 		}
2920 	}
2921 
2922 	if (can_use == B_TRUE) {
2923 		/*
2924 		 * Create an extent list that starts with
2925 		 * a reserved extent that ends at the start
2926 		 * of the usable space on slice zero of the
2927 		 * proposed VTOC, ends with an extent that
2928 		 * reserves space for a watermark at the end
2929 		 * of slice zero, and contains a single free
2930 		 * extent that occupies the rest of the space
2931 		 * on the slice.
2932 		 *
2933 		 * NOTE:
2934 		 *
2935 		 * Don't use metagetstart() or metagetsize() to
2936 		 * find the usable space.  They query the mdname_t
2937 		 * structure that represents an actual device to
2938 		 * determine the amount of space on the device that
2939 		 * contains metadata and the total amount of space
2940 		 * on the device.  Since this function creates a
2941 		 * proposed extent list that doesn't reflect the
2942 		 * state of an actual device, there's no mdname_t
2943 		 * structure to be queried.
2944 		 *
2945 		 * When a drive is reformatted to prepare for
2946 		 * soft partitioning, all of slice seven is
2947 		 * reserved for metadata, all of slice zero is
2948 		 * available for soft partitioning, and all other
2949 		 * slices on the drive are empty.  The proposed
2950 		 * extent list for the drive therefore contains
2951 		 * only three extents: a reserved extent that ends
2952 		 * at the start of the usable space on slice zero,
2953 		 * a single free extent that occupies all the usable
2954 		 * space on slice zero, and an ending extent that
2955 		 * reserves space for a watermark at the end of
2956 		 * slice zero.
2957 		 */
2958 		meta_sp_list_insert(TEST_SETNAMEP,
2959 			TEST_SOFT_PARTITION_NAMEP,
2960 			extent_listpp,
2961 			NO_OFFSET,
2962 			(sp_ext_length_t)(MD_SP_START),
2963 			EXTTYP_RESERVED,
2964 			NO_SEQUENCE_NUMBER,
2965 			NO_FLAGS,
2966 			meta_sp_cmp_by_offset);
2967 		meta_sp_list_insert(TEST_SETNAMEP,
2968 			TEST_SOFT_PARTITION_NAMEP,
2969 			extent_listpp,
2970 			(sp_ext_offset_t)(free_space - MD_SP_WMSIZE),
2971 			MD_SP_WMSIZE,
2972 			EXTTYP_END,
2973 			NO_SEQUENCE_NUMBER,
2974 			NO_FLAGS,
2975 			meta_sp_cmp_by_offset);
2976 		meta_sp_list_freefill(extent_listpp, free_space);
2977 	}
2978 	return (can_use);
2979 }
2980 
2981 /*
2982  * FUNCTION:	meta_sp_can_create_sps()
2983  * INPUT:	mdsetnamep - a reference to the mdsetname_t structure
2984  *			     for the set containing the device for
2985  *			     which the extents are to be listed
2986  *		mdnamep - a reference to the mdname_t of the device
2987  *			  on which the soft parititions are to be created
2988  *		number_of_sps - the desired number of soft partitions
2989  *		sp_size - the desired soft partition size
2990  * OUTPUT:	boolean_t return value
2991  * RETURNS:	boolean_t - B_TRUE if the soft partitionns can be created,
2992  *			    B_FALSE if not
2993  * PURPOSE:	determines whether a set of soft partitions can be created
2994  *		on a device
2995  */
2996 boolean_t
2997 meta_sp_can_create_sps(
2998 	mdsetname_t	*mdsetnamep,
2999 	mdname_t	*mdnamep,
3000 	int		number_of_sps,
3001 	blkcnt_t	sp_size
3002 )
3003 {
3004 	sp_ext_node_t	*extent_listp;
3005 	boolean_t	succeeded;
3006 	md_error_t	mde;
3007 
3008 	if ((number_of_sps > 0) && (sp_size > 0)) {
3009 		succeeded = meta_sp_get_extent_list(mdsetnamep, mdnamep,
3010 						    &extent_listp, &mde);
3011 	} else {
3012 		succeeded = B_FALSE;
3013 	}
3014 
3015 	/*
3016 	 * We don't really care about an error return from the
3017 	 * alignment call; that will just result in passing zero,
3018 	 * which will be interpreted as no alignment.
3019 	 */
3020 
3021 	if (succeeded == B_TRUE) {
3022 		succeeded = meta_sp_enough_space(number_of_sps,
3023 		    sp_size, &extent_listp,
3024 		    meta_sp_get_default_alignment(mdsetnamep, mdnamep, &mde));
3025 		meta_sp_list_free(&extent_listp);
3026 	}
3027 	return (succeeded);
3028 }
3029 
3030 /*
3031  * FUNCTION:	meta_sp_can_create_sps_on_drive()
3032  * INPUT:	mdsetnamep - a reference to the mdsetname_t structure
3033  *			     for the set containing the drive for
3034  *			     which the extents are to be listed
3035  *		mddrivenamep - a reference to the mddrivename_t of the drive
3036  *			       on which the soft parititions are to be created
3037  *		number_of_sps - the desired number of soft partitions
3038  *		sp_size - the desired soft partition size
3039  * OUTPUT:	boolean_t return value
3040  * RETURNS:	boolean_t - B_TRUE if the soft partitionns can be created,
3041  *			    B_FALSE if not
3042  * PURPOSE:	determines whether a set of soft partitions can be created
3043  *		on a drive if the entire drive is soft partitioned
3044  */
3045 boolean_t
3046 meta_sp_can_create_sps_on_drive(
3047 	mdsetname_t	*mdsetnamep,
3048 	mddrivename_t	*mddrivenamep,
3049 	int		number_of_sps,
3050 	blkcnt_t	sp_size
3051 )
3052 {
3053 	sp_ext_node_t	*extent_listp;
3054 	boolean_t	succeeded;
3055 
3056 	if ((number_of_sps > 0) && (sp_size > 0)) {
3057 		succeeded = meta_sp_get_extent_list_for_drive(mdsetnamep,
3058 							mddrivenamep,
3059 							&extent_listp);
3060 	} else {
3061 		succeeded = B_FALSE;
3062 	}
3063 
3064 	/*
3065 	 * We don't care about alignment on the space call because
3066 	 * we're specifically dealing with a drive, which will have no
3067 	 * inherent alignment.
3068 	 */
3069 
3070 	if (succeeded == B_TRUE) {
3071 		succeeded = meta_sp_enough_space(number_of_sps, sp_size,
3072 		    &extent_listp, SP_UNALIGNED);
3073 		meta_sp_list_free(&extent_listp);
3074 	}
3075 	return (succeeded);
3076 }
3077 
3078 /*
3079  * FUNCTION:	meta_sp_get_free_space()
3080  * INPUT:	mdsetnamep - a reference to the mdsetname_t structure
3081  *			     for the set containing the device for
3082  *			     which the free space is to be returned
3083  *		mdnamep - a reference to the mdname_t of the device
3084  *			  for which the free space is to be returned
3085  * OUTPUT:	blkcnt_t return value
3086  * RETURNS:	blkcnt_t - the number of blocks of free space on the device
3087  * PURPOSE:	returns the number of blocks of free space on a device
3088  */
3089 blkcnt_t
3090 meta_sp_get_free_space(
3091 	mdsetname_t	*mdsetnamep,
3092 	mdname_t	*mdnamep
3093 )
3094 {
3095 	sp_ext_node_t		*extent_listp;
3096 	sp_ext_length_t		free_blocks;
3097 	boolean_t		succeeded;
3098 	md_error_t		mde;
3099 
3100 	extent_listp = NULL;
3101 	free_blocks = 0;
3102 	succeeded = meta_sp_get_extent_list(mdsetnamep, mdnamep,
3103 					    &extent_listp, &mde);
3104 	if (succeeded == B_TRUE) {
3105 		free_blocks = meta_sp_list_size(extent_listp,
3106 		    EXTTYP_FREE, INCLUDE_WM);
3107 		meta_sp_list_free(&extent_listp);
3108 		if (free_blocks > (10 * MD_SP_WMSIZE)) {
3109 			/*
3110 			 * Subtract a safety margin for watermarks when
3111 			 * computing the number of blocks available for
3112 			 * use.  The actual number of watermarks can't
3113 			 * be calculated without knowing the exact numbers
3114 			 * and sizes of both the free extents and the soft
3115 			 * partitions to be created.  The calculation is
3116 			 * highly complex and error-prone even if those
3117 			 * quantities are known.  The approximate value
3118 			 * 10 * MD_SP_WMSIZE is within a few blocks of the
3119 			 * correct value in all practical cases.
3120 			 */
3121 			free_blocks = free_blocks - (10 * MD_SP_WMSIZE);
3122 		} else {
3123 			free_blocks = 0;
3124 		}
3125 	} else {
3126 	    mdclrerror(&mde);
3127 	}
3128 
3129 	return (free_blocks);
3130 }
3131 
3132 /*
3133  * FUNCTION:	meta_sp_get_free_space_on_drive()
3134  * INPUT:	mdsetnamep - a reference to the mdsetname_t structure
3135  *			     for the set containing the drive for
3136  *			     which the free space is to be returned
3137  *		mddrivenamep - a reference to the mddrivename_t of the drive
3138  *			       for which the free space is to be returned
3139  * OUTPUT:	blkcnt_t return value
3140  * RETURNS:	blkcnt_t - the number of blocks of free space on the drive
3141  * PURPOSE:	returns the number of blocks of space usable for soft
3142  *		partitions on an entire drive, if the entire drive is
3143  *		soft partitioned
3144  */
3145 blkcnt_t
3146 meta_sp_get_free_space_on_drive(
3147 	mdsetname_t	*mdsetnamep,
3148 	mddrivename_t	*mddrivenamep
3149 )
3150 {
3151 	sp_ext_node_t		*extent_listp;
3152 	sp_ext_length_t		free_blocks;
3153 	boolean_t		succeeded;
3154 
3155 	extent_listp = NULL;
3156 	free_blocks = 0;
3157 	succeeded = meta_sp_get_extent_list_for_drive(mdsetnamep,
3158 			mddrivenamep, &extent_listp);
3159 	if (succeeded == B_TRUE) {
3160 		free_blocks = meta_sp_list_size(extent_listp,
3161 		    EXTTYP_FREE, INCLUDE_WM);
3162 		meta_sp_list_free(&extent_listp);
3163 		if (free_blocks > (10 * MD_SP_WMSIZE)) {
3164 			/*
3165 			 * Subtract a safety margin for watermarks when
3166 			 * computing the number of blocks available for
3167 			 * use.  The actual number of watermarks can't
3168 			 * be calculated without knowing the exact numbers
3169 			 * and sizes of both the free extents and the soft
3170 			 * partitions to be created.  The calculation is
3171 			 * highly complex and error-prone even if those
3172 			 * quantities are known.  The approximate value
3173 			 * 10 * MD_SP_WMSIZE is within a few blocks of the
3174 			 * correct value in all practical cases.
3175 			 */
3176 			free_blocks = free_blocks - (10 * MD_SP_WMSIZE);
3177 		} else {
3178 			free_blocks = 0;
3179 		}
3180 	}
3181 	return (free_blocks);
3182 }
3183 
3184 /*
3185  * FUNCTION:	meta_sp_get_number_of_possible_sps()
3186  * INPUT:	mdsetnamep - a reference to the mdsetname_t structure
3187  *			     for the set containing the device for
3188  *			     which the number of possible soft partitions
3189  *			     is to be returned
3190  *		mdnamep - a reference to the mdname_t of the device
3191  *			  for which the number of possible soft partitions
3192  *			  is to be returned
3193  * OUTPUT:	int return value
3194  * RETURNS:	int - the number of soft partitions of the desired size
3195  *		      that can be created on the device
3196  * PURPOSE:	returns the number of soft partitions of a given size
3197  *		that can be created on a device
3198  */
3199 int
3200 meta_sp_get_number_of_possible_sps(
3201 	mdsetname_t	*mdsetnamep,
3202 	mdname_t	*mdnamep,
3203 	blkcnt_t	sp_size
3204 )
3205 {
3206 	sp_ext_node_t	*extent_listp;
3207 	int		number_of_possible_sps;
3208 	boolean_t	succeeded;
3209 	md_error_t	mde;
3210 	sp_ext_length_t	alignment;
3211 
3212 	extent_listp = NULL;
3213 	number_of_possible_sps = 0;
3214 	if (sp_size > 0) {
3215 	    if ((succeeded = meta_sp_get_extent_list(mdsetnamep,
3216 		mdnamep, &extent_listp, &mde)) == B_FALSE)
3217 		mdclrerror(&mde);
3218 	} else {
3219 		succeeded = B_FALSE;
3220 	}
3221 
3222 	if (succeeded == B_TRUE) {
3223 		alignment = meta_sp_get_default_alignment(mdsetnamep,
3224 		    mdnamep, &mde);
3225 	}
3226 
3227 	while (succeeded == B_TRUE) {
3228 		/*
3229 		 * Keep allocating space from the extent list
3230 		 * for soft partitions of the desired size until
3231 		 * there's not enough free space left in the list
3232 		 * for another soft partiition of that size.
3233 		 * Add one to the number of possible soft partitions
3234 		 * for each soft partition for which there is
3235 		 * enough free space left.
3236 		 */
3237 		succeeded = meta_sp_enough_space(ONE_SOFT_PARTITION,
3238 		    sp_size, &extent_listp, alignment);
3239 		if (succeeded == B_TRUE) {
3240 			number_of_possible_sps++;
3241 		}
3242 	}
3243 	if (extent_listp != NULL) {
3244 		meta_sp_list_free(&extent_listp);
3245 	}
3246 	return (number_of_possible_sps);
3247 }
3248 
3249 /*
3250  * FUNCTION:	meta_sp_get_number_of_possible_sps_on_drive()
3251  * INPUT:	mdsetnamep - a reference to the mdsetname_t structure
3252  *			     for the set containing the drive for
3253  *			     which the number of possible soft partitions
3254  *			     is to be returned
3255  *		mddrivenamep - a reference to the mddrivename_t of the drive
3256  *			       for which the number of possible soft partitions
3257  *			       is to be returned
3258  *		sp_size - the size in blocks of the proposed soft partitions
3259  * OUTPUT:	int return value
3260  * RETURNS:	int - the number of soft partitions of the desired size
3261  *		      that can be created on the drive
3262  * PURPOSE:	returns the number of soft partitions of a given size
3263  *		that can be created on a drive, if the entire drive is
3264  *		soft partitioned
3265  */
3266 int
3267 meta_sp_get_number_of_possible_sps_on_drive(
3268 	mdsetname_t	*mdsetnamep,
3269 	mddrivename_t	*mddrivenamep,
3270 	blkcnt_t	sp_size
3271 )
3272 {
3273 	sp_ext_node_t	*extent_listp;
3274 	int		number_of_possible_sps;
3275 	boolean_t	succeeded;
3276 
3277 	extent_listp = NULL;
3278 	number_of_possible_sps = 0;
3279 	if (sp_size > 0) {
3280 		succeeded = meta_sp_get_extent_list_for_drive(mdsetnamep,
3281 					mddrivenamep, &extent_listp);
3282 	} else {
3283 		succeeded = B_FALSE;
3284 	}
3285 	while (succeeded == B_TRUE) {
3286 		/*
3287 		 * Keep allocating space from the extent list
3288 		 * for soft partitions of the desired size until
3289 		 * there's not enough free space left in the list
3290 		 * for another soft partition of that size.
3291 		 * Add one to the number of possible soft partitions
3292 		 * for each soft partition for which there is
3293 		 * enough free space left.
3294 		 *
3295 		 * Since it's a drive, not a metadevice, make no
3296 		 * assumptions about alignment.
3297 		 */
3298 		succeeded = meta_sp_enough_space(ONE_SOFT_PARTITION,
3299 		    sp_size, &extent_listp, SP_UNALIGNED);
3300 		if (succeeded == B_TRUE) {
3301 			number_of_possible_sps++;
3302 		}
3303 	}
3304 	if (extent_listp != NULL) {
3305 		meta_sp_list_free(&extent_listp);
3306 	}
3307 	return (number_of_possible_sps);
3308 }
3309 
3310 /*
3311  * FUNCTION:	meta_sp_get_possible_sp_size()
3312  * INPUT:	mdsetnamep - a reference to the mdsetname_t structure
3313  *			     for the set containing the device for
3314  *			     which the possible soft partition size
3315  *			     is to be returned
3316  *		mdnamep - a reference to the mdname_t of the device
3317  *			  for which the possible soft partition size
3318  *			  is to be returned
3319  *		number_of_sps - the desired number of soft partitions
3320  * OUTPUT:	blkcnt_t return value
3321  * RETURNS:	blkcnt_t - the possible soft partition size in blocks
3322  * PURPOSE:	returns the maximum possible size of each of a given number of
3323  *		soft partitions of equal size that can be created on a device
3324  */
3325 blkcnt_t
3326 meta_sp_get_possible_sp_size(
3327 	mdsetname_t	*mdsetnamep,
3328 	mdname_t	*mdnamep,
3329 	int		number_of_sps
3330 )
3331 {
3332 	blkcnt_t	free_blocks;
3333 	blkcnt_t	sp_size;
3334 	boolean_t	succeeded;
3335 
3336 	sp_size = 0;
3337 	if (number_of_sps > 0) {
3338 		free_blocks = meta_sp_get_free_space(mdsetnamep, mdnamep);
3339 		sp_size = free_blocks / number_of_sps;
3340 		succeeded = meta_sp_can_create_sps(mdsetnamep, mdnamep,
3341 						number_of_sps, sp_size);
3342 		while ((succeeded == B_FALSE) && (sp_size > 0)) {
3343 			/*
3344 			 * To compensate for space that may have been
3345 			 * occupied by watermarks, reduce sp_size by a
3346 			 * number of blocks equal to the number of soft
3347 			 * partitions desired, and test again to see
3348 			 * whether the desired number of soft partitions
3349 			 * can be created.
3350 			 */
3351 			sp_size = sp_size - ((blkcnt_t)number_of_sps);
3352 			succeeded = meta_sp_can_create_sps(mdsetnamep, mdnamep,
3353 							number_of_sps, sp_size);
3354 		}
3355 		if (sp_size < 0) {
3356 			sp_size = 0;
3357 		}
3358 	}
3359 	return (sp_size);
3360 }
3361 
3362 /*
3363  * FUNCTION:	meta_sp_get_possible_sp_size_on_drive()
3364  * INPUT:	mdsetnamep - a reference to the mdsetname_t structure
3365  *			     for the set containing the drive for
3366  *			     which the possible soft partition size
3367  *			     is to be returned
3368  *		mddrivenamep - a reference to the mddrivename_t of the drive
3369  *			       for which the possible soft partition size
3370  *			       is to be returned
3371  *		number_of_sps - the desired number of soft partitions
3372  * OUTPUT:	blkcnt_t return value
3373  * RETURNS:	blkcnt_t - the possible soft partition size in blocks
3374  * PURPOSE:	returns the maximum possible size of each of a given number of
3375  *		soft partitions of equal size that can be created on a drive
3376  *              if the entire drive is soft partitioned
3377  */
3378 blkcnt_t
3379 meta_sp_get_possible_sp_size_on_drive(
3380 	mdsetname_t	*mdsetnamep,
3381 	mddrivename_t	*mddrivenamep,
3382 	int		number_of_sps
3383 )
3384 {
3385 	blkcnt_t	free_blocks;
3386 	blkcnt_t	sp_size;
3387 	boolean_t	succeeded;
3388 
3389 	sp_size = 0;
3390 	if (number_of_sps > 0) {
3391 		free_blocks = meta_sp_get_free_space_on_drive(mdsetnamep,
3392 								mddrivenamep);
3393 		sp_size = free_blocks / number_of_sps;
3394 		succeeded = meta_sp_can_create_sps_on_drive(mdsetnamep,
3395 						mddrivenamep,
3396 						number_of_sps, sp_size);
3397 		while ((succeeded == B_FALSE) && (sp_size > 0)) {
3398 			/*
3399 			 * To compensate for space that may have been
3400 			 * occupied by watermarks, reduce sp_size by a
3401 			 * number of blocks equal to the number of soft
3402 			 * partitions desired, and test again to see
3403 			 * whether the desired number of soft partitions
3404 			 * can be created.
3405 			 */
3406 			sp_size = sp_size - ((blkcnt_t)number_of_sps);
3407 			succeeded = meta_sp_can_create_sps_on_drive(mdsetnamep,
3408 							mddrivenamep,
3409 							number_of_sps, sp_size);
3410 		}
3411 		if (sp_size < 0) {
3412 			sp_size = 0;
3413 		}
3414 	}
3415 	return (sp_size);
3416 }
3417 
3418 /*
3419  * **************************************************************************
3420  *                  Unit Structure Manipulation Functions                   *
3421  * **************************************************************************
3422  */
3423 
3424 /*
3425  * FUNCTION:	meta_sp_fillextarray()
3426  * INPUT:	mp	- the unit structure to fill
3427  *		extlist	- the list of extents to fill with
3428  * OUTPUT:	none
3429  * RETURNS:	void
3430  * PURPOSE:	fills in the unit structure extent list with the extents
3431  *		specified by extlist.  Only extents in extlist with the
3432  *		EXTFLG_UPDATE flag are changed in the unit structure,
3433  *		and the index into the unit structure is the sequence
3434  *		number in the extent list.  After all of the nodes have
3435  *		been updated the virtual offsets in the unit structure
3436  *		are updated to reflect the new lengths.
3437  */
3438 static void
3439 meta_sp_fillextarray(
3440 	mp_unit_t	*mp,
3441 	sp_ext_node_t	*extlist
3442 )
3443 {
3444 	int	i;
3445 	sp_ext_node_t	*ext;
3446 	sp_ext_offset_t	curvoff = 0LL;
3447 
3448 	assert(mp != NULL);
3449 
3450 	/* go through the allocation list and fill in our unit structure */
3451 	for (ext = extlist; ext != NULL; ext = ext->ext_next) {
3452 		if ((ext->ext_type == EXTTYP_ALLOC) &&
3453 		    (ext->ext_flags & EXTFLG_UPDATE) != 0) {
3454 			mp->un_ext[ext->ext_seq].un_poff =
3455 			    ext->ext_offset + MD_SP_WMSIZE;
3456 			mp->un_ext[ext->ext_seq].un_len =
3457 			    ext->ext_length - MD_SP_WMSIZE;
3458 		}
3459 	}
3460 
3461 	for (i = 0; i < mp->un_numexts; i++) {
3462 		assert(mp->un_ext[i].un_poff != 0);
3463 		assert(mp->un_ext[i].un_len  != 0);
3464 		mp->un_ext[i].un_voff = curvoff;
3465 		curvoff += mp->un_ext[i].un_len;
3466 	}
3467 }
3468 
3469 /*
3470  * FUNCTION:	meta_sp_createunit()
3471  * INPUT:	np	- the name of the device to create a unit structure for
3472  *		compnp	- the name of the device the soft partition is on
3473  *		extlist	- the extent list to populate the new unit with
3474  *		numexts	- the number of extents in the extent list
3475  *		len	- the total size of the soft partition (sectors)
3476  *		status	- the initial status of the unit structure
3477  * OUTPUT:	ep	- return error pointer
3478  * RETURNS:	mp_unit_t * - the new unit structure.
3479  * PURPOSE:	allocates and fills in a new soft partition unit
3480  *		structure to be passed to the soft partitioning driver
3481  *		for creation.
3482  */
3483 static mp_unit_t *
3484 meta_sp_createunit(
3485 	mdname_t	*np,
3486 	mdname_t	*compnp,
3487 	sp_ext_node_t	*extlist,
3488 	int		numexts,
3489 	sp_ext_length_t	len,
3490 	sp_status_t	status,
3491 	md_error_t	*ep
3492 )
3493 {
3494 	mp_unit_t	*mp;
3495 	uint_t		ms_size;
3496 
3497 	ms_size = (sizeof (*mp) - sizeof (mp->un_ext[0])) +
3498 	    (numexts * sizeof (mp->un_ext[0]));
3499 
3500 	mp = Zalloc(ms_size);
3501 
3502 	/* fill in fields in common unit structure */
3503 	mp->c.un_type = MD_METASP;
3504 	mp->c.un_size = ms_size;
3505 	MD_SID(mp) = meta_getminor(np->dev);
3506 	mp->c.un_total_blocks = len;
3507 	mp->c.un_actual_tb = len;
3508 
3509 	/* set up geometry */
3510 	(void) meta_sp_setgeom(np, compnp, mp, ep);
3511 
3512 	/* if we're building on metadevice we can't parent */
3513 	if (metaismeta(compnp))
3514 		MD_CAPAB(mp) = MD_CANT_PARENT;
3515 	else
3516 		MD_CAPAB(mp) = MD_CAN_PARENT;
3517 
3518 	/* fill soft partition-specific fields */
3519 	mp->un_dev = compnp->dev;
3520 	mp->un_key = compnp->key;
3521 
3522 	/* mdname_t start_blk field is not 64-bit! */
3523 	mp->un_start_blk = (sp_ext_offset_t)compnp->start_blk;
3524 	mp->un_status = status;
3525 	mp->un_numexts = numexts;
3526 	mp->un_length = len;
3527 
3528 	/* fill in the extent array */
3529 	meta_sp_fillextarray(mp, extlist);
3530 
3531 	return (mp);
3532 }
3533 
3534 /*
3535  * FUNCTION:	meta_sp_updateunit()
3536  * INPUT:	np       - name structure for the metadevice being updated
3537  *		old_un	 - the original unit structure that is being updated
3538  *		extlist	 - the extent list to populate the new unit with
3539  *		grow_len - the amount by which the partition is being grown
3540  *		numexts	 - the number of extents in the extent list
3541  *		ep       - return error pointer
3542  * OUTPUT:	none
3543  * RETURNS:	mp_unit_t * - the updated unit structure
3544  * PURPOSE:	allocates and fills in a new soft partition unit structure to
3545  *		be passed to the soft partitioning driver for creation.  The
3546  *		old unit structure is first copied in, and then the updated
3547  *		extents are changed in the new unit structure.  This is
3548  *		typically used when the size of an existing unit is changed.
3549  */
3550 static mp_unit_t *
3551 meta_sp_updateunit(
3552 	mdname_t	*np,
3553 	mp_unit_t	*old_un,
3554 	sp_ext_node_t	*extlist,
3555 	sp_ext_length_t	grow_len,
3556 	int		numexts,
3557 	md_error_t	*ep
3558 )
3559 {
3560 	mp_unit_t	*new_un;
3561 	sp_ext_length_t	new_len;
3562 	uint_t		new_size;
3563 
3564 	assert(old_un != NULL);
3565 	assert(extlist != NULL);
3566 
3567 	/* allocate new unit structure and copy in old unit */
3568 	new_size = (sizeof (*old_un) - sizeof (old_un->un_ext[0])) +
3569 	    ((old_un->un_numexts + numexts) * sizeof (old_un->un_ext[0]));
3570 	new_len = old_un->un_length + grow_len;
3571 	new_un = Zalloc(new_size);
3572 	bcopy(old_un, new_un, old_un->c.un_size);
3573 
3574 	/* update size and geometry information */
3575 	new_un->c.un_size = new_size;
3576 	new_un->un_length = new_len;
3577 	new_un->c.un_total_blocks = new_len;
3578 	new_un->c.un_actual_tb = new_len;
3579 	if (meta_adjust_geom((md_unit_t *)new_un, np,
3580 	    old_un->c.un_wr_reinstruct, old_un->c.un_rd_reinstruct,
3581 	    0, ep) != 0) {
3582 		Free(new_un);
3583 		return (NULL);
3584 	}
3585 
3586 	/* update extent information */
3587 	new_un->un_numexts += numexts;
3588 
3589 	meta_sp_fillextarray(new_un, extlist);
3590 
3591 	return (new_un);
3592 }
3593 
3594 /*
3595  * FUNCTION:	meta_get_sp()
3596  * INPUT:	sp	- the set name for the device to get
3597  *		np	- the name of the device to get
3598  * OUTPUT:	ep	- return error pointer
3599  * RETURNS:	md_sp_t * - the XDR unit structure for the soft partition
3600  * PURPOSE:	interface to the rest of libmeta for fetching a unit structure
3601  *		for the named device.  Just a wrapper for meta_get_sp_common().
3602  */
3603 md_sp_t *
3604 meta_get_sp(
3605 	mdsetname_t	*sp,
3606 	mdname_t	*np,
3607 	md_error_t	*ep
3608 )
3609 {
3610 	return (meta_get_sp_common(sp, np, 0, ep));
3611 }
3612 
3613 /*
3614  * FUNCTION:	meta_get_sp_common()
3615  * INPUT:	sp	- the set name for the device to get
3616  *		np	- the name of the device to get
3617  *		fast	- whether to use the cache or not (NOT IMPLEMENTED!)
3618  * OUTPUT:	ep	- return error pointer
3619  * RETURNS:	md_sp_t * - the XDR unit structure for the soft partition,
3620  *			    NULL if np is not a soft partition
3621  * PURPOSE:	common routine for fetching a soft partition unit structure
3622  */
3623 md_sp_t *
3624 meta_get_sp_common(
3625 	mdsetname_t	*sp,
3626 	mdname_t	*np,
3627 	int		fast,
3628 	md_error_t	*ep
3629 )
3630 {
3631 	mddrivename_t	*dnp = np->drivenamep;
3632 	char		*miscname;
3633 	mp_unit_t	*mp;
3634 	md_sp_t		*msp;
3635 	int		i;
3636 
3637 	/* must have set */
3638 	assert(sp != NULL);
3639 
3640 	/* short circuit */
3641 	if (dnp->unitp != NULL) {
3642 		if (dnp->unitp->type != MD_METASP)
3643 			return (NULL);
3644 		return ((md_sp_t *)dnp->unitp);
3645 	}
3646 	/* get miscname and unit */
3647 	if ((miscname = metagetmiscname(np, ep)) == NULL)
3648 		return (NULL);
3649 
3650 	if (strcmp(miscname, MD_SP) != 0) {
3651 		(void) mdmderror(ep, MDE_NOT_SP, 0, np->cname);
3652 		return (NULL);
3653 	}
3654 
3655 	if ((mp = (mp_unit_t *)meta_get_mdunit(sp, np, ep)) == NULL)
3656 		return (NULL);
3657 
3658 	assert(mp->c.un_type == MD_METASP);
3659 
3660 	/* allocate soft partition */
3661 	msp = Zalloc(sizeof (*msp));
3662 
3663 	/* get the common information */
3664 	msp->common.namep = np;
3665 	msp->common.type = mp->c.un_type;
3666 	msp->common.state = mp->c.un_status;
3667 	msp->common.capabilities = mp->c.un_capabilities;
3668 	msp->common.parent = mp->c.un_parent;
3669 	msp->common.size = mp->c.un_total_blocks;
3670 	msp->common.user_flags = mp->c.un_user_flags;
3671 	msp->common.revision = mp->c.un_revision;
3672 
3673 	/* get soft partition information */
3674 	if ((msp->compnamep = metakeyname(&sp, mp->un_key, fast, ep)) == NULL)
3675 		goto out;
3676 
3677 	/*
3678 	 * Fill in the key and the start block.  Note that the start
3679 	 * block in the unit structure is 64 bits but the name pointer
3680 	 * only supports 32 bits.
3681 	 */
3682 	msp->compnamep->key = mp->un_key;
3683 	msp->compnamep->start_blk = mp->un_start_blk;
3684 
3685 	/* fill in status field */
3686 	msp->status = mp->un_status;
3687 
3688 	/* allocate the extents */
3689 	msp->ext.ext_val = Zalloc(mp->un_numexts * sizeof (*msp->ext.ext_val));
3690 	msp->ext.ext_len = mp->un_numexts;
3691 
3692 	/* do the extents for this soft partition */
3693 	for (i = 0; i < mp->un_numexts; i++) {
3694 		struct mp_ext	*mde = &mp->un_ext[i];
3695 		md_sp_ext_t	*extp = &msp->ext.ext_val[i];
3696 
3697 		extp->voff = mde->un_voff;
3698 		extp->poff = mde->un_poff;
3699 		extp->len = mde->un_len;
3700 	}
3701 
3702 	/* cleanup, return success */
3703 	Free(mp);
3704 	dnp->unitp = (md_common_t *)msp;
3705 	return (msp);
3706 
3707 out:
3708 	/* clean up and return error */
3709 	Free(mp);
3710 	Free(msp);
3711 	return (NULL);
3712 }
3713 
3714 
3715 /*
3716  * FUNCTION:	meta_init_sp()
3717  * INPUT:	spp	- the set name for the new device
3718  *		argc	- the remaining argument count for the metainit cmdline
3719  *		argv	- the remainder of the unparsed command line
3720  *		options	- global options parsed by metainit
3721  * OUTPUT:	ep	- return error pointer
3722  * RETURNS:	int	- -1 failure, 0 success
3723  * PURPOSE:	provides the command line parsing and name management overhead
3724  *		for creating a new soft partition.  Ultimately this calls
3725  *		meta_create_sp() which does the real work of allocating space
3726  *		for the new soft partition.
3727  */
3728 int
3729 meta_init_sp(
3730 	mdsetname_t	**spp,
3731 	int		argc,
3732 	char		*argv[],
3733 	mdcmdopts_t	options,
3734 	md_error_t	*ep
3735 )
3736 {
3737 	char		*compname = NULL;
3738 	mdname_t	*spcompnp = NULL;	/* name of component volume */
3739 	char		*devname = argv[0];	/* unit name */
3740 	mdname_t	*np = NULL;		/* name of soft partition */
3741 	md_sp_t		*msp = NULL;
3742 	int		c;
3743 	int		old_optind;
3744 	sp_ext_length_t	len = 0LL;
3745 	int		rval = -1;
3746 	uint_t		seq;
3747 	int		oflag;
3748 	int		failed;
3749 	mddrivename_t	*dnp = NULL;
3750 	sp_ext_length_t	alignment = 0LL;
3751 	sp_ext_node_t	*extlist = NULL;
3752 
3753 	assert(argc > 0);
3754 
3755 	/* expect sp name, -p, optional -e, compname, and size parameters */
3756 	/* grab soft partition name */
3757 	if ((np = metaname(spp, devname, META_DEVICE, ep)) == NULL)
3758 		goto out;
3759 
3760 	/* see if it exists already */
3761 	if (metagetmiscname(np, ep) != NULL) {
3762 		(void) mdmderror(ep, MDE_UNIT_ALREADY_SETUP,
3763 		    meta_getminor(np->dev), devname);
3764 		goto out;
3765 	} else if (! mdismderror(ep, MDE_UNIT_NOT_SETUP)) {
3766 		goto out;
3767 	} else {
3768 		mdclrerror(ep);
3769 	}
3770 	--argc, ++argv;
3771 
3772 	if (argc == 0)
3773 		goto syntax;
3774 
3775 	/* grab -p */
3776 	if (strcmp(argv[0], "-p") != 0)
3777 		goto syntax;
3778 	--argc, ++argv;
3779 
3780 	if (argc == 0)
3781 		goto syntax;
3782 
3783 	/* see if -e is there */
3784 	if (strcmp(argv[0], "-e") == 0) {
3785 		/* use the whole disk */
3786 		options |= MDCMD_USE_WHOLE_DISK;
3787 		--argc, ++argv;
3788 	}
3789 
3790 	if (argc == 0)
3791 		goto syntax;
3792 
3793 	/* get component name */
3794 	compname = Strdup(argv[0]);
3795 
3796 	if (options & MDCMD_USE_WHOLE_DISK) {
3797 		if ((dnp = metadrivename(spp, compname, ep)) == NULL) {
3798 			goto out;
3799 		}
3800 		if ((spcompnp = metaslicename(dnp, 0, ep)) == NULL) {
3801 			goto out;
3802 		}
3803 	} else if ((spcompnp = metaname(spp, compname, UNKNOWN, ep)) == NULL) {
3804 		goto out;
3805 	}
3806 	assert(*spp != NULL);
3807 
3808 	if (!(options & MDCMD_NOLOCK)) {
3809 		/* grab set lock */
3810 		if (meta_lock(*spp, TRUE, ep))
3811 			goto out;
3812 
3813 		if (meta_check_ownership(*spp, ep) != 0)
3814 			goto out;
3815 	}
3816 
3817 	/* allocate the soft partition */
3818 	msp = Zalloc(sizeof (*msp));
3819 
3820 	/* setup common */
3821 	msp->common.namep = np;
3822 	msp->common.type = MD_METASP;
3823 
3824 	compname = spcompnp->cname;
3825 
3826 	assert(spcompnp->rname != NULL);
3827 	--argc, ++argv;
3828 
3829 	if (argc == 0) {
3830 		goto syntax;
3831 	}
3832 
3833 	if (*argv[0] == '-') {
3834 		/*
3835 		 * parse any other command line options, this includes
3836 		 * the recovery options -o and -b. The special thing
3837 		 * with these options is that the len needs to be
3838 		 * kept track of otherwise when the geometry of the
3839 		 * "device" is built it will create an invalid geometry
3840 		 */
3841 		old_optind = optind = 0;
3842 		opterr = 0;
3843 		oflag = 0;
3844 		seq = 0;
3845 		failed = 0;
3846 		while ((c = getopt(argc, argv, "A:o:b:")) != -1) {
3847 			sp_ext_offset_t	offset;
3848 			sp_ext_length_t	length;
3849 			longlong_t	tmp_size;
3850 
3851 			switch (c) {
3852 			case 'A':	/* data alignment */
3853 				if (meta_sp_parsesizestring(optarg,
3854 					&alignment) == -1) {
3855 					failed = 1;
3856 				}
3857 				break;
3858 			case 'o':	/* offset in the partition */
3859 				if (oflag == 1) {
3860 					failed = 1;
3861 				} else {
3862 					tmp_size = atoll(optarg);
3863 					if (tmp_size <= 0) {
3864 						failed = 1;
3865 					} else {
3866 						oflag = 1;
3867 						options |= MDCMD_DIRECT;
3868 
3869 						offset = tmp_size;
3870 					}
3871 				}
3872 
3873 				break;
3874 			case 'b':	/* number of blocks */
3875 				if (oflag == 0) {
3876 					failed = 1;
3877 				} else {
3878 					tmp_size = atoll(optarg);
3879 					if (tmp_size <= 0) {
3880 						failed = 1;
3881 					} else {
3882 						oflag = 0;
3883 
3884 						length = tmp_size;
3885 
3886 						/* we have a pair of values */
3887 						meta_sp_list_insert(*spp, np,
3888 							&extlist, offset,
3889 							length, EXTTYP_ALLOC,
3890 							seq++, EXTFLG_UPDATE,
3891 							meta_sp_cmp_by_offset);
3892 						len += length;
3893 					}
3894 				}
3895 
3896 				break;
3897 			default:
3898 				argc -= old_optind;
3899 				argv += old_optind;
3900 				goto options;
3901 			}
3902 
3903 			if (failed) {
3904 				argc -= old_optind;
3905 				argv += old_optind;
3906 				goto syntax;
3907 			}
3908 
3909 			old_optind = optind;
3910 		}
3911 		argc -= optind;
3912 		argv += optind;
3913 
3914 		/*
3915 		 * Must have matching pairs of -o and -b flags
3916 		 */
3917 		if (oflag != 0)
3918 			goto syntax;
3919 
3920 		/*
3921 		 * Can't specify both layout (indicated indirectly by
		 * len being set by the -o/-b cases above) AND
3923 		 * alignment
3924 		 */
3925 		if ((len > 0LL) && (alignment > 0LL))
3926 			goto syntax;
3927 
3928 		/*
3929 		 * sanity check the allocation list
3930 		 */
3931 		if ((extlist != NULL) && meta_sp_list_overlaps(extlist))
3932 			goto syntax;
3933 	}
3934 
3935 	if (len == 0LL) {
3936 		if (argc == 0)
3937 			goto syntax;
3938 		if (meta_sp_parsesize(argv[0], &len) == -1)
3939 			goto syntax;
3940 		--argc, ++argv;
3941 	}
3942 
3943 	msp->ext.ext_val = Zalloc(sizeof (*msp->ext.ext_val));
3944 	msp->ext.ext_val->len = len;
3945 	msp->compnamep = spcompnp;
3946 
3947 	/* we should be at the end */
3948 	if (argc != 0)
3949 		goto syntax;
3950 
3951 	/* create soft partition */
3952 	if (meta_create_sp(*spp, msp, extlist, options, alignment, ep) != 0)
3953 		goto out;
3954 	rval = 0;
3955 
3956 	/* let em know */
3957 	if (options & MDCMD_PRINT) {
3958 		(void) printf(dgettext(TEXT_DOMAIN,
3959 		    "%s: Soft Partition is setup\n"),
3960 		    devname);
3961 		(void) fflush(stdout);
3962 	}
3963 	goto out;
3964 
3965 syntax:
3966 	/* syntax error */
3967 	rval = meta_cook_syntax(ep, MDE_SYNTAX, compname, argc, argv);
3968 	goto out;
3969 
3970 options:
3971 	/* options error */
3972 	rval = meta_cook_syntax(ep, MDE_OPTION, compname, argc, argv);
3973 	goto out;
3974 
3975 out:
3976 	if (msp != NULL) {
3977 		if (msp->ext.ext_val != NULL) {
3978 			Free(msp->ext.ext_val);
3979 		}
3980 		Free(msp);
3981 	}
3982 
3983 	return (rval);
3984 }
3985 
/*
 * FUNCTION:	meta_free_sp()
 * INPUT:	msp	- the soft partition unit to free
 * OUTPUT:	none
 * RETURNS:	void
 * PURPOSE:	provides an interface from the rest of libmeta for freeing a
 *		soft partition unit.  Only the md_sp_t itself is handed to
 *		Free(); msp->ext.ext_val is not touched by this routine.
 *
 *		NOTE(review): other code in this file releases
 *		msp->ext.ext_val separately before freeing msp, so callers
 *		presumably must do the same when they own that allocation --
 *		confirm against the callers.
 */
void
meta_free_sp(md_sp_t *msp)
{
	/* release the unit structure only; see the note above on ext_val */
	Free(msp);
}
3999 
4000 /*
4001  * FUNCTION:	meta_sp_issp()
4002  * INPUT:	sp	- the set name to check
4003  *		np	- the name to check
4004  * OUTPUT:	ep	- return error pointer
4005  * RETURNS:	int	- 0 means sp,np is a soft partition
4006  *			  1 means sp,np is not a soft partition
4007  * PURPOSE:	determines whether the given device is a soft partition
4008  *		device.  This is called by other metadevice check routines.
4009  */
4010 int
4011 meta_sp_issp(
4012 	mdsetname_t	*sp,
4013 	mdname_t	*np,
4014 	md_error_t	*ep
4015 )
4016 {
4017 	if (meta_get_sp_common(sp, np, 0, ep) == NULL)
4018 		return (1);
4019 
4020 	return (0);
4021 }
4022 
4023 /*
4024  * FUNCTION:	meta_check_sp()
4025  * INPUT:	sp	- the set name to check
4026  *		msp	- the unit structure to check
4027  *		options	- creation options
4028  * OUTPUT:	repart_options - options to be passed to
4029  *				meta_repartition_drive()
4030  *		ep	- return error pointer
4031  * RETURNS:	int	-  0 ok to create on this component
4032  *			  -1 error or not ok to create on this component
4033  * PURPOSE:	Checks to determine whether the rules for creation of
4034  *		soft partitions allow creation of a soft partition on
4035  *		the device described by the mdname_t structure referred
4036  *		to by msp->compnamep.
4037  *
4038  *		NOTE: Does NOT check to determine whether the extents
4039  *		      described in the md_sp_t structure referred to by
4040  *		      msp will fit on the device described by the mdname_t
4041  *		      structure located at msp->compnamep.
4042  */
4043 static int
4044 meta_check_sp(
4045 	mdsetname_t	*sp,
4046 	md_sp_t		*msp,
4047 	mdcmdopts_t	options,
4048 	int		*repart_options,
4049 	md_error_t	*ep
4050 )
4051 {
4052 	md_common_t	*mdp;
4053 	mdname_t	*compnp = msp->compnamep;
4054 	uint_t		slice;
4055 	mddrivename_t	*dnp;
4056 	mdname_t	*slicenp;
4057 	mdvtoc_t	*vtocp;
4058 
4059 	/* make sure it is in the set */
4060 	if (meta_check_inset(sp, compnp, ep) != 0)
4061 		return (-1);
4062 
4063 	if ((options & MDCMD_USE_WHOLE_DISK) != 0) {
4064 		uint_t	rep_slice;
4065 
4066 		/*
4067 		 * check to make sure we can partition this drive.
4068 		 * we cannot continue if any of the following are
4069 		 * true:
4070 		 * The drive is a metadevice.
4071 		 * The drive contains a mounted slice.
4072 		 * The drive contains a slice being swapped to.
4073 		 * The drive contains slices which are part of other
4074 		 * metadevices.
4075 		 * The drive contains a metadb.
4076 		 */
4077 		if (metaismeta(compnp))
4078 			return (mddeverror(ep, MDE_IS_META, compnp->dev,
4079 			    compnp->cname));
4080 
4081 		assert(compnp->drivenamep != NULL);
4082 
4083 		/*
4084 		 * ensure that we have slice 0 since the disk will be
4085 		 * repartitioned in the USE_WHOLE_DISK case.  this check
4086 		 * is redundant unless the user incorrectly specifies a
4087 		 * a fully qualified drive AND slice name (i.e.,
4088 		 * /dev/dsk/cXtXdXsX), which will be incorrectly
4089 		 * recognized as a drive name by the metaname code.
4090 		 */
4091 
4092 		if ((vtocp = metagetvtoc(compnp, FALSE, &slice, ep)) == NULL)
4093 			return (-1);
4094 		if (slice != MD_SLICE0)
4095 			return (mderror(ep, MDE_NOT_DRIVENAME, compnp->cname));
4096 
4097 		dnp = compnp->drivenamep;
4098 		if (meta_replicaslice(dnp, &rep_slice, ep) != 0)
4099 			return (-1);
4100 
4101 		for (slice = 0; slice < vtocp->nparts; slice++) {
4102 
4103 			/* only check if the slice really exists */
4104 			if (vtocp->parts[slice].size == 0)
4105 				continue;
4106 
4107 			slicenp = metaslicename(dnp, slice, ep);
4108 			if (slicenp == NULL)
4109 				return (-1);
4110 
4111 			/* check to ensure that it is not already in use */
4112 			if (meta_check_inuse(sp,
4113 			    slicenp, MDCHK_INUSE, ep) != 0) {
4114 				return (-1);
4115 			}
4116 
4117 			/*
4118 			 * Up to this point, tests are applied to all
4119 			 * slices uniformly.
4120 			 */
4121 
4122 			if (slice == rep_slice) {
4123 				/*
4124 				 * Tests inside the body of this
4125 				 * conditional are applied only to
4126 				 * slice seven.
4127 				 */
4128 				if (meta_check_inmeta(sp, slicenp,
4129 				    options | MDCHK_ALLOW_MDDB |
4130 				    MDCHK_ALLOW_REPSLICE, 0, -1, ep) != 0)
4131 					return (-1);
4132 
4133 				/*
4134 				 * For slice seven, a metadb is NOT an
4135 				 * automatic failure. It merely means
4136 				 * that we're not allowed to muck
4137 				 * about with the partitioning of that
4138 				 * slice.  We indicate this by masking
4139 				 * in the MD_REPART_LEAVE_REP flag.
4140 				 */
4141 				if (metahasmddb(sp, slicenp, ep)) {
4142 					assert(repart_options !=
4143 					    NULL);
4144 					*repart_options |=
4145 					    MD_REPART_LEAVE_REP;
4146 				}
4147 
4148 				/*
4149 				 * Skip the remaining tests for slice
4150 				 * seven
4151 				 */
4152 				continue;
4153 			}
4154 
4155 			/*
4156 			 * Tests below this point will be applied to
4157 			 * all slices EXCEPT for the replica slice.
4158 			 */
4159 
4160 
4161 			/* check if component is in a metadevice */
4162 			if (meta_check_inmeta(sp, slicenp, options, 0,
4163 			    -1, ep) != 0)
4164 				return (-1);
4165 
4166 			/* check to see if component has a metadb */
4167 			if (metahasmddb(sp, slicenp, ep))
4168 				return (mddeverror(ep, MDE_HAS_MDDB,
4169 				    slicenp->dev, slicenp->cname));
4170 		}
4171 		/*
4172 		 * This should be all of the testing necessary when
4173 		 * the MDCMD_USE_WHOLE_DISK flag is set; the rest of
4174 		 * meta_check_sp() is oriented towards component
4175 		 * arguments instead of disks.
4176 		 */
4177 		goto meta_check_sp_ok;
4178 
4179 	}
4180 
4181 	/* check to ensure that it is not already in use */
4182 	if (meta_check_inuse(sp, compnp, MDCHK_INUSE, ep) != 0) {
4183 		return (-1);
4184 	}
4185 
4186 	if (!metaismeta(compnp)) {	/* handle non-metadevices */
4187 
4188 		/*
4189 		 * The component can have one or more soft partitions on it
4190 		 * already, but can't be part of any other type of metadevice,
4191 		 * so if it is used for a metadevice, but the metadevice
4192 		 * isn't a soft partition, return failure.
4193 		 */
4194 
4195 		if (meta_check_inmeta(sp, compnp, options, 0, -1, ep) != 0 &&
4196 		    meta_check_insp(sp, compnp, 0, -1, ep) == 0) {
4197 			return (-1);
4198 		}
4199 	} else {			/* handle metadevices */
4200 		/* get underlying unit & check capabilities */
4201 		if ((mdp = meta_get_unit(sp, compnp, ep)) == NULL)
4202 			return (-1);
4203 
4204 		if ((! (mdp->capabilities & MD_CAN_PARENT)) ||
4205 		    (! (mdp->capabilities & MD_CAN_SP)))
4206 			return (mdmderror(ep, MDE_INVAL_UNIT,
4207 			    meta_getminor(compnp->dev), compnp->cname));
4208 	}
4209 
4210 meta_check_sp_ok:
4211 	mdclrerror(ep);
4212 	return (0);
4213 }
4214 
4215 /*
4216  * FUNCTION:	meta_create_sp()
4217  * INPUT:	sp	- the set name to create in
4218  *		msp	- the unit structure to create
4219  *		oblist	- an optional list of requested extents (-o/-b options)
4220  *		options	- creation options
4221  *		alignment - data alignment
4222  * OUTPUT:	ep	- return error pointer
4223  * RETURNS:	int	-  0 success, -1 error
4224  * PURPOSE:	does most of the work for creating a soft partition.  If
4225  *		metainit -p -e was used, first partition the drive.  Then
4226  *		create an extent list based on the existing soft partitions
4227  *		and assume all space not used by them is free.  Storage for
4228  *		the new soft partition is allocated from the free extents
4229  *		based on the length specified on the command line or the
4230  *		oblist passed in.  The unit structure is then committed and
4231  *		the watermarks are updated.  Finally, the status is changed to
4232  *		Okay and the process is complete.
4233  */
4234 static int
4235 meta_create_sp(
4236 	mdsetname_t	*sp,
4237 	md_sp_t		*msp,
4238 	sp_ext_node_t	*oblist,
4239 	mdcmdopts_t	options,
4240 	sp_ext_length_t	alignment,
4241 	md_error_t	*ep
4242 )
4243 {
4244 	mdname_t	*np = msp->common.namep;
4245 	mdname_t	*compnp = msp->compnamep;
4246 	mp_unit_t	*mp = NULL;
4247 	mdnamelist_t	*keynlp = NULL, *spnlp = NULL;
4248 	md_set_params_t	set_params;
4249 	int		rval = -1;
4250 	diskaddr_t	comp_size;
4251 	diskaddr_t	sp_start;
4252 	sp_ext_node_t	*extlist = NULL;
4253 	int		numexts = 0;	/* number of extents */
4254 	int		count = 0;
4255 	int		committed = 0;
4256 	int		repart_options = MD_REPART_FORCE;
4257 	int		create_flag = MD_CRO_32BIT;
4258 
4259 	md_set_desc	*sd;
4260 	mm_unit_t	*mm;
4261 	md_set_mmown_params_t	*ownpar = NULL;
4262 	int		comp_is_mirror = 0;
4263 
4264 	/* validate soft partition */
4265 	if (meta_check_sp(sp, msp, options, &repart_options, ep) != 0)
4266 		return (-1);
4267 
4268 	if ((options & MDCMD_USE_WHOLE_DISK) != 0) {
4269 		if ((options & MDCMD_DOIT) != 0) {
4270 			if (meta_repartition_drive(sp,
4271 			    compnp->drivenamep,
4272 			    repart_options,
4273 			    NULL, /* Don't return the VTOC */
4274 			    ep) != 0)
4275 
4276 				return (-1);
4277 		} else {
4278 			/*
4279 			 * If -n and -e are both specified, it doesn't make
4280 			 * sense to continue without actually partitioning
4281 			 * the drive.
4282 			 */
4283 			return (0);
4284 		}
4285 	}
4286 
4287 	/* populate the start_blk field of the component name */
4288 	if ((sp_start = meta_sp_get_start(sp, compnp, ep)) ==
4289 	    MD_DISKADDR_ERROR) {
4290 		rval = -1;
4291 		goto out;
4292 	}
4293 
4294 	if (options & MDCMD_DOIT) {
4295 		/* store name in namespace */
4296 		if (add_key_name(sp, compnp, &keynlp, ep) != 0) {
4297 			rval = -1;
4298 			goto out;
4299 		}
4300 	}
4301 
4302 	/*
4303 	 * Get a list of the soft partitions that currently reside on
4304 	 * the component.  We should ALWAYS force reload the cache,
4305 	 * because if this is a single creation, there will not BE a
4306 	 * cached list, and if we're using the md.tab, we must rebuild
4307 	 * the list because it won't contain the previous (if any)
4308 	 * soft partition.
4309 	 */
4310 	count = meta_sp_get_by_component(sp, compnp, &spnlp, 1, ep);
4311 	if (count < 0) {
4312 		/* error occured */
4313 		rval = -1;
4314 		goto out;
4315 	}
4316 
4317 	/*
4318 	 * get the size of the underlying device.  if the size is smaller
4319 	 * than or equal to the watermark size, we know there isn't
4320 	 * enough space.
4321 	 */
4322 	if ((comp_size = metagetsize(compnp, ep)) == MD_DISKADDR_ERROR) {
4323 		rval = -1;
4324 		goto out;
4325 	} else if (comp_size <= MD_SP_WMSIZE) {
4326 		(void) mdmderror(ep, MDE_SP_NOSPACE, 0, compnp->cname);
4327 		rval = -1;
4328 		goto out;
4329 	}
4330 	/*
4331 	 * seed extlist with reserved space at the beginning of the volume and
4332 	 * enough space for the end watermark.  The end watermark always gets
4333 	 * updated, but if the underlying device changes size it may not be
4334 	 * pointed to until the extent before it is updated.  Since the
4335 	 * end of the reserved space is where the first watermark starts,
4336 	 * the reserved extent should never be marked for updating.
4337 	 */
4338 
4339 	meta_sp_list_insert(NULL, NULL, &extlist,
4340 	    0ULL, sp_start, EXTTYP_RESERVED, 0, 0, meta_sp_cmp_by_offset);
4341 	meta_sp_list_insert(NULL, NULL, &extlist,
4342 	    (sp_ext_offset_t)(comp_size - MD_SP_WMSIZE), MD_SP_WMSIZE,
4343 	    EXTTYP_END, 0, EXTFLG_UPDATE, meta_sp_cmp_by_offset);
4344 
4345 	if (meta_sp_extlist_from_namelist(sp, spnlp, &extlist, ep) == -1) {
4346 		rval = -1;
4347 		goto out;
4348 	}
4349 
4350 	metafreenamelist(spnlp);
4351 
4352 	if (getenv(META_SP_DEBUG)) {
4353 		meta_sp_debug("meta_create_sp: list of used extents:\n");
4354 		meta_sp_list_dump(extlist);
4355 	}
4356 
4357 	meta_sp_list_freefill(&extlist, metagetsize(compnp, ep));
4358 
4359 	/* get extent list from -o/-b options or from free space */
4360 	if (options & MDCMD_DIRECT) {
4361 		if (getenv(META_SP_DEBUG)) {
4362 			meta_sp_debug("meta_create_sp: Dumping -o/-b list:\n");
4363 			meta_sp_list_dump(oblist);
4364 		}
4365 
4366 		numexts = meta_sp_alloc_by_list(sp, np, &extlist, oblist);
4367 		if (numexts == -1) {
4368 			(void) mdmderror(ep, MDE_SP_OVERLAP, 0, np->cname);
4369 			rval = -1;
4370 			goto out;
4371 		}
4372 	} else {
4373 		numexts = meta_sp_alloc_by_len(sp, np, &extlist,
4374 		    &msp->ext.ext_val->len, 0LL, (alignment > 0) ? alignment :
4375 		    meta_sp_get_default_alignment(sp, compnp, ep));
4376 		if (numexts == -1) {
4377 			(void) mdmderror(ep, MDE_SP_NOSPACE, 0, np->cname);
4378 			rval = -1;
4379 			goto out;
4380 		}
4381 	}
4382 
4383 	assert(extlist != NULL);
4384 
4385 	/* create soft partition */
4386 	mp = meta_sp_createunit(msp->common.namep, msp->compnamep,
4387 	    extlist, numexts, msp->ext.ext_val->len, MD_SP_CREATEPEND, ep);
4388 
4389 	create_flag = meta_check_devicesize(mp->c.un_total_blocks);
4390 
4391 	/* if we're not doing anything (metainit -n), return success */
4392 	if (! (options & MDCMD_DOIT)) {
4393 		rval = 0;	/* success */
4394 		goto out;
4395 	}
4396 
4397 	(void) memset(&set_params, 0, sizeof (set_params));
4398 
4399 	if (create_flag == MD_CRO_64BIT) {
4400 		mp->c.un_revision |= MD_64BIT_META_DEV;
4401 		set_params.options = MD_CRO_64BIT;
4402 	} else {
4403 		mp->c.un_revision &= ~MD_64BIT_META_DEV;
4404 		set_params.options = MD_CRO_32BIT;
4405 	}
4406 
4407 	if (getenv(META_SP_DEBUG)) {
4408 		meta_sp_debug("meta_create_sp: printing unit structure\n");
4409 		meta_sp_printunit(mp);
4410 	}
4411 
4412 	/*
4413 	 * Check to see if we're trying to create a partition on a mirror. If so
4414 	 * we may have to enforce an ownership change before writing the
4415 	 * watermark out.
4416 	 */
4417 	if (metaismeta(compnp)) {
4418 		char *miscname;
4419 
4420 		miscname = metagetmiscname(compnp, ep);
4421 		if (miscname != NULL)
4422 			comp_is_mirror = (strcmp(miscname, MD_MIRROR) == 0);
4423 		else
4424 			comp_is_mirror = 0;
4425 	} else {
4426 		comp_is_mirror = 0;
4427 	}
4428 
4429 	/*
4430 	 * For a multi-node environment we have to ensure that the master
4431 	 * node owns an underlying mirror before we issue the MD_IOCSET ioctl.
4432 	 * If the master does not own the device we will deadlock as the
4433 	 * implicit write of the watermarks (in sp_ioctl.c) will cause an
4434 	 * ownership change that will block as the MD_IOCSET is still in
4435 	 * progress. To close this window we force an owner change to occur
4436 	 * before issuing the MD_IOCSET. We cannot simply open the device and
4437 	 * write to it as this will only work for the first soft-partition
4438 	 * creation.
4439 	 */
4440 
4441 	if (comp_is_mirror && !metaislocalset(sp)) {
4442 
4443 		if ((sd = metaget_setdesc(sp, ep)) == NULL) {
4444 			rval = -1;
4445 			goto out;
4446 		}
4447 		if (MD_MNSET_DESC(sd) && sd->sd_mn_am_i_master) {
4448 			mm = (mm_unit_t *)meta_get_unit(sp, compnp, ep);
4449 			if (mm == NULL) {
4450 				rval = -1;
4451 				goto out;
4452 			} else {
4453 				rval = meta_mn_change_owner(&ownpar, sp->setno,
4454 					meta_getminor(compnp->dev),
4455 					sd->sd_mn_mynode->nd_nodeid,
4456 					MD_MN_MM_PREVENT_CHANGE |
4457 					    MD_MN_MM_SPAWN_THREAD);
4458 				if (rval == -1)
4459 					goto out;
4460 			}
4461 		}
4462 	}
4463 
4464 	set_params.mnum = MD_SID(mp);
4465 	set_params.size = mp->c.un_size;
4466 	set_params.mdp = (uintptr_t)mp;
4467 	MD_SETDRIVERNAME(&set_params, MD_SP, MD_MIN2SET(set_params.mnum));
4468 
4469 	/* first phase of commit. */
4470 	if (metaioctl(MD_IOCSET, &set_params, &set_params.mde,
4471 	    np->cname) != 0) {
4472 		(void) mdstealerror(ep, &set_params.mde);
4473 		rval = -1;
4474 		goto out;
4475 	}
4476 
4477 	/* we've successfully committed the record */
4478 	committed = 1;
4479 
4480 	/* write watermarks */
4481 	if (meta_sp_update_wm(sp, msp, extlist, ep) < 0) {
4482 		rval = -1;
4483 		goto out;
4484 	}
4485 
4486 	/*
4487 	 * Allow mirror ownership to change. If we don't succeed in this
4488 	 * ioctl it isn't fatal, but the cluster will probably hang fairly
4489 	 * soon as the mirror owner won't change. However, we have
4490 	 * successfully written the watermarks out to the device so the
4491 	 * softpart creation has succeeded
4492 	 */
4493 	if (ownpar) {
4494 		(void) meta_mn_change_owner(&ownpar, sp->setno, ownpar->d.mnum,
4495 		    ownpar->d.owner,
4496 		    MD_MN_MM_ALLOW_CHANGE | MD_MN_MM_SPAWN_THREAD);
4497 	}
4498 
4499 	/* second phase of commit, set status to MD_SP_OK */
4500 	if (meta_sp_setstatus(sp, &(MD_SID(mp)), 1, MD_SP_OK, ep) < 0) {
4501 		rval = -1;
4502 		goto out;
4503 	}
4504 	rval = 0;
4505 out:
4506 	Free(mp);
4507 	if (ownpar)
4508 		Free(ownpar);
4509 
4510 	if (extlist != NULL)
4511 		meta_sp_list_free(&extlist);
4512 
4513 	if (rval != 0 && keynlp != NULL && committed != 1)
4514 		(void) del_key_names(sp, keynlp, NULL);
4515 
4516 	metafreenamelist(keynlp);
4517 
4518 	return (rval);
4519 }
4520 
4521 /*
4522  * **************************************************************************
4523  *                      Reset (metaclear) Functions                         *
4524  * **************************************************************************
4525  */
4526 
4527 /*
4528  * FUNCTION:	meta_sp_reset_common()
4529  * INPUT:	sp	- the set name of the device to reset
4530  *		np	- the name of the device to reset
4531  *		msp	- the unit structure to reset
4532  *		options	- metaclear options
4533  * OUTPUT:	ep	- return error pointer
4534  * RETURNS:	int	-  0 success, -1 error
4535  * PURPOSE:	"resets", or more accurately deletes, the soft partition
4536  *		specified.  First the state is set to "deleting" and then the
4537  *		watermarks are all cleared out.  Once the watermarks have been
4538  *		updated, the unit structure is deleted from the metadb.
4539  */
4540 static int
4541 meta_sp_reset_common(
4542 	mdsetname_t	*sp,
4543 	mdname_t	*np,
4544 	md_sp_t		*msp,
4545 	md_sp_reset_t	reset_params,
4546 	mdcmdopts_t	options,
4547 	md_error_t	*ep
4548 )
4549 {
4550 	char	*miscname;
4551 	int	rval = -1;
4552 	int	is_open = 0;
4553 
4554 	/* make sure that nobody owns us */
4555 	if (MD_HAS_PARENT(msp->common.parent))
4556 		return (mdmderror(ep, MDE_IN_USE, meta_getminor(np->dev),
4557 					np->cname));
4558 
4559 	/* make sure that the soft partition isn't open */
4560 	if ((is_open = meta_isopen(sp, np, ep, options)) < 0)
4561 		return (-1);
4562 	else if (is_open)
4563 		return (mdmderror(ep, MDE_IS_OPEN, meta_getminor(np->dev),
4564 					np->cname));
4565 
4566 	/* get miscname */
4567 	if ((miscname = metagetmiscname(np, ep)) == NULL)
4568 		return (-1);
4569 
4570 	/* fill in reset params */
4571 	MD_SETDRIVERNAME(&reset_params, miscname, sp->setno);
4572 	reset_params.mnum = meta_getminor(np->dev);
4573 	reset_params.force = (options & MDCMD_FORCE) ? 1 : 0;
4574 
4575 	/*
4576 	 * clear soft partition - phase one.
4577 	 * place the soft partition into the "delete pending" state.
4578 	 */
4579 	if (meta_sp_setstatus(sp, &reset_params.mnum, 1, MD_SP_DELPEND, ep) < 0)
4580 		return (-1);
4581 
4582 	/*
4583 	 * Now clear the watermarks.  If the force flag is specified,
4584 	 * ignore any errors writing the watermarks and delete the unit
4585 	 * structure anyway.  An error may leave the on-disk format in a
4586 	 * corrupt state.  If force is not specified and we fail here,
4587 	 * the soft partition will remain in the "delete pending" state.
4588 	 */
4589 	if ((meta_sp_clear_wm(sp, msp, ep) < 0) &&
4590 	    ((options & MDCMD_FORCE) == 0))
4591 		goto out;
4592 
4593 	/*
4594 	 * clear soft partition - phase two.
4595 	 * the driver removes the soft partition from the metadb and
4596 	 * zeros out incore version.
4597 	 */
4598 	if (metaioctl(MD_IOCRESET, &reset_params,
4599 	    &reset_params.mde, np->cname) != 0) {
4600 		(void) mdstealerror(ep, &reset_params.mde);
4601 		goto out;
4602 	}
4603 
4604 	/*
4605 	 * Wait for the /dev to be cleaned up. Ignore the return
4606 	 * value since there's not much we can do.
4607 	 */
4608 	(void) meta_update_devtree(meta_getminor(np->dev));
4609 
4610 	rval = 0;	/* success */
4611 
4612 	if (options & MDCMD_PRINT) {
4613 		(void) printf(dgettext(TEXT_DOMAIN,
4614 		    "%s: Soft Partition is cleared\n"),
4615 		    np->cname);
4616 		(void) fflush(stdout);
4617 	}
4618 
4619 	/*
4620 	 * if told to recurse and on a metadevice, then attempt to
4621 	 * clear the subdevices.  Indicate failure if the clear fails.
4622 	 */
4623 	if ((options & MDCMD_RECURSE) &&
4624 	    (metaismeta(msp->compnamep)) &&
4625 	    (meta_reset_by_name(sp, msp->compnamep, options, ep) != 0))
4626 		rval = -1;
4627 
4628 out:
4629 	meta_invalidate_name(np);
4630 	return (rval);
4631 }
4632 
4633 /*
4634  * FUNCTION:	meta_sp_reset()
4635  * INPUT:	sp	- the set name of the device to reset
4636  *		np	- the name of the device to reset
4637  *		options	- metaclear options
4638  * OUTPUT:	ep	- return error pointer
4639  * RETURNS:	int	-  0 success, -1 error
4640  * PURPOSE:	provides the entry point to the rest of libmeta for deleting a
4641  *		soft partition.  If np is NULL, then soft partitions are
4642  *		all deleted at the current level and then recursively deleted.
4643  *		Otherwise, if a name is specified either directly or as a
4644  *		result of a recursive operation, it deletes only that name.
4645  *		Since something sitting under a soft partition may be parented
4646  *		to it, we have to reparent that other device to another soft
4647  *		partition on the same component if we're deleting the one it's
4648  *		parented to.
4649  */
4650 int
4651 meta_sp_reset(
4652 	mdsetname_t	*sp,
4653 	mdname_t	*np,
4654 	mdcmdopts_t	options,
4655 	md_error_t	*ep
4656 )
4657 {
4658 	md_sp_t		*msp;
4659 	int		rval = -1;
4660 	mdnamelist_t	*spnlp = NULL, *nlp = NULL;
4661 	md_sp_reset_t	reset_params;
4662 	int		num_sp;
4663 
4664 	assert(sp != NULL);
4665 
4666 	/* reset/delete all soft paritions */
4667 	if (np == NULL) {
4668 		/*
4669 		 * meta_reset_all sets MDCMD_RECURSE, but this behavior
4670 		 * is incorrect for soft partitions.  We want to clear
4671 		 * all soft partitions at a particular level in the
4672 		 * metadevice stack before moving to the next level.
4673 		 * Thus, we clear MDCMD_RECURSE from the options.
4674 		 */
4675 		options &= ~MDCMD_RECURSE;
4676 
4677 		/* for each soft partition */
4678 		rval = 0;
4679 		if (meta_get_sp_names(sp, &spnlp, 0, ep) < 0)
4680 			rval = -1;
4681 
4682 		for (nlp = spnlp; (nlp != NULL); nlp = nlp->next) {
4683 			np = nlp->namep;
4684 			if ((msp = meta_get_sp(sp, np, ep)) == NULL) {
4685 				rval = -1;
4686 				break;
4687 			}
4688 			/*
4689 			 * meta_reset_all calls us twice to get soft
4690 			 * partitions at the top and bottom of the stack.
4691 			 * thus, if we have a parent, we'll get deleted
4692 			 * on the next call.
4693 			 */
4694 			if (MD_HAS_PARENT(msp->common.parent))
4695 				continue;
4696 			/*
4697 			 * If this is a multi-node set, we send a series
4698 			 * of individual metaclear commands.
4699 			 */
4700 			if (meta_is_mn_set(sp, ep)) {
4701 				if (meta_mn_send_metaclear_command(sp,
4702 				    np->cname, options, 0, ep) != 0) {
4703 					rval = -1;
4704 					break;
4705 				}
4706 			} else {
4707 				if (meta_sp_reset(sp, np, options, ep) != 0) {
4708 					rval = -1;
4709 					break;
4710 				}
4711 			}
4712 		}
4713 		/* cleanup return status */
4714 		metafreenamelist(spnlp);
4715 		return (rval);
4716 	}
4717 
4718 	/* check the name */
4719 	if (metachkmeta(np, ep) != 0)
4720 		return (-1);
4721 
4722 	/* get the unit structure */
4723 	if ((msp = meta_get_sp(sp, np, ep)) == NULL)
4724 		return (-1);
4725 
4726 	/* clear out reset parameters */
4727 	(void) memset(&reset_params, 0, sizeof (reset_params));
4728 
4729 	/* if our child is a metadevice, we need to deparent/reparent it */
4730 	if (metaismeta(msp->compnamep)) {
4731 		/* get sp's on this component */
4732 		if ((num_sp = meta_sp_get_by_component(sp, msp->compnamep,
4733 		    &spnlp, 1, ep)) <= 0)
4734 			/* no sp's on this device.  error! */
4735 			return (-1);
4736 		else if (num_sp == 1)
4737 			/* last sp on this device, so we deparent */
4738 			reset_params.new_parent = MD_NO_PARENT;
4739 		else {
4740 			/* have to reparent this metadevice */
4741 			for (nlp = spnlp; nlp != NULL; nlp = nlp->next) {
4742 				if (meta_getminor(nlp->namep->dev) ==
4743 					meta_getminor(np->dev))
4744 					continue;
4745 				/*
4746 				 * this isn't the softpart we are deleting,
4747 				 * so use this device as the new parent.
4748 				 */
4749 				reset_params.new_parent =
4750 				    meta_getminor(nlp->namep->dev);
4751 				break;
4752 			}
4753 		}
4754 		metafreenamelist(spnlp);
4755 	}
4756 
4757 	if (meta_sp_reset_common(sp, np, msp, reset_params, options, ep) != 0)
4758 		return (-1);
4759 
4760 	return (0);
4761 }
4762 
4763 /*
4764  * FUNCTION:	meta_sp_reset_component()
4765  * INPUT:	sp	- the set name of the device to reset
4766  *		name	- the string name of the device to reset
4767  *		options	- metaclear options
4768  * OUTPUT:	ep	- return error pointer
4769  * RETURNS:	int	-  0 success, -1 error
4770  * PURPOSE:	provides the ability to delete all soft partitions on a
4771  *		specified device (metaclear -p).  It first gets all of the
4772  *		soft partitions on the component and then deletes each one
4773  *		individually.
4774  */
4775 int
4776 meta_sp_reset_component(
4777 	mdsetname_t	*sp,
4778 	char		*name,
4779 	mdcmdopts_t	options,
4780 	md_error_t	*ep
4781 )
4782 {
4783 	mdname_t	*compnp, *np;
4784 	mdnamelist_t	*spnlp = NULL;
4785 	mdnamelist_t	*nlp = NULL;
4786 	md_sp_t		*msp;
4787 	int		count;
4788 	md_sp_reset_t	reset_params;
4789 
4790 	if ((compnp = metaname(&sp, name, UNKNOWN, ep)) == NULL)
4791 		return (-1);
4792 
4793 	/* If we're starting out with no soft partitions, it's an error */
4794 	count = meta_sp_get_by_component(sp, compnp, &spnlp, 1, ep);
4795 	if (count == 0)
4796 		return (mdmderror(ep, MDE_SP_NOSP, 0, compnp->cname));
4797 	else if (count < 0)
4798 		return (-1);
4799 
4800 	/*
4801 	 * clear all soft partitions on this component.
4802 	 * NOTE: we reparent underlying metadevices as we go so that
4803 	 * things stay sane.  Also, if we encounter an error, we stop
4804 	 * and go no further in case recovery might be needed.
4805 	 */
4806 	for (nlp = spnlp; nlp != NULL; nlp = nlp->next) {
4807 		/* clear out reset parameters */
4808 		(void) memset(&reset_params, 0, sizeof (reset_params));
4809 
4810 		/* check the name */
4811 		np = nlp->namep;
4812 
4813 		if (metachkmeta(np, ep) != 0) {
4814 			metafreenamelist(spnlp);
4815 			return (-1);
4816 		}
4817 
4818 		/* get the unit structure */
4819 		if ((msp = meta_get_sp(sp, np, ep)) == NULL) {
4820 			metafreenamelist(spnlp);
4821 			return (-1);
4822 		}
4823 
4824 		/* have to deparent/reparent metadevices */
4825 		if (metaismeta(compnp)) {
4826 			if (nlp->next == NULL)
4827 				reset_params.new_parent = MD_NO_PARENT;
4828 			else
4829 				reset_params.new_parent =
4830 				    meta_getminor(spnlp->next->namep->dev);
4831 		}
4832 
4833 		/* clear soft partition */
4834 		if (meta_sp_reset_common(sp, np, msp, reset_params,
4835 		    options, ep) < 0) {
4836 			metafreenamelist(spnlp);
4837 			return (-1);
4838 		}
4839 	}
4840 	metafreenamelist(spnlp);
4841 	return (0);
4842 }
4843 
4844 /*
4845  * **************************************************************************
4846  *                      Grow (metattach) Functions                          *
4847  * **************************************************************************
4848  */
4849 
4850 /*
4851  * FUNCTION:	meta_sp_attach()
4852  * INPUT:	sp	- the set name of the device to attach to
4853  *		np	- the name of the device to attach to
4854  *		addsize	- the unparsed string holding the amount of space to add
4855  *		options	- metattach options
4856  *		alignment - data alignment
4857  * OUTPUT:	ep	- return error pointer
4858  * RETURNS:	int	-  0 success, -1 error
4859  * PURPOSE:	grows a soft partition by reading in the existing unit
4860  *		structure and setting its state to Growing, allocating more
4861  *		space (similar to meta_create_sp()), updating the watermarks,
4862  *		and then writing out the new unit structure in the Okay state.
4863  */
4864 int
4865 meta_sp_attach(
4866 	mdsetname_t	*sp,
4867 	mdname_t	*np,
4868 	char		*addsize,
4869 	mdcmdopts_t	options,
4870 	sp_ext_length_t	alignment,
4871 	md_error_t	*ep
4872 )
4873 {
4874 	md_grow_params_t	grow_params;
4875 	sp_ext_length_t		grow_len;	/* amount to grow */
4876 	mp_unit_t		*mp, *new_un;
4877 	mdname_t		*compnp = NULL;
4878 
4879 	sp_ext_node_t		*extlist = NULL;
4880 	int			numexts;
4881 	mdnamelist_t		*spnlp = NULL;
4882 	int			count;
4883 	md_sp_t			*msp;
4884 	daddr_t			start_block;
4885 
4886 	/* should have the same set */
4887 	assert(sp != NULL);
4888 	assert(sp->setno == MD_MIN2SET(meta_getminor(np->dev)));
4889 
4890 	/* check name */
4891 	if (metachkmeta(np, ep) != 0)
4892 		return (-1);
4893 
4894 	if (meta_sp_parsesize(addsize, &grow_len) == -1) {
4895 		return (mdmderror(ep, MDE_SP_BAD_LENGTH, 0, np->cname));
4896 	}
4897 
4898 	if ((mp = (mp_unit_t *)meta_get_mdunit(sp, np, ep)) == NULL)
4899 		return (-1);
4900 
4901 	/* make sure we don't have a parent */
4902 	if (MD_HAS_PARENT(mp->c.un_parent)) {
4903 		Free(mp);
4904 		return (mdmderror(ep, MDE_INVAL_UNIT, 0, np->cname));
4905 	}
4906 
4907 	if (getenv(META_SP_DEBUG)) {
4908 		meta_sp_debug("meta_sp_attach: Unit structure before new "
4909 		    "space:\n");
4910 		meta_sp_printunit(mp);
4911 	}
4912 
4913 	/*
4914 	 * NOTE: the fast option to metakeyname is 0 as opposed to 1
4915 	 * If this was not the case we would suffer the following
4916 	 * assertion failure:
4917 	 * Assertion failed: type1 != MDT_FAST_META && type1 != MDT_FAST_COMP
4918 	 * file meta_check.x, line 315
4919 	 * I guess this is because we have not "seen" this drive before
4920 	 * and hence hit the failure - this is of course the attach routine
4921 	 */
4922 	if ((compnp = metakeyname(&sp, mp->un_key, 0, ep)) == NULL) {
4923 		Free(mp);
4924 		return (-1);
4925 	}
4926 
4927 	/* metakeyname does not fill in the key. */
4928 	compnp->key = mp->un_key;
4929 
4930 	/* work out the space on the component that we are dealing with */
4931 	count = meta_sp_get_by_component(sp, compnp, &spnlp, 0, ep);
4932 
4933 	/*
4934 	 * see if the component has been soft partitioned yet, or if an
4935 	 * error occurred.
4936 	 */
4937 	if (count == 0) {
4938 		Free(mp);
4939 		return (mdmderror(ep, MDE_NOT_SP, 0, np->cname));
4940 	} else if (count < 0) {
4941 		Free(mp);
4942 		return (-1);
4943 	}
4944 
4945 	/*
4946 	 * seed extlist with reserved space at the beginning of the volume and
4947 	 * enough space for the end watermark.  The end watermark always gets
4948 	 * updated, but if the underlying device changes size it may not be
4949 	 * pointed to until the extent before it is updated.  Since the
4950 	 * end of the reserved space is where the first watermark starts,
4951 	 * the reserved extent should never be marked for updating.
4952 	 */
4953 	if ((start_block = meta_sp_get_start(sp, compnp, ep)) ==
4954 	    MD_DISKADDR_ERROR) {
4955 		Free(mp);
4956 		return (-1);
4957 	}
4958 
4959 	meta_sp_list_insert(NULL, NULL, &extlist, 0ULL, start_block,
4960 	    EXTTYP_RESERVED, 0, 0, meta_sp_cmp_by_offset);
4961 	meta_sp_list_insert(NULL, NULL, &extlist,
4962 	    metagetsize(compnp, ep) - MD_SP_WMSIZE, MD_SP_WMSIZE,
4963 	    EXTTYP_END, 0, EXTFLG_UPDATE, meta_sp_cmp_by_offset);
4964 
4965 	if (meta_sp_extlist_from_namelist(sp, spnlp, &extlist, ep) == -1) {
4966 		Free(mp);
4967 		return (-1);
4968 	}
4969 
4970 	metafreenamelist(spnlp);
4971 
4972 	if (getenv(META_SP_DEBUG)) {
4973 		meta_sp_debug("meta_sp_attach: list of used extents:\n");
4974 		meta_sp_list_dump(extlist);
4975 	}
4976 
4977 	meta_sp_list_freefill(&extlist, metagetsize(compnp, ep));
4978 
4979 	assert(mp->un_numexts >= 1);
4980 	numexts = meta_sp_alloc_by_len(sp, np, &extlist, &grow_len,
4981 	    mp->un_ext[mp->un_numexts - 1].un_poff,
4982 	    (alignment > 0) ? alignment :
4983 	    meta_sp_get_default_alignment(sp, compnp, ep));
4984 
4985 	if (numexts == -1) {
4986 		Free(mp);
4987 		return (mdmderror(ep, MDE_SP_NOSPACE, 0, np->cname));
4988 	}
4989 
4990 	/* allocate new unit structure and copy in old unit */
4991 	if ((new_un = meta_sp_updateunit(np, mp, extlist,
4992 	    grow_len, numexts, ep)) == NULL) {
4993 		Free(mp);
4994 		return (-1);
4995 	}
4996 	Free(mp);
4997 
4998 	/* If running in dryrun mode (-n option), we're done here */
4999 	if ((options & MDCMD_DOIT) == 0) {
5000 		if (options & MDCMD_PRINT) {
5001 			(void) printf(dgettext(TEXT_DOMAIN,
5002 			    "%s: Soft Partition would grow\n"),
5003 			    np->cname);
5004 			(void) fflush(stdout);
5005 		}
5006 		return (0);
5007 	}
5008 
5009 	if (getenv(META_SP_DEBUG)) {
5010 		meta_sp_debug("meta_sp_attach: updated unit structure:\n");
5011 		meta_sp_printunit(new_un);
5012 	}
5013 
5014 	assert(new_un != NULL);
5015 
5016 	(void) memset(&grow_params, 0, sizeof (grow_params));
5017 	if (new_un->c.un_total_blocks > MD_MAX_BLKS_FOR_SMALL_DEVS) {
5018 		grow_params.options = MD_CRO_64BIT;
5019 		new_un->c.un_revision |= MD_64BIT_META_DEV;
5020 	} else {
5021 		grow_params.options = MD_CRO_32BIT;
5022 		new_un->c.un_revision &= ~MD_64BIT_META_DEV;
5023 	}
5024 	grow_params.mnum = MD_SID(new_un);
5025 	grow_params.size = new_un->c.un_size;
5026 	grow_params.mdp = (uintptr_t)new_un;
5027 	MD_SETDRIVERNAME(&grow_params, MD_SP, MD_MIN2SET(grow_params.mnum));
5028 
5029 	if (metaioctl(MD_IOCGROW, &grow_params, &grow_params.mde,
5030 	    np->cname) != 0) {
5031 		(void) mdstealerror(ep, &grow_params.mde);
5032 		return (-1);
5033 	}
5034 
5035 	/* update all watermarks */
5036 
5037 	if ((msp = meta_get_sp(sp, np, ep)) == NULL)
5038 		return (-1);
5039 	if (meta_sp_update_wm(sp, msp, extlist, ep) < 0)
5040 		return (-1);
5041 
5042 
5043 	/* second phase of commit, set status to MD_SP_OK */
5044 	if (meta_sp_setstatus(sp, &(MD_SID(new_un)), 1, MD_SP_OK, ep) < 0)
5045 		return (-1);
5046 
5047 	meta_invalidate_name(np);
5048 
5049 	if (options & MDCMD_PRINT) {
5050 		(void) printf(dgettext(TEXT_DOMAIN,
5051 		    "%s: Soft Partition has been grown\n"),
5052 		    np->cname);
5053 		(void) fflush(stdout);
5054 	}
5055 
5056 	return (0);
5057 }
5058 
5059 /*
5060  * **************************************************************************
5061  *                    Recovery (metarecover) Functions                      *
5062  * **************************************************************************
5063  */
5064 
5065 /*
5066  * FUNCTION:	meta_recover_sp()
5067  * INPUT:	sp	- the name of the set we are recovering on
5068  *		compnp	- name pointer for device we are recovering on
5069  *		argc	- argument count
5070  *		argv	- left over arguments not parsed by metarecover command
5071  *		options	- metarecover options
5072  * OUTPUT:	ep	- return error pointer
5073  * RETURNS:	int	- 0 - success, -1 - error
5074  * PURPOSE:	parse soft partitioning-specific metarecover options and
5075  *		dispatch to the appropriate function to handle recovery.
5076  */
5077 int
5078 meta_recover_sp(
5079 	mdsetname_t	*sp,
5080 	mdname_t	*compnp,
5081 	int		argc,
5082 	char		*argv[],
5083 	mdcmdopts_t	options,
5084 	md_error_t	*ep
5085 )
5086 {
5087 	md_set_desc	*sd;
5088 
5089 	if (argc > 1) {
5090 		(void) meta_cook_syntax(ep, MDE_SYNTAX, compnp->cname,
5091 		    argc, argv);
5092 		return (-1);
5093 	}
5094 
5095 	/*
5096 	 * For a MN set, this operation must be performed on the master
5097 	 * as it is responsible for maintaining the watermarks
5098 	 */
5099 	if (!metaislocalset(sp)) {
5100 		if ((sd = metaget_setdesc(sp, ep)) == NULL)
5101 			return (-1);
5102 		if (MD_MNSET_DESC(sd) && !sd->sd_mn_am_i_master) {
5103 			(void) mddserror(ep, MDE_DS_MASTER_ONLY, sp->setno,
5104 			    sd->sd_mn_master_nodenm, NULL, NULL);
5105 			return (-1);
5106 		}
5107 	}
5108 	if (argc == 0) {
5109 		/*
5110 		 * if no additional arguments are passed, metarecover should
5111 		 * validate both on-disk and metadb structures as well as
5112 		 * checking that both are consistent with each other
5113 		 */
5114 		if (meta_sp_validate_wm(sp, compnp, options, ep) < 0)
5115 			return (-1);
5116 		if (meta_sp_validate_unit(sp, compnp, options, ep) < 0)
5117 			return (-1);
5118 		if (meta_sp_validate_wm_and_unit(sp, compnp, options, ep) < 0)
5119 			return (-1);
5120 	} else if (strcmp(argv[0], "-d") == 0) {
5121 		/*
5122 		 * Ensure that there is no existing valid record for this
5123 		 * soft-partition. If there is we have nothing to do.
5124 		 */
5125 		if (meta_sp_validate_unit(sp, compnp, options, ep) == 0)
5126 			return (-1);
5127 		/* validate and recover from on-disk structures */
5128 		if (meta_sp_validate_wm(sp, compnp, options, ep) < 0)
5129 			return (-1);
5130 		if (meta_sp_recover_from_wm(sp, compnp, options, ep) < 0)
5131 			return (-1);
5132 	} else if (strcmp(argv[0], "-m") == 0) {
5133 		/* validate and recover from metadb structures */
5134 		if (meta_sp_validate_unit(sp, compnp, options, ep) < 0)
5135 			return (-1);
5136 		if (meta_sp_recover_from_unit(sp, compnp, options, ep) < 0)
5137 			return (-1);
5138 	} else {
5139 		/* syntax error */
5140 		(void) meta_cook_syntax(ep, MDE_SYNTAX, compnp->cname,
5141 		    argc, argv);
5142 		return (-1);
5143 	}
5144 
5145 	return (0);
5146 }
5147 
5148 /*
5149  * FUNCTION:	meta_sp_display_exthdr()
5150  * INPUT:	none
5151  * OUTPUT:	none
5152  * RETURNS:	void
5153  * PURPOSE:	print header line for sp_ext_node_t information.  to be used
5154  *		in conjunction with meta_sp_display_ext().
5155  */
5156 static void
5157 meta_sp_display_exthdr(void)
5158 {
5159 	(void) printf("%20s %5s %7s %20s %20s\n",
5160 	    dgettext(TEXT_DOMAIN, "Name"),
5161 	    dgettext(TEXT_DOMAIN, "Seq#"),
5162 	    dgettext(TEXT_DOMAIN, "Type"),
5163 	    dgettext(TEXT_DOMAIN, "Offset"),
5164 	    dgettext(TEXT_DOMAIN, "Length"));
5165 }
5166 
5167 
5168 /*
5169  * FUNCTION:	meta_sp_display_ext()
5170  * INPUT:	ext	- extent to display
5171  * OUTPUT:	none
5172  * RETURNS:	void
5173  * PURPOSE:	print selected fields from sp_ext_node_t.
5174  */
5175 static void
5176 meta_sp_display_ext(sp_ext_node_t *ext)
5177 {
5178 	/* print extent information */
5179 	if (ext->ext_namep != NULL)
5180 		(void) printf("%20s ", ext->ext_namep->cname);
5181 	else
5182 		(void) printf("%20s ", "NONE");
5183 
5184 	(void) printf("%5u ", ext->ext_seq);
5185 
5186 	switch (ext->ext_type) {
5187 	case EXTTYP_ALLOC:
5188 		(void) printf("%7s ", "ALLOC");
5189 		break;
5190 	case EXTTYP_FREE:
5191 		(void) printf("%7s ", "FREE");
5192 		break;
5193 	case EXTTYP_RESERVED:
5194 		(void) printf("%7s ", "RESV");
5195 		break;
5196 	case EXTTYP_END:
5197 		(void) printf("%7s ", "END");
5198 		break;
5199 	default:
5200 		(void) printf("%7s ", "INVLD");
5201 		break;
5202 	}
5203 
5204 	(void) printf("%20llu %20llu\n", ext->ext_offset, ext->ext_length);
5205 }
5206 
5207 
5208 /*
5209  * FUNCTION:	meta_sp_checkseq()
5210  * INPUT:	extlist	- list of extents to be checked
5211  * OUTPUT:	none
5212  * RETURNS:	int	- 0 - success, -1 - error
5213  * PURPOSE:	check soft partition sequence numbers.  this function assumes
5214  *		that a list of extents representing 1 or more soft partitions
5215  *		is passed in sorted in sequence number order.  within a
5216  *		single soft partition, there may not be any missing or
5217  *		duplicate sequence numbers.
5218  */
5219 static int
5220 meta_sp_checkseq(sp_ext_node_t *extlist)
5221 {
5222 	sp_ext_node_t *ext;
5223 
5224 	assert(extlist != NULL);
5225 
5226 	for (ext = extlist;
5227 	    ext->ext_next != NULL && ext->ext_next->ext_type == EXTTYP_ALLOC;
5228 	    ext = ext->ext_next) {
5229 		if (ext->ext_next->ext_namep != NULL &&
5230 		    strcmp(ext->ext_next->ext_namep->cname,
5231 			ext->ext_namep->cname) != 0)
5232 				continue;
5233 
5234 		if (ext->ext_next->ext_seq != ext->ext_seq + 1) {
5235 			(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5236 			    "%s: sequence numbers are "
5237 			    "incorrect: %d should be %d\n"),
5238 			    ext->ext_next->ext_namep->cname,
5239 			    ext->ext_next->ext_seq, ext->ext_seq + 1);
5240 			return (-1);
5241 		}
5242 	}
5243 	return (0);
5244 }
5245 
5246 
5247 /*
5248  * FUNCTION:	meta_sp_resolve_name_conflict()
5249  * INPUT:	sp	- name of set we're are recovering in.
5250  *		old_np	- name pointer of soft partition we found on disk.
5251  * OUTPUT:	new_np	- name pointer for new soft partition name.
5252  *		ep	- error pointer returned.
5253  * RETURNS:	int	- 0 - name not replace, 1 - name replaced, -1 - error
5254  * PURPOSE:	Check to see if the name of one of the soft partitions we found
5255  *		on disk already exists in the metadb.  If so, prompt for a new
5256  *		name.  In addition, we keep a static array of names that
5257  *		will be recovered from this device since these names don't
5258  *		exist in the configuration at this point but cannot be
5259  *		recovered more than once.
5260  */
5261 static int
5262 meta_sp_resolve_name_conflict(
5263 	mdsetname_t	*sp,
5264 	mdname_t	*old_np,
5265 	mdname_t	**new_np,
5266 	md_error_t	*ep
5267 )
5268 {
5269 	char		yesno[255];
5270 	char		*yes;
5271 	char		newname[MD_SP_MAX_DEVNAME_PLUS_1];
5272 	int		nunits;
5273 	static int	*used_names = NULL;
5274 
5275 	assert(old_np != NULL);
5276 
5277 	if (used_names == NULL) {
5278 		if ((nunits = meta_get_nunits(ep)) < 0)
5279 			return (-1);
5280 		used_names = Zalloc(nunits * sizeof (int));
5281 	}
5282 
5283 	/* see if it exists already */
5284 	if (used_names[MD_MIN2UNIT(meta_getminor(old_np->dev))] == 0 &&
5285 	    metagetmiscname(old_np, ep) == NULL) {
5286 		if (! mdismderror(ep, MDE_UNIT_NOT_SETUP))
5287 			return (-1);
5288 		else {
5289 			used_names[MD_MIN2UNIT(meta_getminor(old_np->dev))] = 1;
5290 			mdclrerror(ep);
5291 			return (0);
5292 		}
5293 	}
5294 
5295 	/* name exists, ask the user for a new one */
5296 	(void) printf(dgettext(TEXT_DOMAIN,
5297 	    "WARNING: A soft partition named %s was found in the extent\n"
5298 	    "headers, but this name already exists in the metadb "
5299 	    "configuration.\n"
5300 	    "In order to continue recovery you must supply\n"
5301 	    "a new name for this soft partition.\n"), old_np->cname);
5302 	(void) printf(dgettext(TEXT_DOMAIN,
5303 	    "Would you like to continue and supply a new name? (yes/no) "));
5304 
5305 	(void) fflush(stdout);
5306 	if ((fgets(yesno, sizeof (yesno), stdin) == NULL) ||
5307 	    (strlen(yesno) == 1))
5308 		(void) snprintf(yesno, sizeof (yesno), "%s\n",
5309 		    dgettext(TEXT_DOMAIN, "no"));
5310 	yes = dgettext(TEXT_DOMAIN, "yes");
5311 	if (strncasecmp(yesno, yes, strlen(yesno) - 1) != 0) {
5312 		return (-1);
5313 	}
5314 
5315 	(void) fflush(stdin);
5316 
5317 	/* get the new name */
5318 	for (;;) {
5319 		(void) printf(dgettext(TEXT_DOMAIN, "Please enter a new name "
5320 		    "for this soft partition (dXXXX) "));
5321 		(void) fflush(stdout);
5322 		if (fgets(newname, MD_SP_MAX_DEVNAME_PLUS_1, stdin) == NULL)
5323 			(void) strcpy(newname, "");
5324 
5325 		/* remove newline character */
5326 		if (newname[strlen(newname) - 1] == '\n')
5327 			newname[strlen(newname) - 1] = '\0';
5328 
5329 		if (!(is_metaname(newname)) ||
5330 		    (meta_init_make_device(&sp, newname, ep) <= 0)) {
5331 			(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5332 			    "Invalid metadevice name\n"));
5333 			(void) fflush(stderr);
5334 			continue;
5335 		}
5336 
5337 		if ((*new_np = metaname(&sp, newname,
5338 		    META_DEVICE, ep)) == NULL) {
5339 			(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5340 			    "Invalid metadevice name\n"));
5341 			(void) fflush(stderr);
5342 			continue;
5343 		}
5344 
5345 		assert(MD_MIN2UNIT(meta_getminor((*new_np)->dev)) < nunits);
5346 		/* make sure the name isn't already being used */
5347 		if (used_names[MD_MIN2UNIT(meta_getminor((*new_np)->dev))] ||
5348 		    metagetmiscname(*new_np, ep) != NULL) {
5349 			(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5350 			    "That name already exists\n"));
5351 			continue;
5352 		} else if (! mdismderror(ep, MDE_UNIT_NOT_SETUP))
5353 			return (-1);
5354 
5355 		break;
5356 	}
5357 
5358 	/* got a new name, place in used array and return */
5359 	used_names[MD_MIN2UNIT(meta_getminor((*new_np)->dev))] = 1;
5360 	mdclrerror(ep);
5361 	return (1);
5362 }
5363 
5364 /*
5365  * FUNCTION:	meta_sp_validate_wm()
5366  * INPUT:	sp	- set name we are recovering in
5367  *		compnp	- name pointer for device we are recovering from
5368  *		options	- metarecover options
5369  * OUTPUT:	ep	- error pointer returned
5370  * RETURNS:	int	- 0 - success, -1 - error
5371  * PURPOSE:	validate and display watermark configuration.  walk the
5372  *		on-disk watermark structures and validate the information
5373  *		found within.  since a watermark configuration is
5374  *		"self-defining", the act of traversing the watermarks
5375  *		is part of the validation process.
5376  */
5377 static int
5378 meta_sp_validate_wm(
5379 	mdsetname_t	*sp,
5380 	mdname_t	*compnp,
5381 	mdcmdopts_t	options,
5382 	md_error_t	*ep
5383 )
5384 {
5385 	sp_ext_node_t	*extlist = NULL;
5386 	sp_ext_node_t	*ext;
5387 	int		num_sps = 0;
5388 	int		rval;
5389 
5390 	if ((options & MDCMD_VERBOSE) != 0)
5391 		(void) printf(dgettext(TEXT_DOMAIN,
5392 		    "Verifying on-disk structures on %s.\n"),
5393 		    compnp->cname);
5394 
5395 	/*
5396 	 * for each watermark, build an ext_node, place on list.
5397 	 */
5398 	rval = meta_sp_extlist_from_wm(sp, compnp, &extlist,
5399 	    meta_sp_cmp_by_nameseq, ep);
5400 
5401 	if ((options & MDCMD_VERBOSE) != 0) {
5402 		/* print out what we found */
5403 		if (extlist == NULL)
5404 			(void) printf(dgettext(TEXT_DOMAIN,
5405 			    "No extent headers found on %s.\n"),
5406 			    compnp->cname);
5407 		else {
5408 			(void) printf(dgettext(TEXT_DOMAIN,
5409 			    "The following extent headers were found on %s.\n"),
5410 			    compnp->cname);
5411 			meta_sp_display_exthdr();
5412 		}
5413 		for (ext = extlist; ext != NULL; ext = ext->ext_next)
5414 			meta_sp_display_ext(ext);
5415 	}
5416 
5417 	if (rval < 0) {
5418 		(void) printf(dgettext(TEXT_DOMAIN,
5419 		    "%s: On-disk structures invalid or "
5420 		    "no soft partitions found.\n"),
5421 		    compnp->cname);
5422 		return (-1);
5423 	}
5424 
5425 	assert(extlist != NULL);
5426 
5427 	/* count number of soft partitions */
5428 	for (ext = extlist;
5429 	    ext != NULL && ext->ext_type == EXTTYP_ALLOC;
5430 	    ext = ext->ext_next) {
5431 		if (ext->ext_next != NULL &&
5432 		    ext->ext_next->ext_namep != NULL &&
5433 		    strcmp(ext->ext_next->ext_namep->cname,
5434 			ext->ext_namep->cname) == 0)
5435 				continue;
5436 		num_sps++;
5437 	}
5438 
5439 	if ((options & MDCMD_VERBOSE) != 0)
5440 		(void) printf(dgettext(TEXT_DOMAIN,
5441 		    "Found %d soft partition(s) on %s.\n"), num_sps,
5442 		    compnp->cname);
5443 
5444 	if (num_sps == 0) {
5445 		(void) printf(dgettext(TEXT_DOMAIN,
5446 		    "%s: No soft partitions.\n"), compnp->cname);
5447 		return (mdmderror(ep, MDE_RECOVER_FAILED, 0, compnp->cname));
5448 	}
5449 
5450 	/* check sequence numbers */
5451 	if ((options & MDCMD_VERBOSE) != 0)
5452 		(void) printf(dgettext(TEXT_DOMAIN,
5453 		    "Checking sequence numbers.\n"));
5454 
5455 	if (meta_sp_checkseq(extlist) != 0)
5456 		return (mdmderror(ep, MDE_RECOVER_FAILED, 0, compnp->cname));
5457 
5458 	return (0);
5459 }
5460 
5461 /*
5462  * FUNCTION:	meta_sp_validate_unit()
5463  * INPUT:	sp	- name of set we are recovering in
5464  *		compnp	- name of component we are recovering from
5465  *		options	- metarecover options
5466  * OUTPUT:	ep	- error pointer returned
5467  * RETURNS:	int	- 0 - success, -1 - error
5468  * PURPOSE:	validate and display metadb configuration.  begin by getting
5469  *		all soft partitions built on the specified component.  get
5470  *		the unit structure for each one and validate the fields within.
5471  */
5472 static int
5473 meta_sp_validate_unit(
5474 	mdsetname_t	*sp,
5475 	mdname_t	*compnp,
5476 	mdcmdopts_t	options,
5477 	md_error_t	*ep
5478 )
5479 {
5480 	md_sp_t		*msp;
5481 	mdnamelist_t	*spnlp = NULL;
5482 	mdnamelist_t	*namep = NULL;
5483 	int		count;
5484 	uint_t		extn;
5485 	sp_ext_length_t	size;
5486 
5487 	if ((options & MDCMD_VERBOSE) != 0)
5488 		(void) printf(dgettext(TEXT_DOMAIN,
5489 		    "%s: Validating soft partition metadb entries.\n"),
5490 		    compnp->cname);
5491 
5492 	if ((size = metagetsize(compnp, ep)) == MD_DISKADDR_ERROR)
5493 		return (-1);
5494 
5495 	/* get all soft partitions on component */
5496 	count = meta_sp_get_by_component(sp, compnp, &spnlp, 0, ep);
5497 
5498 	if (count == 0) {
5499 		(void) printf(dgettext(TEXT_DOMAIN,
5500 		    "%s: No soft partitions.\n"), compnp->cname);
5501 		return (mdmderror(ep, MDE_RECOVER_FAILED, 0, compnp->cname));
5502 	} else if (count < 0) {
5503 		return (-1);
5504 	}
5505 
5506 	/* Now go through the soft partitions and check each one */
5507 	for (namep = spnlp; namep != NULL; namep = namep->next) {
5508 		mdname_t	*curnp = namep->namep;
5509 		sp_ext_offset_t	curvoff;
5510 
5511 		/* get the unit structure */
5512 		if ((msp = meta_get_sp_common(sp, curnp, 0, ep)) == NULL)
5513 			return (-1);
5514 
5515 		/* verify generic unit structure parameters */
5516 		if ((options & MDCMD_VERBOSE) != 0)
5517 			(void) printf(dgettext(TEXT_DOMAIN,
5518 			    "\nVerifying device %s.\n"),
5519 			    curnp->cname);
5520 
5521 		/*
5522 		 * MD_SP_LAST is an invalid state and is always the
5523 		 * highest numbered.
5524 		 */
5525 		if (msp->status >= MD_SP_LAST) {
5526 			(void) printf(dgettext(TEXT_DOMAIN,
5527 			    "%s: status value %u is out of range.\n"),
5528 			    curnp->cname, msp->status);
5529 			return (mdmderror(ep, MDE_RECOVER_FAILED,
5530 			    0, curnp->cname));
5531 		} else if ((options & MDCMD_VERBOSE) != 0) {
5532 			uint_t	tstate = 0;
5533 
5534 			if (metaismeta(msp->compnamep)) {
5535 				if (meta_get_tstate(msp->common.namep->dev,
5536 				    &tstate, ep) != 0)
5537 					return (-1);
5538 			}
5539 			(void) printf(dgettext(TEXT_DOMAIN,
5540 			    "%s: Status \"%s\" is valid.\n"),
5541 			    curnp->cname, meta_sp_status_to_name(msp->status,
5542 			    tstate & MD_DEV_ERRORED));
5543 		}
5544 
5545 		/* Now verify each extent */
5546 		if ((options & MDCMD_VERBOSE) != 0)
5547 			(void) printf("%14s %21s %21s %21s\n",
5548 			    dgettext(TEXT_DOMAIN, "Extent Number"),
5549 			    dgettext(TEXT_DOMAIN, "Virtual Offset"),
5550 			    dgettext(TEXT_DOMAIN, "Physical Offset"),
5551 			    dgettext(TEXT_DOMAIN, "Length"));
5552 
5553 		curvoff = 0ULL;
5554 		for (extn = 0; extn < msp->ext.ext_len; extn++) {
5555 			md_sp_ext_t	*extp = &msp->ext.ext_val[extn];
5556 
5557 			if ((options & MDCMD_VERBOSE) != 0)
5558 				(void) printf("%14u %21llu %21llu %21llu\n",
5559 				    extn, extp->voff, extp->poff, extp->len);
5560 
5561 			if (extp->voff != curvoff) {
5562 				(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5563 				    "%s: virtual offset for extent %u "
5564 				    "is inconsistent, expected %llu, "
5565 				    "got %llu.\n"), curnp->cname, extn,
5566 				    curvoff, extp->voff);
5567 				return (mdmderror(ep, MDE_RECOVER_FAILED,
5568 				    0, compnp->cname));
5569 			}
5570 
5571 			/* make sure extent does not drop off the end */
5572 			if ((extp->poff + extp->len) == size) {
5573 				(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5574 				    "%s: extent %u at offset %llu, "
5575 				    "length %llu exceeds the size of the "
5576 				    "device, %llu.\n"), curnp->cname,
5577 				    extn, extp->poff, extp->len, size);
5578 				return (mdmderror(ep, MDE_RECOVER_FAILED,
5579 				    0, compnp->cname));
5580 			}
5581 
5582 			curvoff += extp->len;
5583 		}
5584 	}
5585 	if (options & MDCMD_PRINT) {
5586 		(void) printf(dgettext(TEXT_DOMAIN,
5587 		    "%s: Soft Partition metadb configuration is valid\n"),
5588 		    compnp->cname);
5589 	}
5590 	return (0);
5591 }
5592 
5593 /*
5594  * FUNCTION:	meta_sp_validate_wm_and_unit()
5595  * INPUT:	sp	- name of set we are recovering in
5596  *		compnp	- name of device we are recovering from
5597  *		options	- metarecover options
5598  * OUTPUT:	ep	- error pointer returned
5599  * RETURNS:	int	- 0 - success, -1 error
5600  * PURPOSE:	cross-validate and display watermarks and metadb records.
5601  *		get both the unit structures for the soft partitions built
5602  *		on the specified component and the watermarks found on that
5603  *		component and check to make sure they are consistent with
5604  *		each other.
5605  */
5606 static int
5607 meta_sp_validate_wm_and_unit(
5608 	mdsetname_t	*sp,
5609 	mdname_t	*np,
5610 	mdcmdopts_t	options,
5611 	md_error_t	*ep
5612 )
5613 {
5614 	sp_ext_node_t	*wmlist = NULL;
5615 	sp_ext_node_t	*unitlist = NULL;
5616 	sp_ext_node_t	*unitext;
5617 	sp_ext_node_t	*wmext;
5618 	sp_ext_offset_t	tmpunitoff;
5619 	mdnamelist_t	*spnlp = NULL;
5620 	int		count;
5621 	int		rval = 0;
5622 	int		verbose = (options & MDCMD_VERBOSE);
5623 
5624 	/* get unit structure list */
5625 	count = meta_sp_get_by_component(sp, np, &spnlp, 0, ep);
5626 	if (count <= 0)
5627 		return (-1);
5628 
5629 	meta_sp_list_insert(NULL, NULL, &unitlist,
5630 	    metagetsize(np, ep) - MD_SP_WMSIZE, MD_SP_WMSIZE,
5631 	    EXTTYP_END, 0, EXTFLG_UPDATE, meta_sp_cmp_by_offset);
5632 
5633 	if (meta_sp_extlist_from_namelist(sp, spnlp, &unitlist, ep) == -1) {
5634 		metafreenamelist(spnlp);
5635 		return (-1);
5636 	}
5637 
5638 	metafreenamelist(spnlp);
5639 
5640 	meta_sp_list_freefill(&unitlist, metagetsize(np, ep));
5641 
5642 	if (meta_sp_extlist_from_wm(sp, np, &wmlist,
5643 	    meta_sp_cmp_by_offset, ep) < 0) {
5644 		meta_sp_list_free(&unitlist);
5645 		return (-1);
5646 	}
5647 
5648 	if (getenv(META_SP_DEBUG)) {
5649 		meta_sp_debug("meta_sp_validate_wm_and_unit: unit list:\n");
5650 		meta_sp_list_dump(unitlist);
5651 		meta_sp_debug("meta_sp_validate_wm_and_unit: wm list:\n");
5652 		meta_sp_list_dump(wmlist);
5653 	}
5654 
5655 	/*
5656 	 * step through both lists and compare allocated nodes.  Free
5657 	 * nodes and end watermarks may differ between the two but
5658 	 * that's generally ok, and if they're wrong will typically
5659 	 * cause misplaced allocated extents.
5660 	 */
5661 	if (verbose)
5662 		(void) printf(dgettext(TEXT_DOMAIN, "\n%s: Verifying metadb "
5663 		    "allocations match extent headers.\n"), np->cname);
5664 
5665 	unitext = unitlist;
5666 	wmext = wmlist;
5667 	while ((wmext != NULL) && (unitext != NULL)) {
5668 		/* find next allocated extents in each list */
5669 		while (wmext != NULL && wmext->ext_type != EXTTYP_ALLOC)
5670 			wmext = wmext->ext_next;
5671 
5672 		while (unitext != NULL && unitext->ext_type != EXTTYP_ALLOC)
5673 			unitext = unitext->ext_next;
5674 
5675 		if (wmext == NULL || unitext == NULL)
5676 			break;
5677 
5678 		if (verbose) {
5679 			(void) printf(dgettext(TEXT_DOMAIN,
5680 			    "Metadb extent:\n"));
5681 			meta_sp_display_exthdr();
5682 			meta_sp_display_ext(unitext);
5683 			(void) printf(dgettext(TEXT_DOMAIN,
5684 			    "Extent header extent:\n"));
5685 			meta_sp_display_exthdr();
5686 			meta_sp_display_ext(wmext);
5687 			(void) printf("\n");
5688 		}
5689 
5690 		if (meta_sp_validate_exts(np, wmext, unitext, ep) < 0)
5691 			rval = -1;
5692 
5693 		/*
5694 		 * if the offsets aren't equal, only increment the
5695 		 * lowest one in hopes of getting the lists back in sync.
5696 		 */
5697 		tmpunitoff = unitext->ext_offset;
5698 		if (unitext->ext_offset <= wmext->ext_offset)
5699 			unitext = unitext->ext_next;
5700 		if (wmext->ext_offset <= tmpunitoff)
5701 			wmext = wmext->ext_next;
5702 	}
5703 
5704 	/*
5705 	 * if both lists aren't at the end then there are extra
5706 	 * allocated nodes in one of them.
5707 	 */
5708 	if (wmext != NULL) {
5709 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5710 		    "%s: extent headers contain allocations not in "
5711 		    "the metadb\n\n"), np->cname);
5712 		rval = -1;
5713 	}
5714 
5715 	if (unitext != NULL) {
5716 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5717 		    "%s: metadb contains allocations not in the extent "
5718 		    "headers\n\n"), np->cname);
5719 		rval = -1;
5720 	}
5721 
5722 	if (options & MDCMD_PRINT) {
5723 		if (rval == 0) {
5724 			(void) printf(dgettext(TEXT_DOMAIN,
5725 			    "%s: Soft Partition metadb matches extent "
5726 			    "header configuration\n"), np->cname);
5727 		} else {
5728 			(void) printf(dgettext(TEXT_DOMAIN,
5729 			    "%s: Soft Partition metadb does not match extent "
5730 			    "header configuration\n"), np->cname);
5731 		}
5732 	}
5733 
5734 	return (rval);
5735 }
5736 
5737 /*
5738  * FUNCTION:	meta_sp_validate_exts()
5739  * INPUT:	compnp	- name pointer for device we are recovering from
5740  *		wmext	- extent node representing watermark
5741  *		unitext	- extent node from unit structure
5742  * OUTPUT:	ep	- return error pointer
5743  * RETURNS:	int	- 0 - succes, mdmderror return code - error
5744  * PURPOSE:	Takes two extent nodes and checks them against each other.
5745  *		offset, length, sequence number, set, and name are compared.
5746  */
5747 static int
5748 meta_sp_validate_exts(
5749 	mdname_t	*compnp,
5750 	sp_ext_node_t	*wmext,
5751 	sp_ext_node_t	*unitext,
5752 	md_error_t	*ep
5753 )
5754 {
5755 	if (wmext->ext_offset != unitext->ext_offset) {
5756 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5757 		    "%s: unit structure and extent header offsets differ.\n"),
5758 		    compnp->cname);
5759 		return (mdmderror(ep, MDE_RECOVER_FAILED, 0, compnp->cname));
5760 	}
5761 
5762 	if (wmext->ext_length != unitext->ext_length) {
5763 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5764 		    "%s: unit structure and extent header lengths differ.\n"),
5765 		    compnp->cname);
5766 		return (mdmderror(ep, MDE_RECOVER_FAILED, 0, compnp->cname));
5767 	}
5768 
5769 	if (wmext->ext_seq != unitext->ext_seq) {
5770 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5771 		    "%s: unit structure and extent header sequence numbers "
5772 		    "differ.\n"), compnp->cname);
5773 		return (mdmderror(ep, MDE_RECOVER_FAILED, 0, compnp->cname));
5774 	}
5775 
5776 	if (wmext->ext_type != unitext->ext_type) {
5777 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5778 		    "%s: unit structure and extent header types differ.\n"),
5779 		    compnp->cname);
5780 		return (mdmderror(ep, MDE_RECOVER_FAILED, 0, compnp->cname));
5781 	}
5782 
5783 	/*
5784 	 * If one has a set pointer and the other doesn't, error.
5785 	 * If both extents have setnames, then make sure they match
5786 	 * If both are NULL, it's ok, they match.
5787 	 */
5788 	if ((unitext->ext_setp == NULL) ^ (wmext->ext_setp == NULL)) {
5789 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5790 		    "%s: unit structure and extent header set values "
5791 		    "differ.\n"), compnp->cname);
5792 		return (mdmderror(ep, MDE_RECOVER_FAILED, 0, compnp->cname));
5793 	}
5794 
5795 	if (unitext->ext_setp != NULL) {
5796 		if (strcmp(unitext->ext_setp->setname,
5797 		    wmext->ext_setp->setname) != 0) {
5798 			(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5799 			    "%s: unit structure and extent header set names "
5800 			    "differ.\n"), compnp->cname);
5801 			return (mdmderror(ep, MDE_RECOVER_FAILED,
5802 			    0, compnp->cname));
5803 		}
5804 	}
5805 
5806 	/*
5807 	 * If one has a name pointer and the other doesn't, error.
5808 	 * If both extents have names, then make sure they match
5809 	 * If both are NULL, it's ok, they match.
5810 	 */
5811 	if ((unitext->ext_namep == NULL) ^ (wmext->ext_namep == NULL)) {
5812 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5813 		    "%s: unit structure and extent header name values "
5814 		    "differ.\n"), compnp->cname);
5815 		return (mdmderror(ep, MDE_RECOVER_FAILED, 0, compnp->cname));
5816 	}
5817 
5818 	if (unitext->ext_namep != NULL) {
5819 		if (strcmp(wmext->ext_namep->cname,
5820 		    unitext->ext_namep->cname) != 0) {
5821 			(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5822 			    "%s: unit structure and extent header names "
5823 			    "differ.\n"), compnp->cname);
5824 			return (mdmderror(ep, MDE_RECOVER_FAILED,
5825 			    0, compnp->cname));
5826 		}
5827 	}
5828 
5829 	return (0);
5830 }
5831 
5832 /*
5833  * FUNCTION:	update_sp_status()
5834  * INPUT:	sp	- name of set we are recovering in
5835  *		minors	- pointer to an array of soft partition minor numbers
5836  *		num_sps	- number of minor numbers in array
5837  *		status	- new status to be applied to all soft parts in array
5838  *		mn_set	- set if current set is a multi-node set
5839  * OUTPUT:	ep	- return error pointer
5840  * RETURNS:	int	- 0 - success, -1 - error
5841  * PURPOSE:	update  status of soft partitions to new status. minors is an
5842  *		array of minor numbers to apply the new status to.
5843  *		If mn_set is set, a message is sent to all nodes in the
5844  *		cluster to update the status locally.
5845  */
5846 static int
5847 update_sp_status(
5848 	mdsetname_t	*sp,
5849 	minor_t		*minors,
5850 	int		num_sps,
5851 	sp_status_t	status,
5852 	bool_t		mn_set,
5853 	md_error_t	*ep
5854 )
5855 {
5856 	int	i;
5857 	int	err = 0;
5858 
5859 	if (mn_set) {
5860 		md_mn_msg_sp_setstat_t	sp_setstat_params;
5861 		int			result;
5862 		md_mn_result_t		*resp = NULL;
5863 
5864 		for (i = 0; i < num_sps; i++) {
5865 			sp_setstat_params.sp_setstat_mnum = minors[i];
5866 			sp_setstat_params.sp_setstat_status = status;
5867 
5868 			result = mdmn_send_message(sp->setno,
5869 			    MD_MN_MSG_SP_SETSTAT, MD_MSGF_DEFAULT_FLAGS,
5870 			    (char *)&sp_setstat_params,
5871 			    sizeof (sp_setstat_params),
5872 			    &resp, ep);
5873 			if (resp != NULL) {
5874 				if (resp->mmr_exitval != 0)
5875 					err = -1;
5876 				free_result(resp);
5877 			}
5878 			if (result != 0) {
5879 				err = -1;
5880 			}
5881 		}
5882 	} else {
5883 		if (meta_sp_setstatus(sp, minors, num_sps, status, ep) < 0)
5884 			err = -1;
5885 	}
5886 	if (err < 0) {
5887 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5888 		    "Error updating status on recovered soft "
5889 		    "partitions.\n"));
5890 	}
5891 	return (err);
5892 }
5893 
5894 /*
5895  * FUNCTION:	meta_sp_recover_from_wm()
5896  * INPUT:	sp	- name of set we are recovering in
5897  *		compnp	- name pointer for component we are recovering from
5898  *		options	- metarecover options
5899  * OUTPUT:	ep	- return error pointer
5900  * RETURNS:	int	- 0 - success, -1 - error
5901  * PURPOSE:	update metadb records to match watermarks.  begin by getting
5902  *		an extlist representing all soft partitions on the component.
5903  *		then build a unit structure for each soft partition.
5904  *		notify user of changes, then commit each soft partition to
5905  *		the metadb one at a time in the "recovering" state.  update
5906  *		any watermarks that may need it	(to reflect possible name
5907  *		changes), and, finally, set the status of all recovered
5908  *		partitions to the "OK" state at once.
5909  */
5910 static int
5911 meta_sp_recover_from_wm(
5912 	mdsetname_t	*sp,
5913 	mdname_t	*compnp,
5914 	mdcmdopts_t	options,
5915 	md_error_t	*ep
5916 )
5917 {
5918 	sp_ext_node_t		*extlist = NULL;
5919 	sp_ext_node_t		*sp_list = NULL;
5920 	sp_ext_node_t		*update_list = NULL;
5921 	sp_ext_node_t		*ext;
5922 	sp_ext_node_t		*sp_ext;
5923 	mp_unit_t		*mp;
5924 	mp_unit_t		**un_array;
5925 	int			numexts = 0, num_sps = 0, i = 0;
5926 	int			err = 0;
5927 	int			not_recovered = 0;
5928 	int			committed = 0;
5929 	sp_ext_length_t		sp_length = 0LL;
5930 	mdnamelist_t		*keynlp = NULL;
5931 	mdname_t		*np;
5932 	mdname_t		*new_np;
5933 	int			new_name;
5934 	md_set_params_t		set_params;
5935 	minor_t			*minors = NULL;
5936 	char			yesno[255];
5937 	char			*yes;
5938 	bool_t			mn_set = 0;
5939 	md_set_desc		*sd;
5940 	mm_unit_t		*mm;
5941 	md_set_mmown_params_t	*ownpar = NULL;
5942 	int			comp_is_mirror = 0;
5943 
5944 	/*
5945 	 * if this component appears in another metadevice already, do
5946 	 * NOT recover from it.
5947 	 */
5948 	if (meta_check_inmeta(sp, compnp, options, 0, -1, ep) != 0)
5949 		return (-1);
5950 
5951 	/* set flag if dealing with a MN set */
5952 	if (!metaislocalset(sp)) {
5953 		if ((sd = metaget_setdesc(sp, ep)) == NULL) {
5954 			return (-1);
5955 		}
5956 		if (MD_MNSET_DESC(sd))
5957 			mn_set = 1;
5958 	}
5959 	/*
5960 	 * for each watermark, build an ext_node, place on list.
5961 	 */
5962 	if (meta_sp_extlist_from_wm(sp, compnp, &extlist,
5963 	    meta_sp_cmp_by_nameseq, ep) < 0)
5964 		return (mdmderror(ep, MDE_RECOVER_FAILED, 0, compnp->cname));
5965 
5966 	assert(extlist != NULL);
5967 
5968 	/* count number of soft partitions */
5969 	for (ext = extlist;
5970 	    ext != NULL && ext->ext_type == EXTTYP_ALLOC;
5971 	    ext = ext->ext_next) {
5972 		if (ext->ext_next != NULL &&
5973 		    ext->ext_next->ext_namep != NULL &&
5974 		    strcmp(ext->ext_next->ext_namep->cname,
5975 			ext->ext_namep->cname) == 0)
5976 				continue;
5977 		num_sps++;
5978 	}
5979 
5980 	/* allocate array of unit structure pointers */
5981 	un_array = Zalloc(num_sps * sizeof (mp_unit_t *));
5982 
5983 	/*
5984 	 * build unit structures from list of ext_nodes.
5985 	 */
5986 	for (ext = extlist;
5987 	    ext != NULL && ext->ext_type == EXTTYP_ALLOC;
5988 	    ext = ext->ext_next) {
5989 		meta_sp_list_insert(ext->ext_setp, ext->ext_namep,
5990 		    &sp_list, ext->ext_offset, ext->ext_length,
5991 		    ext->ext_type, ext->ext_seq, ext->ext_flags,
5992 		    meta_sp_cmp_by_nameseq);
5993 
5994 		numexts++;
5995 		sp_length += ext->ext_length - MD_SP_WMSIZE;
5996 
5997 		if (ext->ext_next != NULL &&
5998 		    ext->ext_next->ext_namep != NULL &&
5999 		    strcmp(ext->ext_next->ext_namep->cname,
6000 			ext->ext_namep->cname) == 0)
6001 				continue;
6002 
6003 		/*
6004 		 * if we made it here, we are at a soft partition
6005 		 * boundary in the list.
6006 		 */
6007 		if (getenv(META_SP_DEBUG)) {
6008 			meta_sp_debug("meta_recover_from_wm: dumping wm "
6009 			    "list:\n");
6010 			meta_sp_list_dump(sp_list);
6011 		}
6012 
6013 		assert(sp_list != NULL);
6014 		assert(sp_list->ext_namep != NULL);
6015 
6016 		if ((new_name = meta_sp_resolve_name_conflict(sp,
6017 		    sp_list->ext_namep, &new_np, ep)) < 0) {
6018 			err = 1;
6019 			goto out;
6020 		} else if (new_name) {
6021 			for (sp_ext = sp_list;
6022 			    sp_ext != NULL;
6023 			    sp_ext = sp_ext->ext_next) {
6024 				/*
6025 				 * insert into the update list for
6026 				 * watermark update.
6027 				 */
6028 				meta_sp_list_insert(sp_ext->ext_setp,
6029 				    new_np, &update_list, sp_ext->ext_offset,
6030 				    sp_ext->ext_length, sp_ext->ext_type,
6031 				    sp_ext->ext_seq, EXTFLG_UPDATE,
6032 				    meta_sp_cmp_by_offset);
6033 			}
6034 
6035 		}
6036 		if (options & MDCMD_DOIT) {
6037 			/* store name in namespace */
6038 			if (mn_set) {
6039 				/* send message to all nodes to return key */
6040 				md_mn_msg_addkeyname_t	*send_params;
6041 				int			result;
6042 				md_mn_result_t		*resp = NULL;
6043 				int			message_size;
6044 
6045 				message_size =  sizeof (*send_params) +
6046 				    strlen(compnp->cname) + 1;
6047 				send_params = Zalloc(message_size);
6048 				send_params->addkeyname_setno = sp->setno;
6049 				(void) strcpy(&send_params->addkeyname_name[0],
6050 				    compnp->cname);
6051 				result = mdmn_send_message(sp->setno,
6052 				    MD_MN_MSG_ADDKEYNAME, MD_MSGF_DEFAULT_FLAGS,
6053 				    (char *)send_params, message_size, &resp,
6054 				    ep);
6055 				Free(send_params);
6056 				if (resp != NULL) {
6057 					if (resp->mmr_exitval >= 0) {
6058 						compnp->key =
6059 						    (mdkey_t)resp->mmr_exitval;
6060 					} else {
6061 						err = 1;
6062 						free_result(resp);
6063 						goto out;
6064 					}
6065 					free_result(resp);
6066 				}
6067 				if (result != 0) {
6068 					err = 1;
6069 					goto out;
6070 				}
6071 				(void) metanamelist_append(&keynlp, compnp);
6072 			} else {
6073 				if (add_key_name(sp, compnp, &keynlp,
6074 				    ep) != 0) {
6075 					err = 1;
6076 					goto out;
6077 				}
6078 			}
6079 		}
6080 
6081 		/* create the unit structure */
6082 		if ((mp = meta_sp_createunit(
6083 		    (new_name) ? new_np : sp_list->ext_namep, compnp,
6084 		    sp_list, numexts, sp_length, MD_SP_RECOVER, ep)) == NULL) {
6085 			err = 1;
6086 			goto out;
6087 		}
6088 
6089 		if (getenv(META_SP_DEBUG)) {
6090 			meta_sp_debug("meta_sp_recover_from_wm: "
6091 			    "printing newly created unit structure");
6092 			meta_sp_printunit(mp);
6093 		}
6094 
6095 		/* place in unit structure array */
6096 		un_array[i++] = mp;
6097 
6098 		/* free sp_list */
6099 		meta_sp_list_free(&sp_list);
6100 		sp_list = NULL;
6101 		numexts = 0;
6102 		sp_length = 0LL;
6103 	}
6104 
6105 	/* display configuration updates */
6106 	(void) printf(dgettext(TEXT_DOMAIN,
6107 	    "The following soft partitions were found and will be added to\n"
6108 	    "your metadevice configuration.\n"));
6109 	(void) printf("%5s %15s %18s\n",
6110 	    dgettext(TEXT_DOMAIN, "Name"),
6111 	    dgettext(TEXT_DOMAIN, "Size"),
6112 	    dgettext(TEXT_DOMAIN, "No. of Extents"));
6113 	for (i = 0; i < num_sps; i++) {
6114 		(void) printf("%5s%lu %15llu %9d\n", "d",
6115 		    MD_MIN2UNIT(MD_SID(un_array[i])),
6116 		    un_array[i]->un_length, un_array[i]->un_numexts);
6117 	}
6118 
6119 	if (!(options & MDCMD_DOIT)) {
6120 		not_recovered = 1;
6121 		goto out;
6122 	}
6123 
6124 	/* ask user for confirmation */
6125 	(void) printf(dgettext(TEXT_DOMAIN,
6126 	    "WARNING: You are about to add one or more soft partition\n"
6127 	    "metadevices to your metadevice configuration.  If there\n"
6128 	    "appears to be an error in the soft partition(s) displayed\n"
6129 	    "above, do NOT proceed with this recovery operation.\n"));
6130 	(void) printf(dgettext(TEXT_DOMAIN,
6131 	    "Are you sure you want to do this (yes/no)? "));
6132 
6133 	(void) fflush(stdout);
6134 	if ((fgets(yesno, sizeof (yesno), stdin) == NULL) ||
6135 	    (strlen(yesno) == 1))
6136 		(void) snprintf(yesno, sizeof (yesno), "%s\n",
6137 		    dgettext(TEXT_DOMAIN, "no"));
6138 	yes = dgettext(TEXT_DOMAIN, "yes");
6139 	if (strncasecmp(yesno, yes, strlen(yesno) - 1) != 0) {
6140 		not_recovered = 1;
6141 		goto out;
6142 	}
6143 
6144 	/* commit records one at a time */
6145 	for (i = 0; i < num_sps; i++) {
6146 		(void) memset(&set_params, 0, sizeof (set_params));
6147 		set_params.mnum = MD_SID(un_array[i]);
6148 		set_params.size = (un_array[i])->c.un_size;
6149 		set_params.mdp = (uintptr_t)(un_array[i]);
6150 		set_params.options =
6151 				meta_check_devicesize(un_array[i]->un_length);
6152 		if (set_params.options == MD_CRO_64BIT) {
6153 			un_array[i]->c.un_revision |= MD_64BIT_META_DEV;
6154 		} else {
6155 			un_array[i]->c.un_revision &= ~MD_64BIT_META_DEV;
6156 		}
6157 		MD_SETDRIVERNAME(&set_params, MD_SP,
6158 		    MD_MIN2SET(set_params.mnum));
6159 
6160 		np = metamnumname(&sp, MD_SID(un_array[i]), 0, ep);
6161 
6162 		/*
6163 		 * If this is an MN set, send the MD_IOCSET ioctl to all nodes
6164 		 */
6165 		if (mn_set) {
6166 			md_mn_msg_iocset_t	send_params;
6167 			int			result;
6168 			md_mn_result_t		*resp = NULL;
6169 			int			mess_size;
6170 
6171 			/*
6172 			 * Calculate message size. md_mn_msg_iocset_t only
6173 			 * contains one extent, so increment the size to
6174 			 * include all extents
6175 			 */
6176 			mess_size = sizeof (send_params) -
6177 			    sizeof (mp_ext_t) +
6178 			    (un_array[i]->un_numexts * sizeof (mp_ext_t));
6179 
6180 			send_params.iocset_params = set_params;
6181 			(void) memcpy(&send_params.unit, un_array[i],
6182 			    sizeof (*un_array[i]) - sizeof (mp_ext_t) +
6183 			    (un_array[i]->un_numexts * sizeof (mp_ext_t)));
6184 			result = mdmn_send_message(sp->setno,
6185 			    MD_MN_MSG_IOCSET, MD_MSGF_DEFAULT_FLAGS,
6186 			    (char *)&send_params, mess_size, &resp,
6187 			    ep);
6188 			if (resp != NULL) {
6189 				if (resp->mmr_exitval != 0)
6190 					err = 1;
6191 				free_result(resp);
6192 			}
6193 			if (result != 0) {
6194 				err = 1;
6195 			}
6196 		} else {
6197 			if (metaioctl(MD_IOCSET, &set_params, &set_params.mde,
6198 			    np->cname) != 0) {
6199 				err = 1;
6200 			}
6201 		}
6202 
6203 		if (err == 1) {
6204 			(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
6205 			    "%s: Error committing record to metadb.\n"),
6206 			    np->cname);
6207 			goto out;
6208 		}
6209 
6210 		/* note that we've committed a record */
6211 		if (!committed)
6212 			committed = 1;
6213 
6214 		/* update any watermarks that need it */
6215 		if (update_list != NULL) {
6216 			md_sp_t *msp;
6217 
6218 			/*
6219 			 * Check to see if we're trying to create a partition
6220 			 * on a mirror. If so we may have to enforce an
6221 			 * ownership change before writing the watermark out.
6222 			 */
6223 			if (metaismeta(compnp)) {
6224 				char *miscname;
6225 
6226 				miscname = metagetmiscname(compnp, ep);
6227 				if (miscname != NULL)
6228 					comp_is_mirror = (strcmp(miscname,
6229 					    MD_MIRROR) == 0);
6230 				else
6231 					comp_is_mirror = 0;
6232 			}
6233 			/*
6234 			 * If this is a MN set and the component is a mirror,
6235 			 * change ownership to this node in order to write the
6236 			 * watermarks
6237 			 */
6238 			if (mn_set && comp_is_mirror) {
6239 				mm = (mm_unit_t *)meta_get_unit(sp, compnp, ep);
6240 				if (mm == NULL) {
6241 					err = 1;
6242 					goto out;
6243 				} else {
6244 					err = meta_mn_change_owner(&ownpar,
6245 						sp->setno,
6246 						meta_getminor(compnp->dev),
6247 						sd->sd_mn_mynode->nd_nodeid,
6248 						MD_MN_MM_PREVENT_CHANGE |
6249 						    MD_MN_MM_SPAWN_THREAD);
6250 					if (err != 0)
6251 						goto out;
6252 				}
6253 			}
6254 
6255 			if ((msp = meta_get_sp(sp, np, ep)) == NULL) {
6256 				err = 1;
6257 				(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
6258 				    "%s: Error updating extent headers.\n"),
6259 				    np->cname);
6260 				goto out;
6261 			}
6262 			if (meta_sp_update_wm(sp, msp, update_list, ep) < 0) {
6263 				err = 1;
6264 				(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
6265 				    "%s: Error updating extent headers "
6266 				    "on disk.\n"), np->cname);
6267 				goto out;
6268 			}
6269 		}
6270 		/*
6271 		 * If we have changed ownership earlier and prevented any
6272 		 * ownership changes, we can now allow ownership changes
6273 		 * again.
6274 		 */
6275 		if (ownpar) {
6276 			(void) meta_mn_change_owner(&ownpar, sp->setno,
6277 			    ownpar->d.mnum,
6278 			    ownpar->d.owner,
6279 			    MD_MN_MM_ALLOW_CHANGE | MD_MN_MM_SPAWN_THREAD);
6280 		}
6281 	}
6282 
6283 	/* update status of all soft partitions to OK */
6284 	minors = Zalloc(num_sps * sizeof (minor_t));
6285 	for (i = 0; i < num_sps; i++)
6286 		minors[i] = MD_SID(un_array[i]);
6287 
6288 	err = update_sp_status(sp, minors, num_sps, MD_SP_OK, mn_set, ep);
6289 	if (err != 0)
6290 		goto out;
6291 
6292 	if (options & MDCMD_PRINT)
6293 		(void) printf(dgettext(TEXT_DOMAIN, "%s: "
6294 		    "Soft Partitions recovered from device.\n"),
6295 		    compnp->cname);
6296 out:
6297 	/* free memory */
6298 	if (extlist != NULL)
6299 		meta_sp_list_free(&extlist);
6300 	if (sp_list != NULL)
6301 		meta_sp_list_free(&sp_list);
6302 	if (update_list != NULL)
6303 		meta_sp_list_free(&update_list);
6304 	if (un_array != NULL)	{
6305 		for (i = 0; i < num_sps; i++)
6306 			Free(un_array[i]);
6307 		Free(un_array);
6308 	}
6309 	if (minors != NULL)
6310 		Free(minors);
6311 	if (ownpar != NULL)
6312 		Free(ownpar);
6313 	(void) fflush(stdout);
6314 
6315 	if ((keynlp != NULL) && (committed != 1)) {
6316 		/*
6317 		 * if we haven't committed any softparts, either because of an
6318 		 * error or because the user decided not to proceed, delete
6319 		 * namelist key for the component
6320 		 */
6321 		if (mn_set) {
6322 			mdnamelist_t	*p;
6323 
6324 			for (p = keynlp; (p != NULL); p = p->next) {
6325 				mdname_t		*np = p->namep;
6326 				md_mn_msg_delkeyname_t	send_params;
6327 				md_mn_result_t		*resp = NULL;
6328 
6329 				send_params.delkeyname_dev = np->dev;
6330 				send_params.delkeyname_setno = sp->setno;
6331 				send_params.delkeyname_key = np->key;
6332 				(void) mdmn_send_message(sp->setno,
6333 				    MD_MN_MSG_DELKEYNAME, MD_MSGF_DEFAULT_FLAGS,
6334 				    (char *)&send_params, sizeof (send_params),
6335 				    &resp, ep);
6336 				if (resp != NULL) {
6337 					free_result(resp);
6338 				}
6339 			}
6340 		} else {
6341 			(void) del_key_names(sp, keynlp, NULL);
6342 		}
6343 	}
6344 
6345 	metafreenamelist(keynlp);
6346 
6347 	if (err)
6348 		return (mdmderror(ep, MDE_RECOVER_FAILED, 0, compnp->cname));
6349 
6350 	if (not_recovered)
6351 		if (options & MDCMD_PRINT)
6352 			(void) printf(dgettext(TEXT_DOMAIN, "%s: "
6353 			    "Soft Partitions NOT recovered from device.\n"),
6354 			    compnp->cname);
6355 	return (0);
6356 }
6357 
6358 /*
6359  * FUNCTION:	meta_sp_recover_from_unit()
6360  * INPUT:	sp	- name of set we are recovering in
6361  *		compnp	- name of component we are recovering from
6362  *		options	- metarecover options
6363  * OUTPUT:	ep	- return error pointer
6364  * RETURNS:	int	- 0 - success, -1 - error
6365  * PURPOSE:	update watermarks to match metadb records.  begin by getting
6366  *		a namelist representing all soft partitions on the specified
6367  *		component.  then, build an extlist representing the soft
6368  *		partitions, filling in the freespace extents.  notify user
6369  *		of changes, place all soft partitions into the "recovering"
6370  *		state and update the watermarks.  finally, return all soft
6371  *		partitions to the "OK" state.
6372  */
6373 static int
6374 meta_sp_recover_from_unit(
6375 	mdsetname_t	*sp,
6376 	mdname_t	*compnp,
6377 	mdcmdopts_t	options,
6378 	md_error_t	*ep
6379 )
6380 {
6381 	mdnamelist_t	*spnlp = NULL;
6382 	mdnamelist_t	*nlp = NULL;
6383 	sp_ext_node_t	*ext = NULL;
6384 	sp_ext_node_t	*extlist = NULL;
6385 	int		count;
6386 	char		yesno[255];
6387 	char		*yes;
6388 	int		rval = 0;
6389 	minor_t		*minors = NULL;
6390 	int		i;
6391 	md_sp_t		*msp;
6392 	md_set_desc	*sd;
6393 	bool_t		mn_set = 0;
6394 	daddr_t		start_block;
6395 
6396 	count = meta_sp_get_by_component(sp, compnp, &spnlp, 0, ep);
6397 	if (count <= 0)
6398 		return (-1);
6399 
6400 	/* set flag if dealing with a MN set */
6401 	if (!metaislocalset(sp)) {
6402 		if ((sd = metaget_setdesc(sp, ep)) == NULL) {
6403 			return (-1);
6404 		}
6405 		if (MD_MNSET_DESC(sd))
6406 			mn_set = 1;
6407 	}
6408 	/*
6409 	 * Save the XDR unit structure for one of the soft partitions;
6410 	 * we'll use this later to provide metadevice context to
6411 	 * update the watermarks so the device can be resolved by
6412 	 * devid instead of dev_t.
6413 	 */
6414 	if ((msp = meta_get_sp(sp, spnlp->namep, ep)) == NULL) {
6415 		metafreenamelist(spnlp);
6416 		return (-1);
6417 	}
6418 
6419 	if ((start_block = meta_sp_get_start(sp, compnp, ep)) ==
6420 	    MD_DISKADDR_ERROR) {
6421 		return (-1);
6422 	}
6423 
6424 	meta_sp_list_insert(NULL, NULL, &extlist, 0ULL, start_block,
6425 	    EXTTYP_RESERVED, 0, 0, meta_sp_cmp_by_offset);
6426 	meta_sp_list_insert(NULL, NULL, &extlist,
6427 	    metagetsize(compnp, ep) - MD_SP_WMSIZE, MD_SP_WMSIZE,
6428 	    EXTTYP_END, 0, EXTFLG_UPDATE, meta_sp_cmp_by_offset);
6429 
6430 	if (meta_sp_extlist_from_namelist(sp, spnlp, &extlist, ep) == -1) {
6431 		metafreenamelist(spnlp);
6432 		return (-1);
6433 	}
6434 
6435 	assert(extlist != NULL);
6436 	if ((options & MDCMD_VERBOSE) != 0) {
6437 		(void) printf(dgettext(TEXT_DOMAIN,
6438 		    "Updating extent headers on device %s from metadb.\n\n"),
6439 		    compnp->cname);
6440 		(void) printf(dgettext(TEXT_DOMAIN,
6441 		    "The following extent headers will be written:\n"));
6442 		meta_sp_display_exthdr();
6443 	}
6444 
6445 	meta_sp_list_freefill(&extlist, metagetsize(compnp, ep));
6446 
6447 	for (ext = extlist; ext != NULL; ext = ext->ext_next) {
6448 
6449 		/* mark every node for updating except the reserved space */
6450 		if (ext->ext_type != EXTTYP_RESERVED) {
6451 			ext->ext_flags |= EXTFLG_UPDATE;
6452 
6453 			/* print extent information */
6454 			if ((options & MDCMD_VERBOSE) != 0)
6455 				meta_sp_display_ext(ext);
6456 		}
6457 	}
6458 
6459 	/* request verification and then update all watermarks */
6460 	if ((options & MDCMD_DOIT) != 0) {
6461 
6462 		(void) printf(dgettext(TEXT_DOMAIN,
6463 		    "\nWARNING: You are about to overwrite portions of %s\n"
6464 		    "with soft partition metadata. The extent headers will be\n"
6465 		    "written to match the existing metadb configuration.  If\n"
6466 		    "the device was not previously setup with this\n"
6467 		    "configuration, data loss may result.\n\n"),
6468 		    compnp->cname);
6469 		(void) printf(dgettext(TEXT_DOMAIN,
6470 		    "Are you sure you want to do this (yes/no)? "));
6471 
6472 		(void) fflush(stdout);
6473 		if ((fgets(yesno, sizeof (yesno), stdin) == NULL) ||
6474 		    (strlen(yesno) == 1))
6475 			(void) snprintf(yesno, sizeof (yesno),
6476 			    "%s\n", dgettext(TEXT_DOMAIN, "no"));
6477 		yes = dgettext(TEXT_DOMAIN, "yes");
6478 		if (strncasecmp(yesno, yes, strlen(yesno) - 1) == 0) {
6479 			/* place soft partitions into recovering state */
6480 			minors = Zalloc(count * sizeof (minor_t));
6481 			for (nlp = spnlp, i = 0;
6482 			    nlp != NULL && i < count;
6483 			    nlp = nlp->next, i++) {
6484 				assert(nlp->namep != NULL);
6485 				minors[i] = meta_getminor(nlp->namep->dev);
6486 			}
6487 			if (update_sp_status(sp, minors, count,
6488 			    MD_SP_RECOVER, mn_set, ep) != 0) {
6489 				rval = -1;
6490 				goto out;
6491 			}
6492 
6493 			/* update the watermarks */
6494 			if (meta_sp_update_wm(sp, msp, extlist, ep) < 0) {
6495 				rval = -1;
6496 				goto out;
6497 			}
6498 
6499 			if (options & MDCMD_PRINT) {
6500 				(void) printf(dgettext(TEXT_DOMAIN, "%s: "
6501 				    "Soft Partitions recovered from metadb\n"),
6502 				    compnp->cname);
6503 			}
6504 
6505 			/* return soft partitions to the OK state */
6506 			if (update_sp_status(sp, minors, count,
6507 			    MD_SP_OK, mn_set, ep) != 0) {
6508 				rval = -1;
6509 				goto out;
6510 			}
6511 
6512 			rval = 0;
6513 			goto out;
6514 		}
6515 	}
6516 
6517 	if (options & MDCMD_PRINT) {
6518 		(void) printf(dgettext(TEXT_DOMAIN,
6519 		    "%s: Soft Partitions NOT recovered from metadb\n"),
6520 		    compnp->cname);
6521 	}
6522 
6523 out:
6524 	if (minors != NULL)
6525 		Free(minors);
6526 	metafreenamelist(spnlp);
6527 	meta_sp_list_free(&extlist);
6528 	(void) fflush(stdout);
6529 	return (rval);
6530 }
6531 
6532 
6533 /*
6534  * FUNCTION:	meta_sp_update_abr()
6535  * INPUT:	sp	- name of set we are recovering in
6536  * OUTPUT:	ep	- return error pointer
6537  * RETURNS:	int	- 0 - success, -1 - error
6538  * PURPOSE:	update the ABR state for all soft partitions in the set. This
6539  *		is called when joining a set. It sends a message to the master
6540  *		node for each soft partition to get the value of tstate and
6541  *		then sets ABR ,if required, by opening the sp, setting ABR
6542  *		and then closing the sp. This approach is taken rather that
6543  *		just issuing the MD_MN_SET_CAP ioctl, in order to deal with
6544  *		the case when we have another node simultaneously unsetting ABR.
6545  */
6546 int
6547 meta_sp_update_abr(
6548 	mdsetname_t	*sp,
6549 	md_error_t	*ep
6550 )
6551 {
6552 	mdnamelist_t	*devnlp = NULL;
6553 	mdnamelist_t	*p;
6554 	mdname_t	*devnp = NULL;
6555 	md_unit_t	*un;
6556 	char		fname[MAXPATHLEN];
6557 	int		mnum, fd;
6558 	volcap_t	vc;
6559 	uint_t		tstate;
6560 
6561 
6562 	if (meta_get_sp_names(sp, &devnlp, 0, ep) < 0) {
6563 		return (-1);
6564 	}
6565 
6566 	/* Exit if no soft partitions in this set */
6567 	if (devnlp == NULL)
6568 		return (0);
6569 
6570 	/* For each soft partition */
6571 	for (p = devnlp; (p != NULL); p = p->next) {
6572 		devnp = p->namep;
6573 
6574 		/* check if this is a top level metadevice */
6575 		if ((un = meta_get_mdunit(sp, devnp, ep)) == NULL)
6576 			goto out;
6577 		if (MD_HAS_PARENT(MD_PARENT(un))) {
6578 			Free(un);
6579 			continue;
6580 		}
6581 		Free(un);
6582 
6583 		/* Get tstate from Master */
6584 		if (meta_mn_send_get_tstate(devnp->dev, &tstate, ep) != 0) {
6585 			mdname_t	*np;
6586 			np = metamnumname(&sp, meta_getminor(devnp->dev), 0,
6587 			    ep);
6588 			if (np) {
6589 				md_perror(dgettext(TEXT_DOMAIN,
6590 				    "Unable to get tstate for %s"), np->cname);
6591 			}
6592 			continue;
6593 		}
6594 		/* If not set on the master, nothing to do */
6595 		if (!(tstate & MD_ABR_CAP))
6596 			continue;
6597 
6598 		mnum = meta_getminor(devnp->dev);
6599 		(void) snprintf(fname, MAXPATHLEN, "/dev/md/%s/rdsk/d%u",
6600 		    sp->setname, (unsigned)MD_MIN2UNIT(mnum));
6601 		if ((fd = open(fname, O_RDWR, 0)) < 0) {
6602 			md_perror(dgettext(TEXT_DOMAIN,
6603 			    "Could not open device %s"), fname);
6604 			continue;
6605 		}
6606 
6607 		/* Set ABR state */
6608 		vc.vc_info = 0;
6609 		vc.vc_set = 0;
6610 		if (ioctl(fd, DKIOCGETVOLCAP, &vc) < 0) {
6611 			(void) close(fd);
6612 			continue;
6613 		}
6614 
6615 		vc.vc_set = DKV_ABR_CAP;
6616 		if (ioctl(fd, DKIOCSETVOLCAP, &vc) < 0) {
6617 			(void) close(fd);
6618 			goto out;
6619 		}
6620 
6621 		(void) close(fd);
6622 	}
6623 	metafreenamelist(devnlp);
6624 	return (0);
6625 out:
6626 	metafreenamelist(devnlp);
6627 	return (-1);
6628 }
6629 
6630 /*
6631  * FUNCTION:	meta_mn_sp_update_abr()
6632  * INPUT:	arg	- Given set.
6633  * PURPOSE:	update the ABR state for all soft partitions in the set by
6634  *		forking a process to call meta_sp_update_abr()
6635  *		This function is only called via rpc.metad when adding a node
6636  *		to a set, ie this node is beong joined to the set by another
6637  *		node.
6638  */
6639 void *
6640 meta_mn_sp_update_abr(void *arg)
6641 {
6642 	set_t		setno = *((set_t *)arg);
6643 	mdsetname_t	*sp;
6644 	md_error_t	mde = mdnullerror;
6645 	int		fval;
6646 
6647 	/* should have a set */
6648 	assert(setno != NULL);
6649 
6650 	if ((sp = metasetnosetname(setno, &mde)) == NULL) {
6651 		mde_perror(&mde, "");
6652 		return (NULL);
6653 	}
6654 
6655 	if (!(meta_is_mn_set(sp, &mde))) {
6656 		mde_perror(&mde, "");
6657 		return (NULL);
6658 	}
6659 
6660 	/* fork a process */
6661 	if ((fval = md_daemonize(sp, &mde)) != 0) {
6662 		/*
6663 		 * md_daemonize will fork off a process.  The is the
6664 		 * parent or error.
6665 		 */
6666 		if (fval > 0) {
6667 			return (NULL);
6668 		}
6669 		mde_perror(&mde, "");
6670 		return (NULL);
6671 	}
6672 	/*
6673 	 * Child process should never return back to rpc.metad, but
6674 	 * should exit.
6675 	 * Flush all internally cached data inherited from parent process
6676 	 * since cached data will be cleared when parent process RPC request
6677 	 * has completed (which is possibly before this child process
6678 	 * can complete).
6679 	 * Child process can retrieve and cache its own copy of data from
6680 	 * rpc.metad that won't be changed by the parent process.
6681 	 *
6682 	 * Reset md_in_daemon since this child will be a client of rpc.metad
6683 	 * not part of the rpc.metad daemon itself.
6684 	 * md_in_daemon is used by rpc.metad so that libmeta can tell if
6685 	 * this thread is rpc.metad or any other thread.  (If this thread
6686 	 * was rpc.metad it could use some short circuit code to get data
6687 	 * directly from rpc.metad instead of doing an RPC call to rpc.metad).
6688 	 */
6689 	md_in_daemon = 0;
6690 	metaflushsetname(sp);
6691 	sr_cache_flush_setno(setno);
6692 	if ((sp = metasetnosetname(setno, &mde)) == NULL) {
6693 		mde_perror(&mde, "");
6694 		md_exit(sp, 1);
6695 	}
6696 
6697 
6698 	/*
6699 	 * Closing stdin/out/err here.
6700 	 */
6701 	(void) close(0);
6702 	(void) close(1);
6703 	(void) close(2);
6704 	assert(fval == 0);
6705 
6706 	(void) meta_sp_update_abr(sp, &mde);
6707 
6708 	md_exit(sp, 0);
6709 	/*NOTREACHED*/
6710 	return (NULL);
6711 }
6712