xref: /titanic_41/usr/src/lib/lvm/libmeta/common/meta_sp.c (revision 69bbc66400b6af121ee9f95667811cc0acd84d6e)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 /*
29  * Just in case we're not in a build environment, make sure that
30  * TEXT_DOMAIN gets set to something.
31  */
32 #if !defined(TEXT_DOMAIN)
33 #define	TEXT_DOMAIN "SYS_TEST"
34 #endif
35 
36 /*
37  * soft partition operations
38  *
39  * Soft Partitions provide a virtual disk mechanism which is used to
40  * divide a large volume into many small pieces, each appearing as a
41  * separate device.  A soft partition consists of a series of extents,
42  * each having an offset and a length.  The extents are logically
43  * contiguous, so where the first extent leaves off the second extent
44  * picks up.  Which extent a given "virtual offset" belongs to is
45  * dependent on the size of all the previous extents in the soft
46  * partition.
47  *
48  * Soft partitions are represented in memory by an extent node
49  * (sp_ext_node_t) which contains all of the information necessary to
50  * create a unit structure and update the on-disk format, called
51  * "watermarks".  These extent nodes are typically kept in a doubly
52  * linked list and are manipulated by list manipulation routines.  A
53  * list of extents may represent all of the soft partitions on a volume,
54  * a single soft partition, or perhaps just a set of extents that need
55  * to be updated.  Extent lists may be sorted by offset or by name/seq#,
56  * depending on which compare function is used.  Most of the routines
57  * require the list be sorted by offset to work, and that's the typical
58  * configuration.
59  *
60  * In order to do an allocation, knowledge of all soft partitions on the
61  * volume is required.  Then free space is determined from the space
62  * that is not allocated, and new allocations can be made from the free
63  * space.  Once the new allocations are made, a unit structure is created
64  * and the watermarks are updated.  The status is then changed to "okay"
65  * on the unit structure to commit the transaction.  If updating the
66  * watermarks fails, the unit structure is in an intermediate state and
67  * the driver will not allow access to the device.
68  *
69  * A typical sequence of events is:
70  *     1. Fetch the list of names for all soft partitions on a volume
71  *         meta_sp_get_by_component()
72  *     2. Construct an extent list from the name list
73  *         meta_sp_extlist_from_namelist()
74  *     3. Fill the gaps in the extent list with free extents
75  *         meta_sp_list_freefill()
76  *     4. Allocate from the free extents
77  *         meta_sp_alloc_by_len()
78  *         meta_sp_alloc_by_list()
79  *     5. Create the unit structure from the extent list
80  *         meta_sp_createunit()
81  *         meta_sp_updateunit()
82  *     6. Write out the watermarks
83  *         meta_sp_update_wm()
84  *     7. Set the status to "Okay"
85  *         meta_sp_setstatus()
86  *
87  */
88 
89 #include <stdio.h>
90 #include <meta.h>
91 #include "meta_repartition.h"
92 #include <sys/lvm/md_sp.h>
93 #include <sys/lvm/md_crc.h>
94 #include <strings.h>
95 #include <sys/lvm/md_mirror.h>
96 #include <sys/bitmap.h>
97 
98 extern int	md_in_daemon;
99 
100 typedef struct sp_ext_node {
101 	struct sp_ext_node	*ext_next;	/* next element */
102 	struct sp_ext_node	*ext_prev;	/* previous element */
103 	sp_ext_type_t		ext_type;	/* type of extent */
104 	sp_ext_offset_t		ext_offset;	/* starting offset */
105 	sp_ext_length_t		ext_length;	/* length of this node */
106 	uint_t			ext_flags;	/* extent flags */
107 	uint32_t		ext_seq;	/* watermark seq no */
108 	mdname_t		*ext_namep;	/* name pointer */
109 	mdsetname_t		*ext_setp;	/* set pointer */
110 } sp_ext_node_t;
111 
112 /* extent flags */
113 #define	EXTFLG_UPDATE	(1)
114 
115 /* Extent node compare function for list sorting */
116 typedef int (*ext_cmpfunc_t)(sp_ext_node_t *, sp_ext_node_t *);
117 
118 
119 /* Function Prototypes */
120 
121 /* Debugging Functions */
122 static void meta_sp_debug(char *format, ...);
123 static void meta_sp_printunit(mp_unit_t *mp);
124 
125 /* Misc Support Functions */
126 int meta_sp_parsesize(char *s, sp_ext_length_t *szp);
127 static int meta_sp_parsesizestring(char *s, sp_ext_length_t *szp);
128 static int meta_sp_setgeom(mdname_t *np, mdname_t *compnp, mp_unit_t *mp,
129 	md_error_t *ep);
130 static int meta_sp_get_by_component(mdsetname_t *sp, mdname_t *compnp,
131     mdnamelist_t **nlpp, int force, md_error_t *ep);
132 static sp_ext_length_t meta_sp_get_default_alignment(mdsetname_t *sp,
133     mdname_t *compnp, md_error_t *ep);
134 
135 /* Extent List Manipulation Functions */
136 static int meta_sp_cmp_by_nameseq(sp_ext_node_t *e1, sp_ext_node_t *e2);
137 static int meta_sp_cmp_by_offset(sp_ext_node_t *e1, sp_ext_node_t *e2);
138 static void meta_sp_list_insert(mdsetname_t *sp, mdname_t *np,
139     sp_ext_node_t **head, sp_ext_offset_t offset, sp_ext_length_t length,
140     sp_ext_type_t type, uint_t seq, uint_t flags, ext_cmpfunc_t compare);
141 static void meta_sp_list_free(sp_ext_node_t **head);
142 static void meta_sp_list_remove(sp_ext_node_t **head, sp_ext_node_t *ext);
143 static sp_ext_length_t meta_sp_list_size(sp_ext_node_t *head,
144     sp_ext_type_t exttype, int exclude_wm);
145 static sp_ext_node_t *meta_sp_list_find(sp_ext_node_t *head,
146     sp_ext_offset_t offset);
147 static void meta_sp_list_freefill(sp_ext_node_t **extlist,
148     sp_ext_length_t size);
149 static void meta_sp_list_dump(sp_ext_node_t *head);
150 static int meta_sp_list_overlaps(sp_ext_node_t *head);
151 
152 /* Extent List Query Functions */
153 static boolean_t meta_sp_enough_space(int desired_number_of_sps,
154 	blkcnt_t desired_sp_size, sp_ext_node_t **extent_listpp,
155 	sp_ext_length_t alignment);
156 static boolean_t meta_sp_get_extent_list(mdsetname_t *mdsetnamep,
157 	mdname_t *device_mdnamep, sp_ext_node_t **extent_listpp,
158 	md_error_t *ep);
159 static boolean_t meta_sp_get_extent_list_for_drive(mdsetname_t *mdsetnamep,
160 	mddrivename_t *mddrivenamep, sp_ext_node_t **extent_listpp);
161 
162 
163 /* Extent Allocation Functions */
164 static void meta_sp_alloc_by_ext(mdsetname_t *sp, mdname_t *np,
165     sp_ext_node_t **extlist, sp_ext_node_t *free_ext,
166     sp_ext_offset_t alloc_offset, sp_ext_length_t alloc_length, uint_t seq);
167 static int meta_sp_alloc_by_len(mdsetname_t *sp, mdname_t *np,
168     sp_ext_node_t **extlist, sp_ext_length_t *lp,
169     sp_ext_offset_t last_off, sp_ext_length_t alignment);
170 static int meta_sp_alloc_by_list(mdsetname_t *sp, mdname_t *np,
171     sp_ext_node_t **extlist, sp_ext_node_t *oblist);
172 
173 /* Extent List Population Functions */
174 static int meta_sp_extlist_from_namelist(mdsetname_t *sp, mdnamelist_t *spnlp,
175     sp_ext_node_t **extlist, md_error_t *ep);
176 static int meta_sp_extlist_from_wm(mdsetname_t *sp, mdname_t *compnp,
177     sp_ext_node_t **extlist, ext_cmpfunc_t compare, md_error_t *ep);
178 
179 /* Print (metastat) Functions */
180 static int meta_sp_short_print(md_sp_t *msp, char *fname, FILE *fp,
181     mdprtopts_t options, md_error_t *ep);
182 static char *meta_sp_status_to_name(xsp_status_t xsp_status, uint_t tstate);
183 static int meta_sp_report(mdsetname_t *sp, md_sp_t *msp, mdnamelist_t **nlpp,
184     char *fname, FILE *fp, mdprtopts_t options, md_error_t *ep);
185 
186 /* Watermark Manipulation Functions */
187 static int meta_sp_update_wm(mdsetname_t *sp, md_sp_t *msp,
188     sp_ext_node_t *extlist, md_error_t *ep);
189 static int meta_sp_clear_wm(mdsetname_t *sp, md_sp_t *msp, md_error_t *ep);
190 static int meta_sp_read_wm(mdsetname_t *sp, mdname_t *compnp,
191     mp_watermark_t *wm, sp_ext_offset_t offset,  md_error_t *ep);
192 static diskaddr_t meta_sp_get_start(mdsetname_t *sp, mdname_t *compnp,
193     md_error_t *ep);
194 
195 /* Unit Structure Manipulation Functions */
196 static void meta_sp_fillextarray(mp_unit_t *mp, sp_ext_node_t *extlist);
197 static mp_unit_t *meta_sp_createunit(mdname_t *np, mdname_t *compnp,
198     sp_ext_node_t *extlist, int numexts, sp_ext_length_t len,
199     sp_status_t status, md_error_t *ep);
200 static mp_unit_t *meta_sp_updateunit(mdname_t *np,  mp_unit_t *old_un,
201     sp_ext_node_t *extlist, sp_ext_length_t grow_len, int numexts,
202     md_error_t *ep);
203 static int meta_create_sp(mdsetname_t *sp, md_sp_t *msp, sp_ext_node_t *oblist,
204     mdcmdopts_t options, sp_ext_length_t alignment, md_error_t *ep);
205 static int meta_check_sp(mdsetname_t *sp, md_sp_t *msp, mdcmdopts_t options,
206     int *repart_options, md_error_t *ep);
207 
208 /* Reset (metaclear) Functions */
209 static int meta_sp_reset_common(mdsetname_t *sp, mdname_t *np, md_sp_t *msp,
210     md_sp_reset_t reset_params, mdcmdopts_t options, md_error_t *ep);
211 
212 /* Recovery (metarecover) Functions */
213 static void meta_sp_display_exthdr(void);
214 static void meta_sp_display_ext(sp_ext_node_t *ext);
215 static int meta_sp_checkseq(sp_ext_node_t *extlist);
216 static int meta_sp_resolve_name_conflict(mdsetname_t *, mdname_t *,
217     mdname_t **, md_error_t *);
218 static int meta_sp_validate_wm(mdsetname_t *sp, mdname_t *np,
219     mdcmdopts_t options, md_error_t *ep);
220 static int meta_sp_validate_unit(mdsetname_t *sp, mdname_t *compnp,
221     mdcmdopts_t options, md_error_t *ep);
222 static int meta_sp_validate_wm_and_unit(mdsetname_t *sp, mdname_t *np,
223     mdcmdopts_t options, md_error_t *ep);
224 static int meta_sp_validate_exts(mdname_t *np, sp_ext_node_t *wmext,
225     sp_ext_node_t *unitext, md_error_t *ep);
226 static int meta_sp_recover_from_wm(mdsetname_t *sp, mdname_t *compnp,
227     mdcmdopts_t options, md_error_t *ep);
228 static int meta_sp_recover_from_unit(mdsetname_t *sp, mdname_t *np,
229     mdcmdopts_t options, md_error_t *ep);
230 
231 /*
232  * Private Constants
233  */
234 
235 static const int FORCE_RELOAD_CACHE = 1;
236 static const uint_t NO_FLAGS = 0;
237 static const sp_ext_offset_t NO_OFFSET = 0ULL;
238 static const uint_t NO_SEQUENCE_NUMBER = 0;
239 static const int ONE_SOFT_PARTITION = 1;
240 
241 static unsigned long sp_parent_printed[BT_BITOUL(MD_MAXUNITS)];
242 
243 #define	TEST_SOFT_PARTITION_NAMEP NULL
244 #define	TEST_SETNAMEP NULL
245 
246 #define	EXCLUDE_WM	(1)
247 #define	INCLUDE_WM	(0)
248 
249 #define	SP_UNALIGNED	(0LL)
250 
251 /*
252  * **************************************************************************
253  *                          Debugging Functions                             *
254  * **************************************************************************
255  */
256 
257 /*PRINTFLIKE1*/
258 static void
259 meta_sp_debug(char *format, ...)
260 {
261 	static int debug;
262 	static int debug_set = 0;
263 	va_list ap;
264 
265 	if (!debug_set) {
266 		debug = getenv(META_SP_DEBUG) ? 1 : 0;
267 		debug_set = 1;
268 	}
269 
270 	if (debug) {
271 		va_start(ap, format);
272 		(void) vfprintf(stderr, format, ap);
273 		va_end(ap);
274 	}
275 }
276 
277 static void
278 meta_sp_printunit(mp_unit_t *mp)
279 {
280 	int i;
281 
282 	if (mp == NULL)
283 		return;
284 
285 	/* print the common fields we know about */
286 	(void) fprintf(stderr, "\tmp->c.un_type: %d\n", mp->c.un_type);
287 	(void) fprintf(stderr, "\tmp->c.un_size: %u\n", mp->c.un_size);
288 	(void) fprintf(stderr, "\tmp->c.un_self_id: %lu\n", MD_SID(mp));
289 
290 	/* sp-specific fields */
291 	(void) fprintf(stderr, "\tmp->un_status: %u\n", mp->un_status);
292 	(void) fprintf(stderr, "\tmp->un_numexts: %u\n", mp->un_numexts);
293 	(void) fprintf(stderr, "\tmp->un_length: %llu\n", mp->un_length);
294 	(void) fprintf(stderr, "\tmp->un_dev(32): 0x%llx\n", mp->un_dev);
295 	(void) fprintf(stderr, "\tmp->un_dev(64): 0x%llx\n", mp->un_dev);
296 	(void) fprintf(stderr, "\tmp->un_key: %d\n", mp->un_key);
297 
298 	/* print extent information */
299 	(void) fprintf(stderr, "\tExt#\tvoff\t\tpoff\t\tLen\n");
300 	for (i = 0; i < mp->un_numexts; i++) {
301 		(void) fprintf(stderr, "\t%d\t%llu\t\t%llu\t\t%llu\n", i,
302 		    mp->un_ext[i].un_voff, mp->un_ext[i].un_poff,
303 		    mp->un_ext[i].un_len);
304 	}
305 }
306 
307 /*
308  * FUNCTION:    meta_sp_parsesize()
309  * INPUT:       s       - the string to parse
310  * OUTPUT:      *szp    - disk block count (0 for "all")
311  * RETURNS:     -1 for error, 0 for success
312  * PURPOSE:     parses the command line parameter that specifies the
313  *              requested size of a soft partition.  The input string
314  *              is either the literal "all" or a numeric value
315  *              followed by a single character, b for disk blocks, k
316  *              for kilobytes, m for megabytes, g for gigabytes, or t
317  *              for terabytes.  p for petabytes and e for exabytes
318  *              have been added as undocumented features for future
319  *              expansion.  For example, 100m is 100 megabytes, while
320  *              50g is 50 gigabytes.  All values are rounded up to the
321  *              nearest block size.
322  */
323 int
324 meta_sp_parsesize(char *s, sp_ext_length_t *szp)
325 {
326 	if (s == NULL || szp == NULL) {
327 		return (-1);
328 	}
329 
330 	/* Check for literal "all" */
331 	if (strcasecmp(s, "all") == 0) {
332 		*szp = 0;
333 		return (0);
334 	}
335 
336 	return (meta_sp_parsesizestring(s, szp));
337 }
338 
339 /*
340  * FUNCTION:	meta_sp_parsesizestring()
341  * INPUT:	s	- the string to parse
342  * OUTPUT:	*szp	- disk block count
343  * RETURNS:	-1 for error, 0 for success
344  * PURPOSE:	parses a string that specifies size. The input string is a
345  *		numeric value followed by a single character, b for disk blocks,
346  *		k for kilobytes, m for megabytes, g for gigabytes, or t for
347  *		terabytes.  p for petabytes and e for exabytes have been added
348  *		as undocumented features for future expansion.  For example,
349  *		100m is 100 megabytes, while 50g is 50 gigabytes.  All values
350  *		are rounded up to the nearest block size.
351  */
352 static int
353 meta_sp_parsesizestring(char *s, sp_ext_length_t *szp)
354 {
355 	sp_ext_length_t	len = 0;
356 	char		len_type[2];
357 
358 	if (s == NULL || szp == NULL) {
359 		return (-1);
360 	}
361 
362 	/*
363 	 * make sure block offset does not overflow 2^64 bytes.
364 	 */
365 	if ((sscanf(s, "%llu%1[BbKkMmGgTt]", &len, len_type) != 2) ||
366 	    (len == 0LL) ||
367 	    (len > (1LL << (64 - DEV_BSHIFT))))
368 		return (-1);
369 
370 	switch (len_type[0]) {
371 	case 'B':
372 	case 'b':
373 		len = lbtodb(roundup(len * DEV_BSIZE, DEV_BSIZE));
374 		break;
375 	case 'K':
376 	case 'k':
377 		len = lbtodb(roundup(len * 1024ULL, DEV_BSIZE));
378 		break;
379 	case 'M':
380 	case 'm':
381 		len = lbtodb(roundup(len * 1024ULL*1024ULL, DEV_BSIZE));
382 		break;
383 	case 'g':
384 	case 'G':
385 		len = lbtodb(roundup(len * 1024ULL*1024ULL*1024ULL, DEV_BSIZE));
386 		break;
387 	case 't':
388 	case 'T':
389 		len = lbtodb(roundup(len * 1024ULL*1024ULL*1024ULL*1024ULL,
390 		    DEV_BSIZE));
391 		break;
392 	case 'p':
393 	case 'P':
394 		len = lbtodb(roundup(
395 		    len * 1024ULL*1024ULL*1024ULL*1024ULL*1024ULL,
396 		    DEV_BSIZE));
397 		break;
398 	case 'e':
399 	case 'E':
400 		len = lbtodb(roundup(
401 		    len * 1024ULL*1024ULL*1024ULL*1024ULL*1024ULL*1024ULL,
402 		    DEV_BSIZE));
403 		break;
404 	default:
405 		/* error */
406 		return (-1);
407 	}
408 
409 	*szp = len;
410 	return (0);
411 }
412 
413 /*
414  * FUNCTION:	meta_sp_setgeom()
415  * INPUT:	np      - the underlying device to setup geometry for
416  *		compnp	- the underlying device to setup geometry for
417  *		mp	- the unit structure to set the geometry for
418  * OUTPUT:	ep	- return error pointer
419  * RETURNS:	int	- -1 if error, 0 otherwise
420  * PURPOSE:	establishes geometry information for a device
421  */
422 static int
423 meta_sp_setgeom(
424 	mdname_t	*np,
425 	mdname_t	*compnp,
426 	mp_unit_t	*mp,
427 	md_error_t	*ep
428 )
429 {
430 	mdgeom_t	*geomp;
431 	uint_t		round_cyl = 0;
432 
433 	if ((geomp = metagetgeom(compnp, ep)) == NULL)
434 		return (-1);
435 	if (meta_setup_geom((md_unit_t *)mp, np, geomp, geomp->write_reinstruct,
436 	    geomp->read_reinstruct, round_cyl, ep) != 0)
437 		return (-1);
438 
439 	return (0);
440 }
441 
442 /*
443  * FUNCTION:	meta_sp_setstatus()
444  * INPUT:	sp	- the set name for the devices to set the status on
445  *		minors	- an array of minor numbers of devices to set status on
446  *		num_units - number of entries in the array
447  *		status	- status value to set all units to
448  * OUTPUT:	ep	- return error pointer
449  * RETURNS:	int	- -1 if error, 0 success
450  * PURPOSE:	sets the status of one or more soft partitions to the
451  *		requested value
452  */
453 int
454 meta_sp_setstatus(
455 	mdsetname_t	*sp,
456 	minor_t		*minors,
457 	int		num_units,
458 	sp_status_t	status,
459 	md_error_t	*ep
460 )
461 {
462 	md_sp_statusset_t	status_params;
463 
464 	assert(minors != NULL);
465 
466 	/* update status of all soft partitions to the status passed in */
467 	(void) memset(&status_params, 0, sizeof (status_params));
468 	status_params.num_units = num_units;
469 	status_params.new_status = status;
470 	status_params.size = num_units * sizeof (minor_t);
471 	status_params.minors = (uintptr_t)minors;
472 	MD_SETDRIVERNAME(&status_params, MD_SP, sp->setno);
473 	if (metaioctl(MD_IOC_SPSTATUS, &status_params, &status_params.mde,
474 	    NULL) != 0) {
475 		(void) mdstealerror(ep, &status_params.mde);
476 		return (-1);
477 	}
478 	return (0);
479 }
480 
481 /*
482  * FUNCTION:	meta_get_sp_names()
483  * INPUT:	sp	- the set name to get soft partitions from
484  *		options	- options from the command line
485  * OUTPUT:	nlpp	- list of all soft partition names
486  *		ep	- return error pointer
487  * RETURNS:	int	- -1 if error, 0 success
488  * PURPOSE:	returns a list of all soft partitions in the metadb
489  *		for all devices in the specified set
490  */
491 int
492 meta_get_sp_names(
493 	mdsetname_t	*sp,
494 	mdnamelist_t	**nlpp,
495 	int		options,
496 	md_error_t	*ep
497 )
498 {
499 	return (meta_get_names(MD_SP, sp, nlpp, options, ep));
500 }
501 
502 /*
503  * FUNCTION:	meta_get_by_component()
504  * INPUT:	sp	- the set name to get soft partitions from
505  *		compnp	- the name of the device containing the soft
506  *			  partitions that will be returned
507  *		force	- 0 - reads cached namelist if available,
508  *			  1 - reloads cached namelist, frees old namelist
509  * OUTPUT:	nlpp	- list of all soft partition names
510  *		ep	- return error pointer
511  * RETURNS:	int	- -1 error, otherwise the number of soft partitions
512  *			  found on the component (0 = none found).
513  * PURPOSE:	returns a list of all soft partitions on a given device
514  *		from the metadb information
515  */
516 static int
517 meta_sp_get_by_component(
518 	mdsetname_t	*sp,
519 	mdname_t	*compnp,
520 	mdnamelist_t	**nlpp,
521 	int		force,
522 	md_error_t	*ep
523 )
524 {
525 	static mdnamelist_t	*cached_list = NULL;	/* cached namelist */
526 	static int		cached_count = 0;	/* cached count */
527 	mdnamelist_t		*spnlp = NULL;		/* all sp names */
528 	mdnamelist_t		*namep;			/* list iterator */
529 	mdnamelist_t		**tailpp = nlpp;	/* namelist tail */
530 	mdnamelist_t		**cachetailpp;		/* cache tail */
531 	md_sp_t			*msp;			/* unit structure */
532 	int			count = 0;		/* count of sp's */
533 	int			err;
534 	mdname_t		*curnp;
535 
536 	if ((cached_list != NULL) && (!force)) {
537 		/* return a copy of the cached list */
538 		for (namep = cached_list; namep != NULL; namep = namep->next)
539 			tailpp = meta_namelist_append_wrapper(tailpp,
540 			    namep->namep);
541 		return (cached_count);
542 	}
543 
544 	/* free the cache and reset values to zeros to prepare for a new list */
545 	metafreenamelist(cached_list);
546 	cached_count = 0;
547 	cached_list = NULL;
548 	cachetailpp = &cached_list;
549 	*nlpp = NULL;
550 
551 	/* get all the softpartitions first of all */
552 	if (meta_get_sp_names(sp, &spnlp, 0, ep) < 0)
553 		return (-1);
554 
555 	/*
556 	 * Now for each sp, see if it resides on the component we
557 	 * are interested in, if so then add it to our list
558 	 */
559 	for (namep = spnlp; namep != NULL; namep = namep->next) {
560 		curnp = namep->namep;
561 
562 		/* get the unit structure */
563 		if ((msp = meta_get_sp_common(sp, curnp, 0, ep)) == NULL)
564 			continue;
565 
566 		/*
567 		 * If the current soft partition is not on the same
568 		 * component, continue the search.  If it is on the same
569 		 * component, add it to our namelist.
570 		 */
571 		err = meta_check_samedrive(compnp, msp->compnamep, ep);
572 		if (err <= 0) {
573 			/* not on the same device, check the next one */
574 			continue;
575 		}
576 
577 		/* it's on the same drive */
578 
579 		/*
580 		 * Check for overlapping partitions if the component is not
581 		 * a metadevice.
582 		 */
583 		if (!metaismeta(msp->compnamep)) {
584 			/*
585 			 * if they're on the same drive, neither
586 			 * should be a metadevice if one isn't
587 			 */
588 			assert(!metaismeta(compnp));
589 
590 			if (meta_check_overlap(msp->compnamep->cname,
591 			    compnp, 0, -1, msp->compnamep, 0, -1, ep) == 0)
592 				continue;
593 
594 			/* in this case it's not an error for them to overlap */
595 			mdclrerror(ep);
596 		}
597 
598 		/* Component is on the same device, add to the used list */
599 		tailpp = meta_namelist_append_wrapper(tailpp, curnp);
600 		cachetailpp = meta_namelist_append_wrapper(cachetailpp,
601 		    curnp);
602 
603 		++count;
604 		++cached_count;
605 	}
606 
607 	assert(count == cached_count);
608 	return (count);
609 
610 out:
611 	metafreenamelist(*nlpp);
612 	*nlpp = NULL;
613 	return (-1);
614 }
615 
616 /*
617  * FUNCTION:    meta_sp_get_default_alignment()
618  * INPUT:       sp      - the pertinent set name
619  *              compnp  - the name of the underlying component
620  * OUTPUT:      ep      - return error pointer
621  * RETURNS:     sp_ext_length_t =0: no default alignment
622  *                              >0: default alignment
623  * PURPOSE:     returns the default alignment for soft partitions to
624  *              be built on top of the specified component or
625  *              metadevice
626  */
627 static sp_ext_length_t
628 meta_sp_get_default_alignment(
629 	mdsetname_t	*sp,
630 	mdname_t	*compnp,
631 	md_error_t	*ep
632 )
633 {
634 	sp_ext_length_t	a = SP_UNALIGNED;
635 	char		*mname;
636 
637 	assert(compnp != NULL);
638 
639 	/*
640 	 * We treat raw devices as opaque, and assume nothing about
641 	 * their alignment requirements.
642 	 */
643 	if (!metaismeta(compnp))
644 		return (SP_UNALIGNED);
645 
646 	/*
647 	 * We already know it's a metadevice from the previous test;
648 	 * metagetmiscname() will tell us which metadevice type we
649 	 * have
650 	 */
651 	mname = metagetmiscname(compnp, ep);
652 	if (mname == NULL)
653 		goto out;
654 
655 	/*
656 	 * For a mirror, we want to deal with the stripe that is the
657 	 * primary side.  If it happens to be asymmetrically
658 	 * configured, there is no simple way to fake a universal
659 	 * alignment.  There's a chance that the least common
660 	 * denominator of the set of interlaces from all stripes of
661 	 * all submirrors would do it, but nobody that really cared
662 	 * that much about this issue would create an asymmetric
663 	 * config to start with.
664 	 *
665 	 * If the component underlying the soft partition is a mirror,
666 	 * then at the exit of this loop, compnp will have been
667 	 * updated to describe the first active submirror.
668 	 */
669 	if (strcmp(mname, MD_MIRROR) == 0) {
670 		md_mirror_t	*mp;
671 		int		smi;
672 		md_submirror_t	*smp;
673 
674 		mp = meta_get_mirror(sp, compnp, ep);
675 		if (mp == NULL)
676 			goto out;
677 
678 		for (smi = 0; smi < NMIRROR; smi++) {
679 
680 			smp = &mp->submirrors[smi];
681 			if (smp->state == SMS_UNUSED)
682 				continue;
683 
684 			compnp = smp->submirnamep;
685 			assert(compnp != NULL);
686 
687 			mname = metagetmiscname(compnp, ep);
688 			if (mname == NULL)
689 				goto out;
690 
691 			break;
692 		}
693 
694 		if (smi == NMIRROR)
695 			goto out;
696 	}
697 
698 	/*
699 	 * Handle stripes and submirrors identically; just return the
700 	 * interlace of the first row.
701 	 */
702 	if (strcmp(mname, MD_STRIPE) == 0) {
703 		md_stripe_t	*stp;
704 
705 		stp = meta_get_stripe(sp, compnp, ep);
706 		if (stp == NULL)
707 			goto out;
708 
709 		a = stp->rows.rows_val[0].interlace;
710 		goto out;
711 	}
712 
713 	/*
714 	 * Raid is even more straightforward; the interlace applies to
715 	 * the entire device.
716 	 */
717 	if (strcmp(mname, MD_RAID) == 0) {
718 		md_raid_t	*rp;
719 
720 		rp = meta_get_raid(sp, compnp, ep);
721 		if (rp == NULL)
722 			goto out;
723 
724 		a = rp->interlace;
725 		goto out;
726 	}
727 
728 	/*
729 	 * If we have arrived here with the alignment still not set,
730 	 * then we expect the error to have been set by one of the
731 	 * routines we called.  If neither is the case, something has
732 	 * really gone wrong above.  (Probably the submirror walk
733 	 * failed to produce a valid submirror, but that would be
734 	 * really bad...)
735 	 */
736 out:
737 	meta_sp_debug("meta_sp_get_default_alignment: miscname %s, "
738 	    "alignment %lld\n", (mname == NULL) ? "NULL" : mname, a);
739 
740 	if (getenv(META_SP_DEBUG) && !mdisok(ep)) {
741 		mde_perror(ep, NULL);
742 	}
743 
744 	assert((a > 0) || (!mdisok(ep)));
745 
746 	return (a);
747 }
748 
749 
750 
751 /*
752  * FUNCTION:	meta_check_insp()
753  * INPUT:	sp	- the set name for the device to check
754  *		np	- the name of the device to check
755  *		slblk	- the starting offset of the device to check
756  *		nblks	- the number of blocks in the device to check
757  * OUTPUT:	ep	- return error pointer
758  * RETURNS:	int	-  0 - device contains soft partitions
759  *			  -1 - device does not contain soft partitions
760  * PURPOSE:	determines whether a device contains any soft partitions
761  */
762 /* ARGSUSED */
763 int
764 meta_check_insp(
765 	mdsetname_t	*sp,
766 	mdname_t	*np,
767 	diskaddr_t	slblk,
768 	diskaddr_t	nblks,
769 	md_error_t	*ep
770 )
771 {
772 	mdnamelist_t	*spnlp = NULL;	/* soft partition name list */
773 	int		count;
774 	int		rval;
775 
776 	/* check set pointer */
777 	assert(sp != NULL);
778 
779 	/*
780 	 * Get a list of the soft partitions that currently reside on
781 	 * the component.  We should ALWAYS force reload the cache,
782 	 * because if we're using the md.tab, we must rebuild
783 	 * the list because it won't contain the previous (if any)
784 	 * soft partition.
785 	 */
786 	/* find all soft partitions on the component */
787 	count = meta_sp_get_by_component(sp, np, &spnlp, 1, ep);
788 
789 	if (count == -1) {
790 		rval = -1;
791 	} else if (count > 0) {
792 		rval = mduseerror(ep, MDE_ALREADY, np->dev,
793 		    spnlp->namep->cname, np->cname);
794 	} else {
795 		rval = 0;
796 	}
797 
798 	metafreenamelist(spnlp);
799 	return (rval);
800 }
801 
802 /*
803  * **************************************************************************
804  *                    Extent List Manipulation Functions                    *
805  * **************************************************************************
806  */
807 
808 /*
809  * FUNCTION:	meta_sp_cmp_by_nameseq()
810  * INPUT:	e1	- first node to compare
811  *		e2	- second node to compare
812  * OUTPUT:	none
813  * RETURNS:	int	- =0 - nodes are equal
814  *			  <0 - e1 should go before e2
815  *			  >0 - e1 should go after e2
816  * PURPOSE:	used for sorted list inserts to build a list sorted by
817  *		name first and sequence number second.
818  */
819 static int
820 meta_sp_cmp_by_nameseq(sp_ext_node_t *e1, sp_ext_node_t *e2)
821 {
822 	int rval;
823 
824 	if (e1->ext_namep == NULL)
825 		return (1);
826 	if (e2->ext_namep == NULL)
827 		return (-1);
828 	if ((rval = strcmp(e1->ext_namep->cname, e2->ext_namep->cname)) != 0)
829 		return (rval);
830 
831 	/* the names are equal, compare sequence numbers */
832 	if (e1->ext_seq > e2->ext_seq)
833 		return (1);
834 	if (e1->ext_seq < e2->ext_seq)
835 		return (-1);
836 	/* sequence numbers are also equal */
837 	return (0);
838 }
839 
840 /*
841  * FUNCTION:	meta_sp_cmp_by_offset()
842  * INPUT:	e1	- first node to compare
843  *		e2	- second node to compare
844  * OUTPUT:	none
845  * RETURNS:	int	- =0 - nodes are equal
846  *			  <0 - e1 should go before e2
847  *			  >0 - e1 should go after e2
848  * PURPOSE:	used for sorted list inserts to build a list sorted by offset
849  */
850 static int
851 meta_sp_cmp_by_offset(sp_ext_node_t *e1, sp_ext_node_t *e2)
852 {
853 	if (e1->ext_offset > e2->ext_offset)
854 		return (1);
855 	if (e1->ext_offset < e2->ext_offset)
856 		return (-1);
857 	/* offsets are equal */
858 	return (0);
859 }
860 
861 /*
862  * FUNCTION:	meta_sp_list_insert()
863  * INPUT:	sp	- the set name for the device the node belongs to
864  *		np	- the name of the device the node belongs to
865  *		head	- the head of the list, must be NULL for empty list
866  *		offset	- the physical offset of this extent in sectors
867  *		length	- the length of this extent in sectors
868  *		type	- the type of the extent being inserted
869  *		seq	- the sequence number of the extent being inserted
870  *		flags	- extent flags (eg. whether it needs to be updated)
871  *		compare	- the compare function to use
872  * OUTPUT:	head	- points to the new head if a node was inserted
873  *			  at the beginning
874  * RETURNS:	void
875  * PURPOSE:	inserts an extent node into a sorted doubly linked list.
876  *		The sort order is determined by the compare function.
877  *		Memory is allocated for the node in this function and it
878  *		is up to the caller to free it, possibly using
879  *		meta_sp_list_free().  If a node is inserted at the
880  *		beginning of the list, the head pointer is updated to
881  *		point to the new first node.
882  */
883 static void
884 meta_sp_list_insert(
885 	mdsetname_t	*sp,
886 	mdname_t	*np,
887 	sp_ext_node_t	**head,
888 	sp_ext_offset_t	offset,
889 	sp_ext_length_t	length,
890 	sp_ext_type_t	type,
891 	uint_t		seq,
892 	uint_t		flags,
893 	ext_cmpfunc_t	compare
894 )
895 {
896 	sp_ext_node_t	*newext;
897 	sp_ext_node_t	*curext;
898 
899 	assert(head != NULL);
900 
901 	/* Don't bother adding zero length nodes */
902 	if (length == 0ULL)
903 		return;
904 
905 	/* allocate and fill in new ext_node */
906 	newext = Zalloc(sizeof (sp_ext_node_t));
907 
908 	newext->ext_offset = offset;
909 	newext->ext_length = length;
910 	newext->ext_flags = flags;
911 	newext->ext_type = type;
912 	newext->ext_seq = seq;
913 	newext->ext_setp = sp;
914 	newext->ext_namep = np;
915 
916 	/* first node in the list */
917 	if (*head == NULL) {
918 		newext->ext_next = newext->ext_prev = NULL;
919 		*head = newext;
920 	} else if ((*compare)(*head, newext) >= 0) {
921 		/* the first node has a bigger offset, so insert before it */
922 		assert((*head)->ext_prev == NULL);
923 
924 		newext->ext_prev = NULL;
925 		newext->ext_next = *head;
926 		(*head)->ext_prev = newext;
927 		*head = newext;
928 	} else {
929 		/*
930 		 * find the next node whose offset is greater than
931 		 * the one we want to insert, or the end of the list.
932 		 */
933 		for (curext = *head;
934 		    (curext->ext_next != NULL) &&
935 		    ((*compare)(curext->ext_next, newext) < 0);
936 		    (curext = curext->ext_next))
937 			;
938 
939 		/* link the new node in after the current node */
940 		newext->ext_next = curext->ext_next;
941 		newext->ext_prev = curext;
942 
943 		if (curext->ext_next != NULL)
944 			curext->ext_next->ext_prev = newext;
945 
946 		curext->ext_next = newext;
947 	}
948 }
949 
950 /*
951  * FUNCTION:	meta_sp_list_free()
952  * INPUT:	head	- the head of the list, must be NULL for empty list
953  * OUTPUT:	head	- points to NULL on return
954  * RETURNS:	void
955  * PURPOSE:	walks a double linked extent list and frees each node
956  */
957 static void
958 meta_sp_list_free(sp_ext_node_t **head)
959 {
960 	sp_ext_node_t	*ext;
961 	sp_ext_node_t	*next;
962 
963 	assert(head != NULL);
964 
965 	ext = *head;
966 	while (ext) {
967 		next = ext->ext_next;
968 		Free(ext);
969 		ext = next;
970 	}
971 	*head = NULL;
972 }
973 
974 /*
975  * FUNCTION:	meta_sp_list_remove()
976  * INPUT:	head	- the head of the list, must be NULL for empty list
977  *		ext	- the extent to remove, must be a member of the list
978  * OUTPUT:	head	- points to the new head of the list
979  * RETURNS:	void
980  * PURPOSE:	unlinks the node specified by ext from the list and
981  *		frees it, possibly moving the head pointer forward if
982  *		the head is the node being removed.
983  */
984 static void
985 meta_sp_list_remove(sp_ext_node_t **head, sp_ext_node_t *ext)
986 {
987 	assert(head != NULL);
988 	assert(*head != NULL);
989 
990 	if (*head == ext)
991 		*head = ext->ext_next;
992 
993 	if (ext->ext_prev != NULL)
994 		ext->ext_prev->ext_next = ext->ext_next;
995 	if (ext->ext_next != NULL)
996 		ext->ext_next->ext_prev = ext->ext_prev;
997 	Free(ext);
998 }
999 
1000 /*
1001  * FUNCTION:	meta_sp_list_size()
1002  * INPUT:	head	- the head of the list, must be NULL for empty list
1003  *		exttype	- the type of the extents to sum
1004  *		exclude_wm - subtract space for extent headers from total
1005  * OUTPUT:	none
1006  * RETURNS:	sp_ext_length_t	- the sum of all of the lengths
1007  * PURPOSE:	sums the lengths of all extents in the list matching the
1008  *		specified type.  This could be used for computing the
1009  *		amount of free or used space, for example.
1010  */
1011 static sp_ext_length_t
1012 meta_sp_list_size(sp_ext_node_t *head, sp_ext_type_t exttype, int exclude_wm)
1013 {
1014 	sp_ext_node_t	*ext;
1015 	sp_ext_length_t	size = 0LL;
1016 
1017 	for (ext = head; ext != NULL; ext = ext->ext_next)
1018 		if (ext->ext_type == exttype)
1019 			size += ext->ext_length -
1020 			    ((exclude_wm) ? MD_SP_WMSIZE : 0);
1021 
1022 	return (size);
1023 }
1024 
1025 /*
1026  * FUNCTION:	meta_sp_list_find()
1027  * INPUT:	head	- the head of the list, must be NULL for empty list
1028  *		offset	- the offset contained by the node to find
1029  * OUTPUT:	none
1030  * RETURNS:	sp_ext_node_t *	- the node containing the requested offset
1031  *				  or NULL if no such nodes were found.
1032  * PURPOSE:	finds a node in a list containing the requested offset
1033  *		(inclusive).  If multiple nodes contain this offset then
1034  *		only the first will be returned, though typically these
1035  *		lists are managed with non-overlapping nodes.
1036  *
1037  *		*The list MUST be sorted by offset for this function to work.*
1038  */
1039 static sp_ext_node_t *
1040 meta_sp_list_find(
1041 	sp_ext_node_t	*head,
1042 	sp_ext_offset_t	offset
1043 )
1044 {
1045 	sp_ext_node_t	*ext;
1046 
1047 	for (ext = head; ext != NULL; ext = ext->ext_next) {
1048 		/* check if the offset lies within this extent */
1049 		if ((offset >= ext->ext_offset) &&
1050 		    (offset < ext->ext_offset + ext->ext_length)) {
1051 			/*
1052 			 * the requested extent should always be a
1053 			 * subset of an extent in the list.
1054 			 */
1055 			return (ext);
1056 		}
1057 	}
1058 	return (NULL);
1059 }
1060 
1061 /*
1062  * FUNCTION:	meta_sp_list_freefill()
1063  * INPUT:	head	- the head of the list, must be NULL for empty list
1064  *		size	- the size of the volume this extent list is
1065  *			  representing
1066  * OUTPUT:	head	- the new head of the list
1067  * RETURNS:	void
1068  * PURPOSE:	finds gaps in the extent list and fills them with a free
1069  *		node.  If there is a gap at the beginning the head
1070  *		pointer will be changed to point to the new free node.
1071  *		If there is free space at the end, the last free extent
1072  *		will extend all the way out to the size specified.
1073  *
1074  *		*The list MUST be sorted by offset for this function to work.*
1075  */
1076 static void
1077 meta_sp_list_freefill(
1078 	sp_ext_node_t	**head,
1079 	sp_ext_length_t	size
1080 )
1081 {
1082 	sp_ext_node_t	*ext;
1083 	sp_ext_offset_t	curoff = 0LL;
1084 
1085 	for (ext = *head; ext != NULL; ext = ext->ext_next) {
1086 		if (curoff < ext->ext_offset)
1087 			meta_sp_list_insert(NULL, NULL, head,
1088 			    curoff, ext->ext_offset - curoff,
1089 			    EXTTYP_FREE, 0, 0, meta_sp_cmp_by_offset);
1090 		curoff = ext->ext_offset + ext->ext_length;
1091 	}
1092 
1093 	/* pad inverse list out to the end */
1094 	if (curoff < size)
1095 		meta_sp_list_insert(NULL, NULL, head, curoff, size - curoff,
1096 		    EXTTYP_FREE, 0, 0, meta_sp_cmp_by_offset);
1097 
1098 	if (getenv(META_SP_DEBUG)) {
1099 		meta_sp_debug("meta_sp_list_freefill: Extent list with "
1100 		    "holes freefilled:\n");
1101 		meta_sp_list_dump(*head);
1102 	}
1103 }
1104 
1105 /*
1106  * FUNCTION:	meta_sp_list_dump()
1107  * INPUT:	head	- the head of the list, must be NULL for empty list
1108  * OUTPUT:	none
1109  * RETURNS:	void
1110  * PURPOSE:	dumps the entire extent list to stdout for easy debugging
1111  */
1112 static void
1113 meta_sp_list_dump(sp_ext_node_t *head)
1114 {
1115 	sp_ext_node_t	*ext;
1116 
1117 	meta_sp_debug("meta_sp_list_dump: dumping extent list:\n");
1118 	meta_sp_debug("%5s %10s %5s %7s %10s %10s %5s %10s %10s\n", "Name",
1119 	    "Addr", "Seq#", "Type", "Offset", "Length", "Flags", "Prev",
1120 	    "Next");
1121 	for (ext = head; ext != NULL; ext = ext->ext_next) {
1122 		if (ext->ext_namep != NULL)
1123 			meta_sp_debug("%5s", ext->ext_namep->cname);
1124 		else
1125 			meta_sp_debug("%5s", "NONE");
1126 
1127 		meta_sp_debug("%10p %5u ", (void *) ext, ext->ext_seq);
1128 		switch (ext->ext_type) {
1129 		case EXTTYP_ALLOC:
1130 			meta_sp_debug("%7s ", "ALLOC");
1131 			break;
1132 		case EXTTYP_FREE:
1133 			meta_sp_debug("%7s ", "FREE");
1134 			break;
1135 		case EXTTYP_END:
1136 			meta_sp_debug("%7s ", "END");
1137 			break;
1138 		case EXTTYP_RESERVED:
1139 			meta_sp_debug("%7s ", "RESV");
1140 			break;
1141 		default:
1142 			meta_sp_debug("%7s ", "INVLD");
1143 			break;
1144 		}
1145 
1146 		meta_sp_debug("%10llu %10llu %5u %10p %10p\n",
1147 		    ext->ext_offset, ext->ext_length,
1148 		    ext->ext_flags, (void *) ext->ext_prev,
1149 		    (void *) ext->ext_next);
1150 	}
1151 	meta_sp_debug("\n");
1152 }
1153 
1154 /*
1155  * FUNCTION:	meta_sp_list_overlaps()
1156  * INPUT:	head	- the head of the list, must be NULL for empty list
1157  * OUTPUT:	none
1158  * RETURNS:	int	- 1 if extents overlap, 0 if ok
1159  * PURPOSE:	checks a list for overlaps.  The list MUST be sorted by
1160  *		offset for this function to work properly.
1161  */
1162 static int
1163 meta_sp_list_overlaps(sp_ext_node_t *head)
1164 {
1165 	sp_ext_node_t	*ext;
1166 
1167 	for (ext = head; ext->ext_next != NULL; ext = ext->ext_next) {
1168 		if (ext->ext_offset + ext->ext_length >
1169 		    ext->ext_next->ext_offset)
1170 			return (1);
1171 	}
1172 	return (0);
1173 }
1174 
1175 /*
1176  * **************************************************************************
1177  *                        Extent Allocation Functions                       *
1178  * **************************************************************************
1179  */
1180 
1181 /*
1182  * FUNCTION:	meta_sp_alloc_by_ext()
1183  * INPUT:	sp	- the set name for the device the node belongs to
1184  *		np	- the name of the device the node belongs to
1185  *		head	- the head of the list, must be NULL for empty list
1186  *		free_ext	- the free extent being allocated from
1187  *		alloc_offset	- the offset of the allocation
1188  *		alloc_len	- the length of the allocation
1189  *		seq		- the sequence number of the allocation
1190  * OUTPUT:	head	- the new head pointer
1191  * RETURNS:	void
1192  * PURPOSE:	allocates a portion of the free extent free_ext.  The
1193  *		allocated portion starts at alloc_offset and is
1194  *		alloc_length long.  Both (alloc_offset) and (alloc_offset +
1195  *		alloc_length) must be contained within the free extent.
1196  *
1197  *		The free extent is split into as many as 3 pieces - a
1198  *		free extent containing [ free_offset .. alloc_offset ), an
1199  *		allocated extent containing the range [ alloc_offset ..
1200  *		alloc_end ], and another free extent containing the
1201  *		range ( alloc_end .. free_end ].  If either of the two
1202  *		new free extents would be zero length, they are not created.
1203  *
1204  *		Finally, the original free extent is removed.  All newly
1205  *		created extents have the EXTFLG_UPDATE flag set.
1206  */
1207 static void
1208 meta_sp_alloc_by_ext(
1209 	mdsetname_t	*sp,
1210 	mdname_t	*np,
1211 	sp_ext_node_t	**head,
1212 	sp_ext_node_t	*free_ext,
1213 	sp_ext_offset_t	alloc_offset,
1214 	sp_ext_length_t	alloc_length,
1215 	uint_t		seq
1216 )
1217 {
1218 	sp_ext_offset_t	free_offset = free_ext->ext_offset;
1219 	sp_ext_length_t	free_length = free_ext->ext_length;
1220 
1221 	sp_ext_offset_t	alloc_end = alloc_offset + alloc_length;
1222 	sp_ext_offset_t	free_end  = free_offset  + free_length;
1223 
1224 	/* allocated extent must be a subset of the free extent */
1225 	assert(free_offset <= alloc_offset);
1226 	assert(free_end >= alloc_end);
1227 
1228 	meta_sp_list_remove(head, free_ext);
1229 
1230 	if (free_offset < alloc_offset) {
1231 		meta_sp_list_insert(NULL, NULL, head, free_offset,
1232 		    (alloc_offset - free_offset), EXTTYP_FREE, 0,
1233 		    EXTFLG_UPDATE, meta_sp_cmp_by_offset);
1234 	}
1235 
1236 	if (free_end > alloc_end) {
1237 		meta_sp_list_insert(NULL, NULL, head, alloc_end,
1238 		    (free_end - alloc_end), EXTTYP_FREE, 0, EXTFLG_UPDATE,
1239 		    meta_sp_cmp_by_offset);
1240 	}
1241 
1242 	meta_sp_list_insert(sp, np, head, alloc_offset, alloc_length,
1243 	    EXTTYP_ALLOC, seq, EXTFLG_UPDATE, meta_sp_cmp_by_offset);
1244 
1245 	if (getenv(META_SP_DEBUG)) {
1246 		meta_sp_debug("meta_sp_alloc_by_ext: extent list:\n");
1247 		meta_sp_list_dump(*head);
1248 	}
1249 }
1250 
1251 /*
1252  * FUNCTION:	meta_sp_alloc_by_len()
1253  * INPUT:	sp	- the set name for the device the node belongs to
1254  *		np	- the name of the device the node belongs to
1255  *		head	- the head of the list, must be NULL for empty list
1256  *		*lp	- the requested length to allocate
1257  *		last_off	- the last offset already allocated.
1258  *		alignment	- the desired extent alignmeent
1259  * OUTPUT:	head	- the new head pointer
1260  *		*lp	- the length allocated
1261  * RETURNS:	int	- -1 if error, the number of new extents on success
1262  * PURPOSE:	allocates extents from free space to satisfy the requested
1263  *		length.  If requested length is zero, allocates all
1264  *		remaining free space.  This function provides the meat
1265  *		of the extent allocation algorithm.  Allocation is a
1266  *		three tier process:
1267  *
1268  *		1. If last_off is nonzero and there is free space following
1269  *		   that node, then it is extended to allocate as much of that
1270  *		   free space as possible.  This is useful for metattach.
1271  *		2. If a free extent can be found to satisfy the remaining
1272  *		   requested space, then satisfy the rest of the request
1273  *		   from that extent.
1274  *		3. Start allocating space from any remaining free extents until
1275  *		   the remainder of the request is satisified.
1276  *
1277  *              If alignment is non-zero, then every extent modified
1278  *              or newly allocated will be aligned modulo alignment,
1279  *              with a length that is an integer multiple of
1280  *              alignment.
1281  *
1282  *		The EXTFLG_UPDATE flag is set for all nodes (free and
1283  *		allocated) that require updated watermarks.
1284  *
1285  *		This algorithm may have a negative impact on fragmentation
1286  *		in pathological cases and may be improved if it turns out
1287  *		to be a problem.  This may be exacerbated by particularly
1288  *		large alignments.
1289  *
1290  * NOTE:	It's confusing, so it demands an explanation:
1291  *		- len is used to represent requested data space; it
1292  *		  does not include room for a watermark.  On each full
1293  *		  or partial allocation, len will be decremented by
1294  *		  alloc_len (see next paragraph) until it reaches
1295  *		  zero.
1296  *		- alloc_len is used to represent data space allocated
1297  *		  from a particular extent; it does not include space
1298  *		  for a watermark.  In the rare event that a_length
1299  *		  (see next paragraph) is equal to MD_SP_WMSIZE,
1300  *		  alloc_len will be zero and the resulting MD_SP_WMSIZE
1301  *		  fragment of space will be utterly unusable.
1302  *		- a_length is used to represent all space to be
1303  *		  allocated from a particular extent; it DOES include
1304  *		  space for a watermark.
1305  */
1306 static int
1307 meta_sp_alloc_by_len(
1308 	mdsetname_t	*sp,
1309 	mdname_t	*np,
1310 	sp_ext_node_t	**head,
1311 	sp_ext_length_t	*lp,
1312 	sp_ext_offset_t	last_off,
1313 	sp_ext_offset_t	alignment
1314 )
1315 {
1316 	sp_ext_node_t	*free_ext;
1317 	sp_ext_node_t	*alloc_ext;
1318 	uint_t		last_seq = 0;
1319 	uint_t		numexts = 0;
1320 	sp_ext_length_t	freespace;
1321 	sp_ext_length_t	alloc_len;
1322 	sp_ext_length_t	len;
1323 
1324 	/* We're DOA if we can't read *lp */
1325 	assert(lp != NULL);
1326 	len = *lp;
1327 
1328 	/*
1329 	 * Process the nominal case first: we've been given an actual
1330 	 * size argument, rather than the literal "all"
1331 	 */
1332 
1333 	if (len != 0) {
1334 
1335 		/*
1336 		 * Short circuit the check for free space.  This may
1337 		 * tell us we have enough space when we really don't
1338 		 * because each extent loses space to a watermark, but
1339 		 * it will always tell us there isn't enough space
1340 		 * correctly.  Worst case we do some extra work.
1341 		 */
1342 		freespace = meta_sp_list_size(*head, EXTTYP_FREE,
1343 		    INCLUDE_WM);
1344 
1345 		if (freespace < len)
1346 			return (-1);
1347 
1348 		/*
1349 		 * First see if we can extend the last extent for an
1350 		 * attach.
1351 		 */
1352 		if (last_off != 0LL) {
1353 			int align = 0;
1354 
1355 			alloc_ext =
1356 			    meta_sp_list_find(*head, last_off);
1357 			assert(alloc_ext != NULL);
1358 
1359 			/*
1360 			 * The offset test reflects the
1361 			 * inclusion of the watermark in the extent
1362 			 */
1363 			align = (alignment > 0) &&
1364 			    (((alloc_ext->ext_offset + MD_SP_WMSIZE) %
1365 				alignment) == 0);
1366 
1367 			/*
1368 			 * If we decided not to align here, we should
1369 			 * also reset "alignment" so we don't bother
1370 			 * later, either.
1371 			 */
1372 			if (!align) {
1373 				alignment = 0;
1374 			}
1375 
1376 			last_seq = alloc_ext->ext_seq;
1377 
1378 			free_ext = meta_sp_list_find(*head,
1379 			    alloc_ext->ext_offset +
1380 			    alloc_ext->ext_length);
1381 
1382 			/*
1383 			 * If a free extent follows our last allocated
1384 			 * extent, then remove the last allocated
1385 			 * extent and increase the size of the free
1386 			 * extent to overlap it, then allocate the
1387 			 * total space from the new free extent.
1388 			 */
1389 			if (free_ext != NULL &&
1390 			    free_ext->ext_type == EXTTYP_FREE) {
1391 				assert(free_ext->ext_offset ==
1392 				    alloc_ext->ext_offset +
1393 				    alloc_ext->ext_length);
1394 
1395 				alloc_len =
1396 				    MIN(len, free_ext->ext_length);
1397 
1398 				if (align && (alloc_len < len)) {
1399 					/* No watermark space needed */
1400 					alloc_len -= alloc_len % alignment;
1401 				}
1402 
1403 				if (alloc_len > 0) {
1404 					free_ext->ext_offset -=
1405 					    alloc_ext->ext_length;
1406 					free_ext->ext_length +=
1407 					    alloc_ext->ext_length;
1408 
1409 					meta_sp_alloc_by_ext(sp, np, head,
1410 					    free_ext, free_ext->ext_offset,
1411 					    alloc_ext->ext_length + alloc_len,
1412 					    last_seq);
1413 
1414 					/*
1415 					 * now remove the original allocated
1416 					 * node.  We may have overlapping
1417 					 * extents for a short time before
1418 					 * this node is removed.
1419 					 */
1420 					meta_sp_list_remove(head, alloc_ext);
1421 					len -= alloc_len;
1422 				}
1423 			}
1424 			last_seq++;
1425 		}
1426 
1427 		if (len == 0LL)
1428 			goto out;
1429 
1430 		/*
1431 		 * Next, see if we can find a single allocation for
1432 		 * the remainder.  This may make fragmentation worse
1433 		 * in some cases, but there's no good way to allocate
1434 		 * that doesn't have a highly fragmented corner case.
1435 		 */
1436 		for (free_ext = *head; free_ext != NULL;
1437 			free_ext = free_ext->ext_next) {
1438 			sp_ext_offset_t	a_offset;
1439 			sp_ext_offset_t	a_length;
1440 
1441 			if (free_ext->ext_type != EXTTYP_FREE)
1442 				continue;
1443 
1444 			/*
1445 			 * The length test should include space for
1446 			 * the watermark
1447 			 */
1448 
1449 			a_offset = free_ext->ext_offset;
1450 			a_length = free_ext->ext_length;
1451 
1452 			if (alignment > 0) {
1453 
1454 				/*
1455 				 * Shortcut for extents that have been
1456 				 * previously added to pad out the
1457 				 * data space
1458 				 */
1459 				if (a_length < alignment) {
1460 					continue;
1461 				}
1462 
1463 				/*
1464 				 * Round up so the data space begins
1465 				 * on a properly aligned boundary.
1466 				 */
1467 				a_offset += alignment -
1468 				    (a_offset % alignment) - MD_SP_WMSIZE;
1469 
1470 				/*
1471 				 * This is only necessary in case the
1472 				 * watermark size is ever greater than
1473 				 * one.  It'll never happen, of
1474 				 * course; we'll get rid of watermarks
1475 				 * before we make 'em bigger.
1476 				 */
1477 				if (a_offset < free_ext->ext_offset) {
1478 					a_offset += alignment;
1479 				}
1480 
1481 				/*
1482 				 * Adjust the length to account for
1483 				 * the space lost above (if any)
1484 				 */
1485 				a_length -=
1486 					(a_offset - free_ext->ext_offset);
1487 			}
1488 
1489 			if (a_length >= len + MD_SP_WMSIZE) {
1490 				meta_sp_alloc_by_ext(sp, np, head,
1491 					free_ext, a_offset,
1492 					len + MD_SP_WMSIZE, last_seq);
1493 
1494 				len = 0LL;
1495 				numexts++;
1496 				break;
1497 			}
1498 		}
1499 
1500 		if (len == 0LL)
1501 			goto out;
1502 
1503 
1504 		/*
1505 		 * If the request could not be satisfied by extending
1506 		 * the last extent or by a single extent, then put
1507 		 * multiple smaller extents together until the request
1508 		 * is satisfied.
1509 		 */
1510 		for (free_ext = *head; (free_ext != NULL) && (len > 0);
1511 			free_ext = free_ext->ext_next) {
1512 			sp_ext_offset_t a_offset;
1513 			sp_ext_length_t a_length;
1514 
1515 			if (free_ext->ext_type != EXTTYP_FREE)
1516 				continue;
1517 
1518 			a_offset = free_ext->ext_offset;
1519 			a_length = free_ext->ext_length;
1520 
1521 			if (alignment > 0) {
1522 
1523 				/*
1524 				 * Shortcut for extents that have been
1525 				 * previously added to pad out the
1526 				 * data space
1527 				 */
1528 				if (a_length < alignment) {
1529 					continue;
1530 				}
1531 
1532 				/*
1533 				 * Round up so the data space begins
1534 				 * on a properly aligned boundary.
1535 				 */
1536 				a_offset += alignment -
1537 					(a_offset % alignment) - MD_SP_WMSIZE;
1538 
1539 				/*
1540 				 * This is only necessary in case the
1541 				 * watermark size is ever greater than
1542 				 * one.  It'll never happen, of
1543 				 * course; we'll get rid of watermarks
1544 				 * before we make 'em bigger.
1545 				 */
1546 				if (a_offset < free_ext->ext_offset) {
1547 					a_offset += alignment;
1548 				}
1549 
1550 				/*
1551 				 * Adjust the length to account for
1552 				 * the space lost above (if any)
1553 				 */
1554 				a_length -=
1555 					(a_offset - free_ext->ext_offset);
1556 
1557 				/*
1558 				 * Adjust the length to be properly
1559 				 * aligned if it is NOT to be the
1560 				 * last extent in the soft partition.
1561 				 */
1562 				if ((a_length - MD_SP_WMSIZE) < len)
1563 					a_length -=
1564 						(a_length - MD_SP_WMSIZE)
1565 						% alignment;
1566 			}
1567 
1568 			alloc_len = MIN(len, a_length - MD_SP_WMSIZE);
1569 			if (alloc_len == 0)
1570 				continue;
1571 
1572 			/*
1573 			 * meta_sp_alloc_by_ext() expects the
1574 			 * allocation length to include the watermark
1575 			 * size, which is why we don't simply pass in
1576 			 * alloc_len here.
1577 			 */
1578 			meta_sp_alloc_by_ext(sp, np, head, free_ext,
1579 				a_offset, MIN(len + MD_SP_WMSIZE, a_length),
1580 				last_seq);
1581 
1582 			len -= alloc_len;
1583 			numexts++;
1584 			last_seq++;
1585 		}
1586 
1587 
1588 		/*
1589 		 * If there was not enough space we can throw it all
1590 		 * away since no real work has been done yet.
1591 		 */
1592 		if (len != 0) {
1593 			meta_sp_list_free(head);
1594 			return (-1);
1595 		}
1596 	}
1597 
1598 	/*
1599 	 * Otherwise, the literal "all" was specified: allocate all
1600 	 * available free space.  Don't bother with alignment.
1601 	 */
1602 	else {
1603 		/* First, extend the last extent if this is a grow */
1604 		if (last_off != 0LL) {
1605 			alloc_ext =
1606 				meta_sp_list_find(*head, last_off);
1607 			assert(alloc_ext != NULL);
1608 
1609 			last_seq = alloc_ext->ext_seq;
1610 
1611 			free_ext = meta_sp_list_find(*head,
1612 				alloc_ext->ext_offset +
1613 				alloc_ext->ext_length);
1614 
1615 			/*
1616 			 * If a free extent follows our last allocated
1617 			 * extent, then remove the last allocated
1618 			 * extent and increase the size of the free
1619 			 * extent to overlap it, then allocate the
1620 			 * total space from the new free extent.
1621 			 */
1622 			if (free_ext != NULL &&
1623 			    free_ext->ext_type == EXTTYP_FREE) {
1624 				assert(free_ext->ext_offset ==
1625 				    alloc_ext->ext_offset +
1626 				    alloc_ext->ext_length);
1627 
1628 				len = alloc_len =
1629 				    free_ext->ext_length;
1630 
1631 				free_ext->ext_offset -=
1632 				    alloc_ext->ext_length;
1633 				free_ext->ext_length +=
1634 				    alloc_ext->ext_length;
1635 
1636 				meta_sp_alloc_by_ext(sp, np, head,
1637 				    free_ext, free_ext->ext_offset,
1638 				    alloc_ext->ext_length + alloc_len,
1639 				    last_seq);
1640 
1641 				/*
1642 				 * now remove the original allocated
1643 				 * node.  We may have overlapping
1644 				 * extents for a short time before
1645 				 * this node is removed.
1646 				 */
1647 				meta_sp_list_remove(head, alloc_ext);
1648 			}
1649 
1650 			last_seq++;
1651 		}
1652 
1653 		/* Next, grab all remaining free space */
1654 		for (free_ext = *head; free_ext != NULL;
1655 			free_ext = free_ext->ext_next) {
1656 
1657 			if (free_ext->ext_type == EXTTYP_FREE) {
1658 				alloc_len =
1659 				    free_ext->ext_length - MD_SP_WMSIZE;
1660 				if (alloc_len == 0)
1661 					continue;
1662 
1663 				/*
1664 				 * meta_sp_alloc_by_ext() expects the
1665 				 * allocation length to include the
1666 				 * watermark size, which is why we
1667 				 * don't simply pass in alloc_len
1668 				 * here.
1669 				 */
1670 				meta_sp_alloc_by_ext(sp, np, head,
1671 				    free_ext, free_ext->ext_offset,
1672 				    free_ext->ext_length,
1673 				    last_seq);
1674 
1675 				len += alloc_len;
1676 				numexts++;
1677 				last_seq++;
1678 			}
1679 		}
1680 	}
1681 
1682 out:
1683 	if (getenv(META_SP_DEBUG)) {
1684 		meta_sp_debug("meta_sp_alloc_by_len: Extent list after "
1685 		    "allocation:\n");
1686 		meta_sp_list_dump(*head);
1687 	}
1688 
1689 	if (*lp == 0) {
1690 		*lp = len;
1691 
1692 		/*
1693 		 * Make sure the callers hit a no space error if we
1694 		 * didn't actually find anything.
1695 		 */
1696 		if (len == 0) {
1697 			return (-1);
1698 		}
1699 	}
1700 
1701 	return (numexts);
1702 }
1703 
1704 /*
1705  * FUNCTION:	meta_sp_alloc_by_list()
1706  * INPUT:	sp	- the set name for the device the node belongs to
1707  *		np	- the name of the device the node belongs to
1708  *		head	- the head of the list, must be NULL for empty list
1709  *		oblist	- an extent list containing requested nodes to allocate
1710  * OUTPUT:	head	- the new head pointer
1711  * RETURNS:	int	- -1 if error, the number of new extents on success
1712  * PURPOSE:	allocates extents from free space to satisfy the requested
1713  *		extent list.  This is primarily used for the -o/-b options
1714  *		where the user may specifically request extents to allocate.
1715  *		Each extent in the oblist must be a subset (inclusive) of a
1716  *		free extent and may not overlap each other.  This
1717  *		function sets the EXTFLG_UPDATE flag for each node that
1718  *		requires a watermark update after allocating.
1719  */
1720 static int
1721 meta_sp_alloc_by_list(
1722 	mdsetname_t	*sp,
1723 	mdname_t	*np,
1724 	sp_ext_node_t	**head,
1725 	sp_ext_node_t	*oblist
1726 )
1727 {
1728 	sp_ext_node_t	*ext;
1729 	sp_ext_node_t	*free_ext;
1730 	uint_t		numexts = 0;
1731 
1732 	for (ext = oblist; ext != NULL; ext = ext->ext_next) {
1733 
1734 		free_ext = meta_sp_list_find(*head,
1735 		    ext->ext_offset - MD_SP_WMSIZE);
1736 
1737 		/* Make sure the allocation is within the free extent */
1738 		if ((free_ext == NULL) ||
1739 		    (ext->ext_offset + ext->ext_length >
1740 		    free_ext->ext_offset + free_ext->ext_length) ||
1741 		    (free_ext->ext_type != EXTTYP_FREE))
1742 			return (-1);
1743 
1744 		meta_sp_alloc_by_ext(sp, np, head, free_ext,
1745 		    ext->ext_offset - MD_SP_WMSIZE,
1746 		    ext->ext_length + MD_SP_WMSIZE, ext->ext_seq);
1747 
1748 		numexts++;
1749 	}
1750 
1751 	assert(meta_sp_list_overlaps(*head) == 0);
1752 
1753 	if (getenv(META_SP_DEBUG)) {
1754 		meta_sp_debug("meta_sp_alloc_by_list: Extent list after "
1755 		    "allocation:\n");
1756 		meta_sp_list_dump(*head);
1757 	}
1758 
1759 	return (numexts);
1760 }
1761 
1762 /*
1763  * **************************************************************************
1764  *                     Extent List Population Functions                     *
1765  * **************************************************************************
1766  */
1767 
1768 /*
1769  * FUNCTION:	meta_sp_extlist_from_namelist()
1770  * INPUT:	sp	- the set name for the device the node belongs to
1771  *		spnplp	- the namelist of soft partitions to build a list from
1772  * OUTPUT:	extlist	- the extent list built from the SPs in the namelist
1773  *		ep	- return error pointer
1774  * RETURNS:	int	- -1 if error, 0 on success
1775  * PURPOSE:	builds an extent list representing the soft partitions
1776  *		specified in the namelist.  Each extent in each soft
1777  *		partition is added to the list with the type EXTTYP_ALLOC.
1778  *		The EXTFLG_UPDATE flag is not set on any nodes.  Each
1779  *		extent in the list includes the space occupied by the
1780  *		watermark, which is not included in the unit structures.
1781  */
1782 static int
1783 meta_sp_extlist_from_namelist(
1784 	mdsetname_t	*sp,
1785 	mdnamelist_t	*spnlp,
1786 	sp_ext_node_t	**extlist,
1787 	md_error_t	*ep
1788 )
1789 {
1790 	int		extn;
1791 	md_sp_t		*msp;		/* unit structure of the sp's */
1792 	mdnamelist_t	*namep;
1793 
1794 	assert(sp != NULL);
1795 
1796 	/*
1797 	 * Now go through the soft partitions and add a node to the used
1798 	 * list for each allocated extent.
1799 	 */
1800 	for (namep = spnlp; namep != NULL; namep = namep->next) {
1801 		mdname_t	*curnp = namep->namep;
1802 
1803 		/* get the unit structure */
1804 		if ((msp = meta_get_sp_common(sp, curnp, 0, ep)) == NULL)
1805 			return (-1);
1806 
1807 		for (extn = 0; (extn < msp->ext.ext_len); extn++) {
1808 			md_sp_ext_t	*extp = &msp->ext.ext_val[extn];
1809 
1810 			/*
1811 			 * subtract from offset and add to the length
1812 			 * to account for the watermark, which is not
1813 			 * contained in the extents in the unit structure.
1814 			 */
1815 			meta_sp_list_insert(sp, curnp, extlist,
1816 			    extp->poff - MD_SP_WMSIZE, extp->len + MD_SP_WMSIZE,
1817 			    EXTTYP_ALLOC, extn, 0, meta_sp_cmp_by_offset);
1818 		}
1819 	}
1820 	return (0);
1821 }
1822 
1823 /*
1824  * FUNCTION:	meta_sp_extlist_from_wm()
1825  * INPUT:	sp	- the set name for the device the node belongs to
1826  *		compnp	- the name of the device to scan watermarks on
1827  * OUTPUT:	extlist	- the extent list built from the SPs in the namelist
1828  *		ep	- return error pointer
1829  * RETURNS:	int	- -1 if error, 0 on success
1830  * PURPOSE:	builds an extent list representing the soft partitions
1831  *		specified in the namelist.  Each extent in each soft
1832  *		partition is added to the list with the type EXTTYP_ALLOC.
1833  *		The EXTFLG_UPDATE flag is not set on any nodes.  Each
1834  *		extent in the list includes the space occupied by the
1835  *		watermark, which is not included in the unit structures.
1836  */
1837 static int
1838 meta_sp_extlist_from_wm(
1839 	mdsetname_t	*sp,
1840 	mdname_t	*compnp,
1841 	sp_ext_node_t	**extlist,
1842 	ext_cmpfunc_t	compare,
1843 	md_error_t	*ep
1844 )
1845 {
1846 	mp_watermark_t	wm;
1847 	mdname_t	*np = NULL;
1848 	mdsetname_t	*spsetp = NULL;
1849 	sp_ext_offset_t	cur_off;
1850 	md_set_desc	*sd;
1851 	int		init = 0;
1852 	mdkey_t		key;
1853 	minor_t		mnum;
1854 
1855 	if (!metaislocalset(sp)) {
1856 		if ((sd = metaget_setdesc(sp, ep)) == NULL)
1857 			return (-1);
1858 	}
1859 
1860 	if ((cur_off = meta_sp_get_start(sp, compnp, ep)) == MD_DISKADDR_ERROR)
1861 		return (-1);
1862 
1863 	for (;;) {
1864 		if (meta_sp_read_wm(sp, compnp, &wm, cur_off, ep) != 0) {
1865 			return (-1);
1866 		}
1867 
1868 		/* get the set and name pointers */
1869 		if (strcmp(wm.wm_setname, MD_SP_LOCALSETNAME) != 0) {
1870 			if ((spsetp = metasetname(wm.wm_setname, ep)) == NULL) {
1871 				return (-1);
1872 			}
1873 		}
1874 
1875 		/*
1876 		 * For the MN set, meta_init_make_device needs to
1877 		 * be run on all the nodes so the entries for the
1878 		 * softpart device name and its comp can be created
1879 		 * in the same order in the replica namespace.  If
1880 		 * we have it run on mdmn_do_iocset then the mddbs
1881 		 * will be out of sync between master node and slave
1882 		 * nodes.
1883 		 */
1884 		if (strcmp(wm.wm_mdname, MD_SP_FREEWMNAME) != 0) {
1885 
1886 		    if (!metaislocalset(sp) && MD_MNSET_DESC(sd)) {
1887 			md_mn_msg_addmdname_t	*send_params;
1888 			int			result;
1889 			md_mn_result_t		*resp = NULL;
1890 			int			message_size;
1891 
1892 			message_size =  sizeof (*send_params) +
1893 			    strlen(wm.wm_mdname) + 1;
1894 			send_params = Zalloc(message_size);
1895 			send_params->addmdname_setno = sp->setno;
1896 			(void) strcpy(&send_params->addmdname_name[0],
1897 			    wm.wm_mdname);
1898 			result = mdmn_send_message(sp->setno,
1899 			    MD_MN_MSG_ADDMDNAME,
1900 			    MD_MSGF_PANIC_WHEN_INCONSISTENT,
1901 			    (char *)send_params, message_size, &resp,
1902 			    ep);
1903 			Free(send_params);
1904 			if (resp != NULL) {
1905 				if (resp->mmr_exitval != 0) {
1906 					free_result(resp);
1907 					return (-1);
1908 				}
1909 				free_result(resp);
1910 			}
1911 			if (result != 0)
1912 				return (-1);
1913 		    } else {
1914 
1915 			if (!is_existing_meta_hsp(sp, wm.wm_mdname)) {
1916 			    if ((key = meta_init_make_device(&sp,
1917 				wm.wm_mdname, ep)) <= 0) {
1918 					return (-1);
1919 				}
1920 				init = 1;
1921 			}
1922 		    }
1923 
1924 		    np = metaname(&spsetp, wm.wm_mdname, META_DEVICE, ep);
1925 		    if (np == NULL) {
1926 			if (init) {
1927 			    if (meta_getnmentbykey(sp->setno, MD_SIDEWILD,
1928 				key, NULL, &mnum, NULL, ep) != NULL) {
1929 				    (void) metaioctl(MD_IOCREM_DEV, &mnum,
1930 						ep, NULL);
1931 			    }
1932 			    (void) del_self_name(sp, key, ep);
1933 			}
1934 			return (-1);
1935 		    }
1936 		}
1937 
1938 		/* insert watermark into extent list */
1939 		meta_sp_list_insert(spsetp, np, extlist, cur_off,
1940 		    wm.wm_length + MD_SP_WMSIZE, wm.wm_type, wm.wm_seq,
1941 		    EXTFLG_UPDATE, compare);
1942 
1943 		/* if we see the end watermark, we're done */
1944 		if (wm.wm_type == EXTTYP_END)
1945 			break;
1946 
1947 		cur_off += wm.wm_length + 1;
1948 
1949 		/* clear out set and name pointers for next iteration */
1950 		np = NULL;
1951 		spsetp = NULL;
1952 	}
1953 
1954 	return (0);
1955 }
1956 
1957 /*
1958  * **************************************************************************
1959  *                        Print (metastat) Functions                        *
1960  * **************************************************************************
1961  */
1962 
1963 /*
1964  * FUNCTION:	meta_sp_short_print()
1965  * INPUT:	msp	- the unit structure to display
1966  *		fp	- the file pointer to send output to
1967  *		options	- print options from the command line processor
1968  * OUTPUT:	ep	- return error pointer
1969  * RETURNS:	int	- -1 if error, 0 on success
1970  * PURPOSE:	display a short report of the soft partition in md.tab
1971  *		form, primarily used for metastat -p.
1972  */
1973 static int
1974 meta_sp_short_print(
1975 	md_sp_t		*msp,
1976 	char		*fname,
1977 	FILE		*fp,
1978 	mdprtopts_t	options,
1979 	md_error_t	*ep
1980 )
1981 {
1982 	int	extn;
1983 
1984 	if (options & PRINT_LARGEDEVICES) {
1985 		if ((msp->common.revision & MD_64BIT_META_DEV) == 0)
1986 			return (0);
1987 	}
1988 
1989 	if (options & PRINT_FN) {
1990 		if ((msp->common.revision & MD_FN_META_DEV) == 0)
1991 			return (0);
1992 	}
1993 
1994 	/* print name and -p */
1995 	if (fprintf(fp, "%s -p", msp->common.namep->cname) == EOF)
1996 		return (mdsyserror(ep, errno, fname));
1997 
1998 	/* print the component */
1999 	/*
2000 	 * Always print the full path name
2001 	 */
2002 	if (fprintf(fp, " %s", msp->compnamep->rname) == EOF)
2003 		return (mdsyserror(ep, errno, fname));
2004 
2005 	/* print out each extent */
2006 	for (extn = 0; (extn < msp->ext.ext_len); extn++) {
2007 		md_sp_ext_t	*extp = &msp->ext.ext_val[extn];
2008 		if (fprintf(fp, " -o %llu -b %llu ", extp->poff,
2009 		    extp->len) == EOF)
2010 			return (mdsyserror(ep, errno, fname));
2011 	}
2012 
2013 	if (fprintf(fp, "\n") == EOF)
2014 		return (mdsyserror(ep, errno, fname));
2015 
2016 	/* success */
2017 	return (0);
2018 }
2019 
2020 /*
2021  * FUNCTION:	meta_sp_status_to_name()
2022  * INPUT:	xsp_status	- the status value to convert to a string
2023  *		tstate		- transient errored device state. If set the
2024  *				  device is Unavailable
2025  * OUTPUT:	none
2026  * RETURNS:	char *	- a pointer to the string representing the status value
2027  * PURPOSE:	return an internationalized string representing the
2028  *		status value for a soft partition.  The strings are
2029  *		strdup'd and must be freed by the caller.
2030  */
2031 static char *
2032 meta_sp_status_to_name(
2033 	xsp_status_t	xsp_status,
2034 	uint_t		tstate
2035 )
2036 {
2037 	char *rval = NULL;
2038 
2039 	/*
2040 	 * Check to see if we have MD_INACCESSIBLE set. This is the only valid
2041 	 * value for an 'Unavailable' return. tstate can be set because of
2042 	 * other multi-node reasons (e.g. ABR being set)
2043 	 */
2044 	if (tstate & MD_INACCESSIBLE) {
2045 		return (Strdup(dgettext(TEXT_DOMAIN, "Unavailable")));
2046 	}
2047 
2048 	switch (xsp_status) {
2049 	case MD_SP_CREATEPEND:
2050 		rval = Strdup(dgettext(TEXT_DOMAIN, "Creating"));
2051 		break;
2052 	case MD_SP_GROWPEND:
2053 		rval = Strdup(dgettext(TEXT_DOMAIN, "Growing"));
2054 		break;
2055 	case MD_SP_DELPEND:
2056 		rval = Strdup(dgettext(TEXT_DOMAIN, "Deleting"));
2057 		break;
2058 	case MD_SP_OK:
2059 		rval = Strdup(dgettext(TEXT_DOMAIN, "Okay"));
2060 		break;
2061 	case MD_SP_ERR:
2062 		rval = Strdup(dgettext(TEXT_DOMAIN, "Errored"));
2063 		break;
2064 	case MD_SP_RECOVER:
2065 		rval = Strdup(dgettext(TEXT_DOMAIN, "Recovering"));
2066 		break;
2067 	}
2068 
2069 	if (rval == NULL)
2070 		rval = Strdup(dgettext(TEXT_DOMAIN, "Invalid"));
2071 
2072 	return (rval);
2073 }
2074 
2075 /*
2076  * FUNCTION:	meta_sp_report()
2077  * INPUT:	sp	- the set name for the unit being displayed
2078  *		msp	- the unit structure to display
2079  *		nlpp	- pass back the large devs
2080  *		fp	- the file pointer to send output to
2081  *		options	- print options from the command line processor
2082  * OUTPUT:	ep	- return error pointer
2083  * RETURNS:	int	- -1 if error, 0 on success
2084  * PURPOSE:	print a full report of the device specified
2085  */
2086 static int
2087 meta_sp_report(
2088 	mdsetname_t	*sp,
2089 	md_sp_t		*msp,
2090 	mdnamelist_t	**nlpp,
2091 	char		*fname,
2092 	FILE		*fp,
2093 	mdprtopts_t	options,
2094 	md_error_t	*ep
2095 )
2096 {
2097 	uint_t		extn;
2098 	char		*status;
2099 	char		*devid = "";
2100 	mdname_t	*didnp = NULL;
2101 	ddi_devid_t	dtp;
2102 	int		len;
2103 	uint_t		tstate = 0;
2104 
2105 	if (options & PRINT_LARGEDEVICES) {
2106 		if ((msp->common.revision & MD_64BIT_META_DEV) == 0) {
2107 			return (0);
2108 		} else {
2109 			if (meta_getdevs(sp, msp->common.namep, nlpp, ep) != 0)
2110 				return (-1);
2111 		}
2112 	}
2113 
2114 	if (options & PRINT_FN) {
2115 		if ((msp->common.revision & MD_FN_META_DEV) == 0) {
2116 			return (0);
2117 		} else {
2118 			if (meta_getdevs(sp, msp->common.namep, nlpp, ep) != 0)
2119 				return (-1);
2120 		}
2121 	}
2122 
2123 	if (options & PRINT_HEADER) {
2124 		if (fprintf(fp, dgettext(TEXT_DOMAIN, "%s: Soft Partition\n"),
2125 		    msp->common.namep->cname) == EOF)
2126 			return (mdsyserror(ep, errno, fname));
2127 	}
2128 
2129 	if (fprintf(fp, dgettext(TEXT_DOMAIN, "    Device: %s\n"),
2130 	    msp->compnamep->cname) == EOF)
2131 		return (mdsyserror(ep, errno, fname));
2132 
2133 	/* Determine if device is available before displaying status */
2134 	if (metaismeta(msp->common.namep)) {
2135 		if (meta_get_tstate(msp->common.namep->dev, &tstate, ep) != 0)
2136 			return (-1);
2137 	}
2138 	status = meta_sp_status_to_name(msp->status, tstate & MD_DEV_ERRORED);
2139 
2140 	/* print out "State" to be consistent with other metadevices */
2141 	if (tstate & MD_ABR_CAP) {
2142 		if (fprintf(fp, dgettext(TEXT_DOMAIN,
2143 		    "    State: %s - Application Based Recovery (ABR)\n"),
2144 		    status) == EOF) {
2145 			Free(status);
2146 			return (mdsyserror(ep, errno, fname));
2147 		}
2148 	} else {
2149 		if (fprintf(fp, dgettext(TEXT_DOMAIN,
2150 		    "    State: %s\n"), status) == EOF) {
2151 			Free(status);
2152 			return (mdsyserror(ep, errno, fname));
2153 		}
2154 	}
2155 	free(status);
2156 
2157 	if (fprintf(fp, dgettext(TEXT_DOMAIN, "    Size: %llu blocks (%s)\n"),
2158 	    msp->common.size,
2159 	    meta_number_to_string(msp->common.size, DEV_BSIZE)) == EOF)
2160 		return (mdsyserror(ep, errno, fname));
2161 
2162 	/* print component details */
2163 	if (! metaismeta(msp->compnamep)) {
2164 		diskaddr_t	start_blk;
2165 		int		has_mddb;
2166 		char		*has_mddb_str;
2167 
2168 		/* print header */
2169 		/*
2170 		 * Building a format string on the fly that will
2171 		 * be used in (f)printf. This allows the length
2172 		 * of the ctd to vary from small to large without
2173 		 * looking horrible.
2174 		 */
2175 		len = strlen(msp->compnamep->cname);
2176 		len = max(len, strlen(dgettext(TEXT_DOMAIN, "Device")));
2177 		len += 2;
2178 		if (fprintf(fp,
2179 		    "\t%-*.*s %-12.12s %-5.5s %s\n",
2180 		    len, len,
2181 		    dgettext(TEXT_DOMAIN, "Device"),
2182 		    dgettext(TEXT_DOMAIN, "Start Block"),
2183 		    dgettext(TEXT_DOMAIN, "Dbase"),
2184 		    dgettext(TEXT_DOMAIN, "Reloc")) == EOF) {
2185 			return (mdsyserror(ep, errno, fname));
2186 		}
2187 
2188 
2189 		/* get info */
2190 		if ((start_blk = meta_sp_get_start(sp, msp->compnamep, ep)) ==
2191 		    MD_DISKADDR_ERROR)
2192 			return (-1);
2193 
2194 		if ((has_mddb = metahasmddb(sp, msp->compnamep, ep)) < 0)
2195 			return (-1);
2196 
2197 		if (has_mddb)
2198 			has_mddb_str = dgettext(TEXT_DOMAIN, "Yes");
2199 		else
2200 			has_mddb_str = dgettext(TEXT_DOMAIN, "No");
2201 
2202 		/* populate the key in the name_p structure */
2203 		didnp = metadevname(&sp, msp->compnamep->dev, ep);
2204 		if (didnp == NULL) {
2205 			return (-1);
2206 		}
2207 
2208 		/* determine if devid does NOT exist */
2209 		if (options & PRINT_DEVID) {
2210 		    if ((dtp = meta_getdidbykey(sp->setno, getmyside(sp, ep),
2211 					didnp->key, ep)) == NULL)
2212 				devid = dgettext(TEXT_DOMAIN, "No ");
2213 			else {
2214 				devid = dgettext(TEXT_DOMAIN, "Yes");
2215 				free(dtp);
2216 			}
2217 		}
2218 
2219 		/* print info */
2220 		/*
2221 		 * This allows the length
2222 		 * of the ctd to vary from small to large without
2223 		 * looking horrible.
2224 		 */
2225 		if (fprintf(fp, "\t%-*s %8lld     %-5.5s %s\n",
2226 		    len, msp->compnamep->cname,
2227 		    start_blk, has_mddb_str, devid) == EOF) {
2228 			return (mdsyserror(ep, errno, fname));
2229 		}
2230 		(void) fprintf(fp, "\n");
2231 	}
2232 
2233 
2234 	/* print the headers */
2235 	if (fprintf(fp, "\t%6.6s %24.24s %24.24s\n",
2236 	    dgettext(TEXT_DOMAIN, "Extent"),
2237 	    dgettext(TEXT_DOMAIN, "Start Block"),
2238 	    dgettext(TEXT_DOMAIN, "Block count")) == EOF)
2239 		return (mdsyserror(ep, errno, fname));
2240 
2241 	/* print out each extent */
2242 	for (extn = 0; (extn < msp->ext.ext_len); extn++) {
2243 		md_sp_ext_t	*extp = &msp->ext.ext_val[extn];
2244 
2245 		/* If PRINT_TIMES option is ever supported, add output here */
2246 		if (fprintf(fp, "\t%6u %24llu %24llu\n",
2247 		    extn, extp->poff, extp->len) == EOF)
2248 			return (mdsyserror(ep, errno, fname));
2249 	}
2250 
2251 	/* separate records with a newline */
2252 	(void) fprintf(fp, "\n");
2253 	return (0);
2254 }
2255 
2256 /*
2257  * FUNCTION:	meta_sp_print()
2258  * INPUT:	sp	- the set name for the unit being displayed
2259  *		np	- the name of the device to print
2260  *		fname	- ??? not used
2261  *		fp	- the file pointer to send output to
2262  *		options	- print options from the command line processor
2263  * OUTPUT:	ep	- return error pointer
2264  * RETURNS:	int	- -1 if error, 0 on success
2265  * PURPOSE:	print a full report of the device specified by metastat.
2266  *		This is the main entry point for printing.
2267  */
2268 int
2269 meta_sp_print(
2270 	mdsetname_t	*sp,
2271 	mdname_t	*np,
2272 	mdnamelist_t	**nlpp,
2273 	char		*fname,
2274 	FILE		*fp,
2275 	mdprtopts_t	options,
2276 	md_error_t	*ep
2277 )
2278 {
2279 	md_sp_t		*msp;
2280 	md_unit_t	*mdp;
2281 	int		rval = 0;
2282 
2283 	/* should always have the same set */
2284 	assert(sp != NULL);
2285 
2286 	/* print all the soft partitions */
2287 	if (np == NULL) {
2288 		mdnamelist_t	*nlp = NULL;
2289 		mdnamelist_t	*p;
2290 		int		cnt;
2291 
2292 		if ((cnt = meta_get_sp_names(sp, &nlp, options, ep)) < 0)
2293 			return (-1);
2294 		else if (cnt == 0)
2295 			return (0);
2296 
2297 		/* recusively print them out */
2298 		for (p = nlp; (p != NULL); p = p->next) {
2299 			mdname_t	*curnp = p->namep;
2300 
2301 			/*
2302 			 * one problem with the rval of -1 here is that
2303 			 * the error gets "lost" when the next device is
2304 			 * printed, but we want to print them all anyway.
2305 			 */
2306 			rval = meta_sp_print(sp, curnp, nlpp, fname, fp,
2307 			    options, ep);
2308 		}
2309 
2310 		/* clean up, return success */
2311 		metafreenamelist(nlp);
2312 		return (rval);
2313 	}
2314 
2315 	/* get the unit structure */
2316 	if ((msp = meta_get_sp_common(sp, np,
2317 	    ((options & PRINT_FAST) ? 1 : 0), ep)) == NULL)
2318 		return (-1);
2319 
2320 	/* check for parented */
2321 	if ((! (options & PRINT_SUBDEVS)) &&
2322 	    (MD_HAS_PARENT(msp->common.parent))) {
2323 		return (0);
2324 	}
2325 
2326 	/* print appropriate detail */
2327 	if (options & PRINT_SHORT) {
2328 		if (meta_sp_short_print(msp, fname, fp, options, ep) != 0)
2329 			return (-1);
2330 	} else {
2331 		if (meta_sp_report(sp, msp, nlpp, fname, fp, options, ep) != 0)
2332 			return (-1);
2333 	}
2334 
2335 	/*
2336 	 * Print underlying metadevices if they are parented to us and
2337 	 * if the info for the underlying metadevice has not been printed.
2338 	 */
2339 	if (metaismeta(msp->compnamep)) {
2340 		/* get the unit structure for the subdevice */
2341 		if ((mdp = meta_get_mdunit(sp, msp->compnamep, ep)) == NULL)
2342 			return (-1);
2343 
2344 		/* If info not already printed, recurse */
2345 		if (!BT_TEST(sp_parent_printed, MD_MIN2UNIT(MD_SID(mdp)))) {
2346 			if (meta_print_name(sp, msp->compnamep, nlpp, fname, fp,
2347 			    (options | PRINT_HEADER | PRINT_SUBDEVS),
2348 			    NULL, ep) != 0) {
2349 				return (-1);
2350 			}
2351 			BT_SET(sp_parent_printed, MD_MIN2UNIT(MD_SID(mdp)));
2352 		}
2353 	}
2354 	return (0);
2355 }
2356 
2357 /*
2358  * **************************************************************************
2359  *                     Watermark Manipulation Functions                     *
2360  * **************************************************************************
2361  */
2362 
2363 /*
2364  * FUNCTION:	meta_sp_get_start()
2365  * INPUT:	sp	- the operating set
2366  *		np 	- device upon which the sp is being built
2367  * OUTPUT:	ep	- return error pointer
2368  * RETURNS:	daddr_t	- -1 if error, otherwise the start block
2369  * PURPOSE:	Encapsulate the determination of the start block of the
2370  *		device upon which the sp is built or being built.
2371  *		This is done to hide the ugliness of the algorithm.  In
2372  *		the case where a sp is being built upon a stripe of > 1
2373  *		TB that is made up of a set of disks in which the first
2374  *		has a VTOC label the result returned from the call to
2375  *		metagetstart is incorrect.  The reason being that a > 1
2376  *		TB metadevice will manufacture an EFI label in which the
2377  *		start address is zero.  This is irrespective of the underlying
2378  *		devices.  The long term fix for this is to fix
2379  *		meta_efi_to_mdvtoc and meta_efi_to mdgeom so that they return
2380  *		values that are indicative of the first underlying device in
2381  *		metadevice.
2382  */
2383 static diskaddr_t
2384 meta_sp_get_start(
2385 	mdsetname_t	*sp,
2386 	mdname_t	*np,
2387 	md_error_t	*ep
2388 )
2389 {
2390 	daddr_t		start_block;
2391 
2392 	if ((start_block = metagetstart(sp, np, ep)) != MD_DISKADDR_ERROR) {
2393 		start_block += MD_SP_START;
2394 		/*
2395 		 * In the case that the device upon which the sp is being
2396 		 * created is a metadevice then ensure that in the case that
2397 		 * the first underlying device has a vtoc label that it is
2398 		 * not overwritten with a watermark by setting the start block
2399 		 * to point just past the vtoc label
2400 		 */
2401 		if (start_block < VTOC_SIZE && metaismeta(np))
2402 			start_block = VTOC_SIZE;
2403 	}
2404 
2405 	return (start_block);
2406 }
2407 
2408 /*
2409  * FUNCTION:	meta_sp_update_wm()
2410  * INPUT:	sp	- the operating set
2411  *		msp	- a pointer to the XDR unit structure
2412  *		extlist	- the extent list specifying watermarks to update
2413  * OUTPUT:	ep	- return error pointer
2414  * RETURNS:	int	- -1 if error, 0 on success
2415  * PURPOSE:	steps backwards through the extent list updating
2416  *		watermarks for all extents with the EXTFLG_UPDATE flag
2417  *		set.  Writing the watermarks guarantees consistency when
2418  *		extents must be broken into pieces since the original
2419  *		watermark will be the last to be updated, and will be
2420  *		changed to point to a new watermark that is already
2421  *		known to be consistent.  If one of the writes fails, the
2422  *		original watermark stays intact and none of the changes
2423  *		are realized.
2424  */
2425 static int
2426 meta_sp_update_wm(
2427 	mdsetname_t	*sp,
2428 	md_sp_t		*msp,
2429 	sp_ext_node_t	*extlist,
2430 	md_error_t	*ep
2431 )
2432 {
2433 	sp_ext_node_t	*ext;
2434 	sp_ext_node_t	*tail;
2435 	mp_watermark_t	*wmp, *watermarks;
2436 	xsp_offset_t	*osp, *offsets;
2437 	int		update_count = 0;
2438 	int		rval = 0;
2439 	md_unit_t	*mdp;
2440 	md_sp_update_wm_t	update_params;
2441 
2442 	if (getenv(META_SP_DEBUG)) {
2443 		meta_sp_debug("meta_sp_update_wm: Updating watermarks:\n");
2444 		meta_sp_list_dump(extlist);
2445 	}
2446 
2447 	/*
2448 	 * find the last node so we can write the watermarks backwards
2449 	 * and count watermarks to update so we can allocate space
2450 	 */
2451 	for (ext = extlist; ext != NULL; ext = ext->ext_next) {
2452 		if ((ext->ext_flags & EXTFLG_UPDATE) != 0) {
2453 			update_count++;
2454 		}
2455 
2456 		if (ext->ext_next == NULL) {
2457 			tail = ext;
2458 		}
2459 	}
2460 	ext = tail;
2461 
2462 	wmp = watermarks =
2463 	    Zalloc(update_count * sizeof (mp_watermark_t));
2464 	osp = offsets =
2465 	    Zalloc(update_count * sizeof (sp_ext_offset_t));
2466 
2467 	while (ext != NULL) {
2468 		if ((ext->ext_flags & EXTFLG_UPDATE) != 0) {
2469 			/* update watermark */
2470 			wmp->wm_magic = MD_SP_MAGIC;
2471 			wmp->wm_version = MD_SP_VERSION;
2472 			wmp->wm_type = ext->ext_type;
2473 			wmp->wm_seq = ext->ext_seq;
2474 			wmp->wm_length = ext->ext_length - MD_SP_WMSIZE;
2475 
2476 			/* fill in the volume name and set name */
2477 			if (ext->ext_namep != NULL)
2478 				(void) strcpy(wmp->wm_mdname,
2479 				    ext->ext_namep->cname);
2480 			else
2481 				(void) strcpy(wmp->wm_mdname, MD_SP_FREEWMNAME);
2482 			if (ext->ext_setp != NULL &&
2483 			    ext->ext_setp->setno != MD_LOCAL_SET)
2484 				(void) strcpy(wmp->wm_setname,
2485 				    ext->ext_setp->setname);
2486 			else
2487 				(void) strcpy(wmp->wm_setname,
2488 				    MD_SP_LOCALSETNAME);
2489 
2490 			/* Generate the checksum */
2491 			wmp->wm_checksum = 0;
2492 			crcgen((uchar_t *)wmp, (uint_t *)&wmp->wm_checksum,
2493 			    sizeof (*wmp), NULL);
2494 
2495 			/* record the extent offset */
2496 			*osp = ext->ext_offset;
2497 
2498 			/* Advance the placeholders */
2499 			osp++; wmp++;
2500 		}
2501 		ext = ext->ext_prev;
2502 	}
2503 
2504 	mdp = meta_get_mdunit(sp, msp->common.namep, ep);
2505 	if (mdp == NULL) {
2506 		rval = -1;
2507 		goto out;
2508 	}
2509 
2510 	(void) memset(&update_params, 0, sizeof (update_params));
2511 	update_params.mnum = MD_SID(mdp);
2512 	update_params.count = update_count;
2513 	update_params.wmp = (uintptr_t)watermarks;
2514 	update_params.osp = (uintptr_t)offsets;
2515 	MD_SETDRIVERNAME(&update_params, MD_SP,
2516 	    MD_MIN2SET(update_params.mnum));
2517 
2518 	if (metaioctl(MD_IOC_SPUPDATEWM, &update_params,
2519 	    &update_params.mde, msp->common.namep->cname) != 0) {
2520 		(void) mdstealerror(ep, &update_params.mde);
2521 		rval = -1;
2522 		goto out;
2523 	}
2524 
2525 out:
2526 	Free(watermarks);
2527 	Free(offsets);
2528 
2529 	return (rval);
2530 }
2531 
2532 /*
2533  * FUNCTION:	meta_sp_clear_wm()
2534  * INPUT:	sp	- the operating set
2535  *		msp	- the unit structure for the soft partition to clear
2536  * OUTPUT:	ep	- return error pointer
2537  * RETURNS:	int	- -1 if error, 0 on success
2538  * PURPOSE:	steps through the extents for a soft partition unit and
2539  *		creates an extent list designed to mark all of the
2540  *		watermarks for those extents as free.  The extent list
2541  *		is then passed to meta_sp_update_wm() to actually write
2542  *		the watermarks out.
2543  */
2544 static int
2545 meta_sp_clear_wm(
2546 	mdsetname_t	*sp,
2547 	md_sp_t		*msp,
2548 	md_error_t	*ep
2549 )
2550 {
2551 	sp_ext_node_t	*extlist = NULL;
2552 	int		numexts = msp->ext.ext_len;
2553 	uint_t		i;
2554 	int		rval = 0;
2555 
2556 	/* for each watermark must set the flag to SP_FREE */
2557 	for (i = 0; i < numexts; i++) {
2558 		md_sp_ext_t	*extp = &msp->ext.ext_val[i];
2559 
2560 		meta_sp_list_insert(NULL, NULL, &extlist,
2561 		    extp->poff - MD_SP_WMSIZE, extp->len + MD_SP_WMSIZE,
2562 		    EXTTYP_FREE, 0, EXTFLG_UPDATE, meta_sp_cmp_by_offset);
2563 	}
2564 
2565 	/* update watermarks */
2566 	rval = meta_sp_update_wm(sp, msp, extlist, ep);
2567 
2568 	meta_sp_list_free(&extlist);
2569 	return (rval);
2570 }
2571 
2572 /*
2573  * FUNCTION:	meta_sp_read_wm()
2574  * INPUT:	sp	- setname for component
2575  *		compnp	- mdname_t for component
2576  *		offset	- the offset of the watermark to read (sectors)
2577  * OUTPUT:	wm	- the watermark structure to read into
2578  *		ep	- return error pointer
2579  * RETURNS:	int	- -1 if error, 0 on success
2580  * PURPOSE:	seeks out to the requested offset and reads a watermark.
2581  *		It then verifies that the magic number is correct and
2582  *		that the checksum is valid, returning an error if either
2583  *		is wrong.
2584  */
2585 static int
2586 meta_sp_read_wm(
2587 	mdsetname_t	*sp,
2588 	mdname_t	*compnp,
2589 	mp_watermark_t	*wm,
2590 	sp_ext_offset_t	offset,
2591 	md_error_t	*ep
2592 )
2593 {
2594 	md_sp_read_wm_t	read_params;
2595 
2596 	/*
2597 	 * make sure block offset does not overflow 2^64 bytes and it's a
2598 	 * multiple of the block size.
2599 	 */
2600 	assert(offset <= (1LL << (64 - DEV_BSHIFT)));
2601 	/* LINTED */
2602 	assert((sizeof (*wm) % DEV_BSIZE) == 0);
2603 
2604 	(void) memset(wm, 0, sizeof (*wm));
2605 
2606 	(void) memset(&read_params, 0, sizeof (read_params));
2607 	read_params.rdev = compnp->dev;
2608 	read_params.wmp = (uintptr_t)wm;
2609 	read_params.offset = offset;
2610 	MD_SETDRIVERNAME(&read_params, MD_SP, sp->setno);
2611 
2612 	if (metaioctl(MD_IOC_SPREADWM, &read_params,
2613 	    &read_params.mde, compnp->cname) != 0) {
2614 
2615 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
2616 		    "Extent header read failed, block %llu.\n"), offset);
2617 		return (mdstealerror(ep, &read_params.mde));
2618 	}
2619 
2620 	/* make sure magic number is correct */
2621 	if (wm->wm_magic != MD_SP_MAGIC) {
2622 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
2623 		    "found incorrect magic number %x, expected %x.\n"),
2624 		    wm->wm_magic, MD_SP_MAGIC);
2625 		/*
2626 		 * Pass NULL for the device name as we don't have
2627 		 * valid watermark contents.
2628 		 */
2629 		return (mdmderror(ep, MDE_SP_BADWMMAGIC, 0, NULL));
2630 	}
2631 
2632 	if (crcchk((uchar_t *)wm, (uint_t *)&wm->wm_checksum,
2633 	    sizeof (*wm), NULL)) {
2634 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
2635 		    "found incorrect checksum %x.\n"),
2636 		    wm->wm_checksum);
2637 		return (mdmderror(ep, MDE_SP_BADWMCRC, 0, wm->wm_mdname));
2638 	}
2639 
2640 	return (0);
2641 }
2642 
2643 /*
2644  * **************************************************************************
2645  *                  Query Functions
2646  * **************************************************************************
2647  */
2648 
2649 /*
2650  * IMPORTANT NOTE: This is a static function that assumes that
2651  *		   its input parameters have been checked and
2652  *		   have valid values that lie within acceptable
2653  *		   ranges.
2654  *
2655  * FUNCTION:	meta_sp_enough_space()
2656  * INPUT:	desired_number_of_sps - the number of soft partitions desired;
2657  *					must be > 0
2658  *		desired_sp_size - the desired soft partition size in blocks;
2659  *				  must be > 0
2660  *		extent_listpp - a reference to a reference to an extent
2661  *				list that lists the extents on a device;
2662  *				must be a reference to a reference to a
2663  *				valid extent list
2664  *		alignment - the desired data space alignment for the sp's
2665  * OUTPUT:	boolean_t return value
2666  * RETURNS:	boolean_t - B_TRUE if there's enough space in the extent
2667  *			    list to create the desired soft partitions,
2668  *			    B_FALSE if there's not enough space
2669  * PURPOSE:	determines whether there's enough free space in an extent
2670  *		list to allow creation of a set of soft partitions
2671  */
2672 static boolean_t
2673 meta_sp_enough_space(
2674 	int		desired_number_of_sps,
2675 	blkcnt_t	desired_sp_size,
2676 	sp_ext_node_t	**extent_listpp,
2677 	sp_ext_length_t	alignment
2678 )
2679 {
2680 	boolean_t		enough_space;
2681 	int			number_of_sps;
2682 	int			number_of_extents_used;
2683 	sp_ext_length_t		desired_ext_length = desired_sp_size;
2684 
2685 	enough_space = B_TRUE;
2686 	number_of_sps = 0;
2687 	while ((enough_space == B_TRUE) &&
2688 		(number_of_sps < desired_number_of_sps)) {
2689 		/*
2690 		 * Use the extent allocation algorithm implemented by
2691 		 * meta_sp_alloc_by_len() to test whether the free
2692 		 * extents in the extent list referenced by *extent_listpp
2693 		 * contain enough space to accomodate a soft partition
2694 		 * of size desired_ext_length.
2695 		 *
2696 		 * Repeat the test <desired_number_of_sps> times
2697 		 * or until it fails, whichever comes first,
2698 		 * each time allocating the extents required to
2699 		 * create the soft partition without actually
2700 		 * creating the soft partition.
2701 		 */
2702 		number_of_extents_used = meta_sp_alloc_by_len(
2703 						TEST_SETNAMEP,
2704 						TEST_SOFT_PARTITION_NAMEP,
2705 						extent_listpp,
2706 						&desired_ext_length,
2707 						NO_OFFSET,
2708 						alignment);
2709 		if (number_of_extents_used == -1) {
2710 			enough_space = B_FALSE;
2711 		} else {
2712 			number_of_sps++;
2713 		}
2714 	}
2715 	return (enough_space);
2716 }
2717 
2718 /*
2719  * IMPORTANT NOTE: This is a static function that calls other functions
2720  *		   that check its mdsetnamep and device_mdnamep
2721  *		   input parameters, but expects extent_listpp to
2722  *		   be a initialized to a valid address to which
2723  *		   it can write a reference to the extent list that
2724  *		   it creates.
2725  *
2726  * FUNCTION:	meta_sp_get_extent_list()
2727  * INPUT:	mdsetnamep - a reference to the mdsetname_t structure
2728  *			     for the set containing the device for
2729  *			     which the extents are to be listed
2730  *		device_mdnamep - a reference to the mdname_t structure
2731  *				 for the device for which the extents
2732  *				 are to be listed
2733  * OUTPUT:	*extent_listpp - a reference to the extent list for
2734  *				 the device; NULL if the function fails
2735  *		*ep - the libmeta error encountered, if any
2736  * RETURNS:	boolean_t - B_TRUE if the function call was successful,
2737  *			    B_FALSE if not
2738  * PURPOSE:	gets the extent list for a device
2739  */
2740 static boolean_t
2741 meta_sp_get_extent_list(
2742 	mdsetname_t	*mdsetnamep,
2743 	mdname_t	*device_mdnamep,
2744 	sp_ext_node_t	**extent_listpp,
2745 	md_error_t	*ep
2746 )
2747 {
2748 	diskaddr_t		device_size_in_blocks;
2749 	mdnamelist_t		*sp_name_listp;
2750 	diskaddr_t		start_block_address_in_blocks;
2751 
2752 	*extent_listpp = NULL;
2753 	sp_name_listp = NULL;
2754 
2755 	start_block_address_in_blocks = meta_sp_get_start(mdsetnamep,
2756 						device_mdnamep,
2757 						ep);
2758 	if (start_block_address_in_blocks == MD_DISKADDR_ERROR) {
2759 	    if (getenv(META_SP_DEBUG)) {
2760 		mde_perror(ep, "meta_sp_get_extent_list:meta_sp_get_start");
2761 	    }
2762 	    return (B_FALSE);
2763 	}
2764 
2765 	device_size_in_blocks = metagetsize(device_mdnamep, ep);
2766 	if (device_size_in_blocks == MD_DISKADDR_ERROR) {
2767 	    if (getenv(META_SP_DEBUG)) {
2768 		mde_perror(ep,
2769 		    "meta_sp_get_extent_list:metagetsize");
2770 	    }
2771 	    return (B_FALSE);
2772 	}
2773 
2774 	/*
2775 	 * Sanity check: the start block will have skipped an integer
2776 	 * number of cylinders, C.  C will usually be zero.  If (C > 0),
2777 	 * and the disk slice happens to only be C cylinders in total
2778 	 * size, we'll fail this check.
2779 	 */
2780 	if (device_size_in_blocks <=
2781 	    (start_block_address_in_blocks + MD_SP_WMSIZE)) {
2782 	    (void) mdmderror(ep, MDE_SP_NOSPACE, 0, device_mdnamep->cname);
2783 	    return (B_FALSE);
2784 	}
2785 
2786 	/*
2787 	 * After this point, we will have allocated resources, so any
2788 	 * failure returns must be through the supplied "fail" label
2789 	 * to properly deallocate things.
2790 	 */
2791 
2792 	/*
2793 	 * Create an empty extent list that starts one watermark past
2794 	 * the start block of the device and ends one watermark before
2795 	 * the end of the device.
2796 	 */
2797 	meta_sp_list_insert(TEST_SETNAMEP,
2798 			    TEST_SOFT_PARTITION_NAMEP,
2799 			    extent_listpp,
2800 			    NO_OFFSET,
2801 			    (sp_ext_length_t)start_block_address_in_blocks,
2802 			    EXTTYP_RESERVED,
2803 			    NO_SEQUENCE_NUMBER,
2804 			    NO_FLAGS,
2805 			    meta_sp_cmp_by_offset);
2806 	meta_sp_list_insert(TEST_SETNAMEP,
2807 			    TEST_SOFT_PARTITION_NAMEP,
2808 			    extent_listpp,
2809 			    (sp_ext_offset_t)(device_size_in_blocks -
2810 				MD_SP_WMSIZE),
2811 			    MD_SP_WMSIZE,
2812 			    EXTTYP_END,
2813 			    NO_SEQUENCE_NUMBER,
2814 			    NO_FLAGS,
2815 			    meta_sp_cmp_by_offset);
2816 
2817 	/*
2818 	 * Get the list of soft partitions that are already on the
2819 	 * device.
2820 	 */
2821 	if (meta_sp_get_by_component(mdsetnamep, device_mdnamep,
2822 	    &sp_name_listp, FORCE_RELOAD_CACHE, ep) < 1) {
2823 		if (getenv(META_SP_DEBUG)) {
2824 			mde_perror(ep,
2825 			    "meta_sp_get_extent_list:meta_sp_get_by_component");
2826 		}
2827 		goto fail;
2828 	}
2829 
2830 	if (sp_name_listp != NULL) {
2831 		/*
2832 		 * If there are soft partitions on the device, add the
2833 		 * extents used in them to the extent list.
2834 		 */
2835 		if (meta_sp_extlist_from_namelist(mdsetnamep, sp_name_listp,
2836 		    extent_listpp, ep) == -1) {
2837 			if (getenv(META_SP_DEBUG)) {
2838 				mde_perror(ep, "meta_sp_get_extent_list:"
2839 				    "meta_sp_extlist_from_namelist");
2840 			}
2841 			goto fail;
2842 		}
2843 		metafreenamelist(sp_name_listp);
2844 	}
2845 
2846 	/*
2847 	 * Add free extents to the extent list to represent
2848 	 * the remaining regions of free space on the
2849 	 * device.
2850 	 */
2851 	meta_sp_list_freefill(extent_listpp, device_size_in_blocks);
2852 	return (B_TRUE);
2853 
2854 fail:
2855 	if (sp_name_listp != NULL) {
2856 		metafreenamelist(sp_name_listp);
2857 	}
2858 
2859 	if (*extent_listpp != NULL) {
2860 		/*
2861 		 * meta_sp_list_free sets *extent_listpp to NULL.
2862 		 */
2863 		meta_sp_list_free(extent_listpp);
2864 	}
2865 	return (B_FALSE);
2866 }
2867 
2868 /*
2869  * IMPORTANT NOTE: This is a static function that calls other functions
2870  *		   that check its mdsetnamep and mddrivenamep
2871  *		   input parameters, but expects extent_listpp to
2872  *		   be a initialized to a valid address to which
2873  *		   be initialized to a valid address to which
2874  *		   it creates.
2875  *
2876  * FUNCTION:	meta_sp_get_extent_list_for_drive()
2877  * INPUT:	mdsetnamep - a reference to the mdsetname_t structure
2878  *			     for the set containing the drive for
2879  *			     which the extents are to be listed
2880  *		mddrivenamep   - a reference to the mddrivename_t structure
2881  *				 for the drive for which the extents
2882  *				 are to be listed
2883  * OUTPUT:	*extent_listpp - a reference to the extent list for
2884  *				 the drive; NULL if the function fails
2885  * RETURNS:	boolean_t - B_TRUE if the function call was successful,
2886  *			    B_FALSE if not
2887  * PURPOSE:	gets the extent list for a drive when the entire drive
2888  *		is to be soft partitioned
2889  */
2890 static boolean_t
2891 meta_sp_get_extent_list_for_drive(
2892 	mdsetname_t	*mdsetnamep,
2893 	mddrivename_t	*mddrivenamep,
2894 	sp_ext_node_t	**extent_listpp
2895 )
2896 {
2897 	boolean_t		can_use;
2898 	diskaddr_t		free_space;
2899 	md_error_t		mderror;
2900 	mdvtoc_t		proposed_vtoc;
2901 	int			repartition_options;
2902 	int			return_value;
2903 	md_sp_t			test_sp_struct;
2904 
2905 	can_use = B_TRUE;
2906 	*extent_listpp = NULL;
2907 	mderror = mdnullerror;
2908 	test_sp_struct.compnamep = metaslicename(mddrivenamep, MD_SLICE0,
2909 					&mderror);
2910 	if (test_sp_struct.compnamep == NULL) {
2911 		can_use = B_FALSE;
2912 	}
2913 
2914 	if (can_use == B_TRUE) {
2915 		mderror = mdnullerror;
2916 		repartition_options = 0;
2917 		return_value = meta_check_sp(mdsetnamep, &test_sp_struct,
2918 				MDCMD_USE_WHOLE_DISK, &repartition_options,
2919 				&mderror);
2920 		if (return_value != 0) {
2921 			can_use = B_FALSE;
2922 		}
2923 	}
2924 
2925 	if (can_use == B_TRUE) {
2926 		mderror = mdnullerror;
2927 		repartition_options = repartition_options |
2928 			(MD_REPART_FORCE | MD_REPART_DONT_LABEL);
2929 		return_value = meta_repartition_drive(mdsetnamep, mddrivenamep,
2930 				repartition_options, &proposed_vtoc, &mderror);
2931 		if (return_value != 0) {
2932 			can_use = B_FALSE;
2933 		}
2934 	}
2935 
2936 	if (can_use == B_TRUE) {
2937 		free_space = proposed_vtoc.parts[MD_SLICE0].size;
2938 		if (free_space <= (MD_SP_START + MD_SP_WMSIZE)) {
2939 			can_use = B_FALSE;
2940 		}
2941 	}
2942 
2943 	if (can_use == B_TRUE) {
2944 		/*
2945 		 * Create an extent list that starts with
2946 		 * a reserved extent that ends at the start
2947 		 * of the usable space on slice zero of the
2948 		 * proposed VTOC, ends with an extent that
2949 		 * reserves space for a watermark at the end
2950 		 * of slice zero, and contains a single free
2951 		 * extent that occupies the rest of the space
2952 		 * on the slice.
2953 		 *
2954 		 * NOTE:
2955 		 *
2956 		 * Don't use metagetstart() or metagetsize() to
2957 		 * find the usable space.  They query the mdname_t
2958 		 * structure that represents an actual device to
2959 		 * determine the amount of space on the device that
2960 		 * contains metadata and the total amount of space
2961 		 * on the device.  Since this function creates a
2962 		 * proposed extent list that doesn't reflect the
2963 		 * state of an actual device, there's no mdname_t
2964 		 * structure to be queried.
2965 		 *
2966 		 * When a drive is reformatted to prepare for
2967 		 * soft partitioning, all of slice seven is
2968 		 * reserved for metadata, all of slice zero is
2969 		 * available for soft partitioning, and all other
2970 		 * slices on the drive are empty.  The proposed
2971 		 * extent list for the drive therefore contains
2972 		 * only three extents: a reserved extent that ends
2973 		 * at the start of the usable space on slice zero,
2974 		 * a single free extent that occupies all the usable
2975 		 * space on slice zero, and an ending extent that
2976 		 * reserves space for a watermark at the end of
2977 		 * slice zero.
2978 		 */
2979 		meta_sp_list_insert(TEST_SETNAMEP,
2980 			TEST_SOFT_PARTITION_NAMEP,
2981 			extent_listpp,
2982 			NO_OFFSET,
2983 			(sp_ext_length_t)(MD_SP_START),
2984 			EXTTYP_RESERVED,
2985 			NO_SEQUENCE_NUMBER,
2986 			NO_FLAGS,
2987 			meta_sp_cmp_by_offset);
2988 		meta_sp_list_insert(TEST_SETNAMEP,
2989 			TEST_SOFT_PARTITION_NAMEP,
2990 			extent_listpp,
2991 			(sp_ext_offset_t)(free_space - MD_SP_WMSIZE),
2992 			MD_SP_WMSIZE,
2993 			EXTTYP_END,
2994 			NO_SEQUENCE_NUMBER,
2995 			NO_FLAGS,
2996 			meta_sp_cmp_by_offset);
2997 		meta_sp_list_freefill(extent_listpp, free_space);
2998 	}
2999 	return (can_use);
3000 }
3001 
3002 /*
3003  * FUNCTION:	meta_sp_can_create_sps()
3004  * INPUT:	mdsetnamep - a reference to the mdsetname_t structure
3005  *			     for the set containing the device for
3006  *			     which the extents are to be listed
3007  *		mdnamep - a reference to the mdname_t of the device
3008  *			  on which the soft parititions are to be created
3009  *		number_of_sps - the desired number of soft partitions
3010  *		sp_size - the desired soft partition size
3011  * OUTPUT:	boolean_t return value
3012  * RETURNS:	boolean_t - B_TRUE if the soft partitionns can be created,
3013  *			    B_FALSE if not
3014  * PURPOSE:	determines whether a set of soft partitions can be created
3015  *		on a device
3016  */
3017 boolean_t
3018 meta_sp_can_create_sps(
3019 	mdsetname_t	*mdsetnamep,
3020 	mdname_t	*mdnamep,
3021 	int		number_of_sps,
3022 	blkcnt_t	sp_size
3023 )
3024 {
3025 	sp_ext_node_t	*extent_listp;
3026 	boolean_t	succeeded;
3027 	md_error_t	mde;
3028 
3029 	if ((number_of_sps > 0) && (sp_size > 0)) {
3030 		succeeded = meta_sp_get_extent_list(mdsetnamep, mdnamep,
3031 						    &extent_listp, &mde);
3032 	} else {
3033 		succeeded = B_FALSE;
3034 	}
3035 
3036 	/*
3037 	 * We don't really care about an error return from the
3038 	 * alignment call; that will just result in passing zero,
3039 	 * which will be interpreted as no alignment.
3040 	 */
3041 
3042 	if (succeeded == B_TRUE) {
3043 		succeeded = meta_sp_enough_space(number_of_sps,
3044 		    sp_size, &extent_listp,
3045 		    meta_sp_get_default_alignment(mdsetnamep, mdnamep, &mde));
3046 		meta_sp_list_free(&extent_listp);
3047 	}
3048 	return (succeeded);
3049 }
3050 
3051 /*
3052  * FUNCTION:	meta_sp_can_create_sps_on_drive()
3053  * INPUT:	mdsetnamep - a reference to the mdsetname_t structure
3054  *			     for the set containing the drive for
3055  *			     which the extents are to be listed
3056  *		mddrivenamep - a reference to the mddrivename_t of the drive
3057  *			       on which the soft parititions are to be created
3058  *		number_of_sps - the desired number of soft partitions
3059  *		sp_size - the desired soft partition size
3060  * OUTPUT:	boolean_t return value
3061  * RETURNS:	boolean_t - B_TRUE if the soft partitionns can be created,
3062  *			    B_FALSE if not
3063  * PURPOSE:	determines whether a set of soft partitions can be created
3064  *		on a drive if the entire drive is soft partitioned
3065  */
3066 boolean_t
3067 meta_sp_can_create_sps_on_drive(
3068 	mdsetname_t	*mdsetnamep,
3069 	mddrivename_t	*mddrivenamep,
3070 	int		number_of_sps,
3071 	blkcnt_t	sp_size
3072 )
3073 {
3074 	sp_ext_node_t	*extent_listp;
3075 	boolean_t	succeeded;
3076 
3077 	if ((number_of_sps > 0) && (sp_size > 0)) {
3078 		succeeded = meta_sp_get_extent_list_for_drive(mdsetnamep,
3079 							mddrivenamep,
3080 							&extent_listp);
3081 	} else {
3082 		succeeded = B_FALSE;
3083 	}
3084 
3085 	/*
3086 	 * We don't care about alignment on the space call because
3087 	 * we're specifically dealing with a drive, which will have no
3088 	 * inherent alignment.
3089 	 */
3090 
3091 	if (succeeded == B_TRUE) {
3092 		succeeded = meta_sp_enough_space(number_of_sps, sp_size,
3093 		    &extent_listp, SP_UNALIGNED);
3094 		meta_sp_list_free(&extent_listp);
3095 	}
3096 	return (succeeded);
3097 }
3098 
3099 /*
3100  * FUNCTION:	meta_sp_get_free_space()
3101  * INPUT:	mdsetnamep - a reference to the mdsetname_t structure
3102  *			     for the set containing the device for
3103  *			     which the free space is to be returned
3104  *		mdnamep - a reference to the mdname_t of the device
3105  *			  for which the free space is to be returned
3106  * OUTPUT:	blkcnt_t return value
3107  * RETURNS:	blkcnt_t - the number of blocks of free space on the device
3108  * PURPOSE:	returns the number of blocks of free space on a device
3109  */
3110 blkcnt_t
3111 meta_sp_get_free_space(
3112 	mdsetname_t	*mdsetnamep,
3113 	mdname_t	*mdnamep
3114 )
3115 {
3116 	sp_ext_node_t		*extent_listp;
3117 	sp_ext_length_t		free_blocks;
3118 	boolean_t		succeeded;
3119 	md_error_t		mde;
3120 
3121 	extent_listp = NULL;
3122 	free_blocks = 0;
3123 	succeeded = meta_sp_get_extent_list(mdsetnamep, mdnamep,
3124 					    &extent_listp, &mde);
3125 	if (succeeded == B_TRUE) {
3126 		free_blocks = meta_sp_list_size(extent_listp,
3127 		    EXTTYP_FREE, INCLUDE_WM);
3128 		meta_sp_list_free(&extent_listp);
3129 		if (free_blocks > (10 * MD_SP_WMSIZE)) {
3130 			/*
3131 			 * Subtract a safety margin for watermarks when
3132 			 * computing the number of blocks available for
3133 			 * use.  The actual number of watermarks can't
3134 			 * be calculated without knowing the exact numbers
3135 			 * and sizes of both the free extents and the soft
3136 			 * partitions to be created.  The calculation is
3137 			 * highly complex and error-prone even if those
3138 			 * quantities are known.  The approximate value
3139 			 * 10 * MD_SP_WMSIZE is within a few blocks of the
3140 			 * correct value in all practical cases.
3141 			 */
3142 			free_blocks = free_blocks - (10 * MD_SP_WMSIZE);
3143 		} else {
3144 			free_blocks = 0;
3145 		}
3146 	} else {
3147 	    mdclrerror(&mde);
3148 	}
3149 
3150 	return (free_blocks);
3151 }
3152 
3153 /*
3154  * FUNCTION:	meta_sp_get_free_space_on_drive()
3155  * INPUT:	mdsetnamep - a reference to the mdsetname_t structure
3156  *			     for the set containing the drive for
3157  *			     which the free space is to be returned
3158  *		mddrivenamep - a reference to the mddrivename_t of the drive
3159  *			       for which the free space is to be returned
3160  * OUTPUT:	blkcnt_t return value
3161  * RETURNS:	blkcnt_t - the number of blocks of free space on the drive
3162  * PURPOSE:	returns the number of blocks of space usable for soft
3163  *		partitions on an entire drive, if the entire drive is
3164  *		soft partitioned
3165  */
3166 blkcnt_t
3167 meta_sp_get_free_space_on_drive(
3168 	mdsetname_t	*mdsetnamep,
3169 	mddrivename_t	*mddrivenamep
3170 )
3171 {
3172 	sp_ext_node_t		*extent_listp;
3173 	sp_ext_length_t		free_blocks;
3174 	boolean_t		succeeded;
3175 
3176 	extent_listp = NULL;
3177 	free_blocks = 0;
3178 	succeeded = meta_sp_get_extent_list_for_drive(mdsetnamep,
3179 			mddrivenamep, &extent_listp);
3180 	if (succeeded == B_TRUE) {
3181 		free_blocks = meta_sp_list_size(extent_listp,
3182 		    EXTTYP_FREE, INCLUDE_WM);
3183 		meta_sp_list_free(&extent_listp);
3184 		if (free_blocks > (10 * MD_SP_WMSIZE)) {
3185 			/*
3186 			 * Subtract a safety margin for watermarks when
3187 			 * computing the number of blocks available for
3188 			 * use.  The actual number of watermarks can't
3189 			 * be calculated without knowing the exact numbers
3190 			 * and sizes of both the free extents and the soft
3191 			 * partitions to be created.  The calculation is
3192 			 * highly complex and error-prone even if those
3193 			 * quantities are known.  The approximate value
3194 			 * 10 * MD_SP_WMSIZE is within a few blocks of the
3195 			 * correct value in all practical cases.
3196 			 */
3197 			free_blocks = free_blocks - (10 * MD_SP_WMSIZE);
3198 		} else {
3199 			free_blocks = 0;
3200 		}
3201 	}
3202 	return (free_blocks);
3203 }
3204 
3205 /*
3206  * FUNCTION:	meta_sp_get_number_of_possible_sps()
3207  * INPUT:	mdsetnamep - a reference to the mdsetname_t structure
3208  *			     for the set containing the device for
3209  *			     which the number of possible soft partitions
3210  *			     is to be returned
3211  *		mdnamep - a reference to the mdname_t of the device
3212  *			  for which the number of possible soft partitions
3213  *			  is to be returned
3214  * OUTPUT:	int return value
3215  * RETURNS:	int - the number of soft partitions of the desired size
3216  *		      that can be created on the device
3217  * PURPOSE:	returns the number of soft partitions of a given size
3218  *		that can be created on a device
3219  */
3220 int
3221 meta_sp_get_number_of_possible_sps(
3222 	mdsetname_t	*mdsetnamep,
3223 	mdname_t	*mdnamep,
3224 	blkcnt_t	sp_size
3225 )
3226 {
3227 	sp_ext_node_t	*extent_listp;
3228 	int		number_of_possible_sps;
3229 	boolean_t	succeeded;
3230 	md_error_t	mde;
3231 	sp_ext_length_t	alignment;
3232 
3233 	extent_listp = NULL;
3234 	number_of_possible_sps = 0;
3235 	if (sp_size > 0) {
3236 	    if ((succeeded = meta_sp_get_extent_list(mdsetnamep,
3237 		mdnamep, &extent_listp, &mde)) == B_FALSE)
3238 		mdclrerror(&mde);
3239 	} else {
3240 		succeeded = B_FALSE;
3241 	}
3242 
3243 	if (succeeded == B_TRUE) {
3244 		alignment = meta_sp_get_default_alignment(mdsetnamep,
3245 		    mdnamep, &mde);
3246 	}
3247 
3248 	while (succeeded == B_TRUE) {
3249 		/*
3250 		 * Keep allocating space from the extent list
3251 		 * for soft partitions of the desired size until
3252 		 * there's not enough free space left in the list
3253 		 * for another soft partiition of that size.
3254 		 * Add one to the number of possible soft partitions
3255 		 * for each soft partition for which there is
3256 		 * enough free space left.
3257 		 */
3258 		succeeded = meta_sp_enough_space(ONE_SOFT_PARTITION,
3259 		    sp_size, &extent_listp, alignment);
3260 		if (succeeded == B_TRUE) {
3261 			number_of_possible_sps++;
3262 		}
3263 	}
3264 	if (extent_listp != NULL) {
3265 		meta_sp_list_free(&extent_listp);
3266 	}
3267 	return (number_of_possible_sps);
3268 }
3269 
3270 /*
3271  * FUNCTION:	meta_sp_get_number_of_possible_sps_on_drive()
3272  * INPUT:	mdsetnamep - a reference to the mdsetname_t structure
3273  *			     for the set containing the drive for
3274  *			     which the number of possible soft partitions
3275  *			     is to be returned
3276  *		mddrivenamep - a reference to the mddrivename_t of the drive
3277  *			       for which the number of possible soft partitions
3278  *			       is to be returned
3279  *		sp_size - the size in blocks of the proposed soft partitions
3280  * OUTPUT:	int return value
3281  * RETURNS:	int - the number of soft partitions of the desired size
3282  *		      that can be created on the drive
3283  * PURPOSE:	returns the number of soft partitions of a given size
3284  *		that can be created on a drive, if the entire drive is
3285  *		soft partitioned
3286  */
3287 int
3288 meta_sp_get_number_of_possible_sps_on_drive(
3289 	mdsetname_t	*mdsetnamep,
3290 	mddrivename_t	*mddrivenamep,
3291 	blkcnt_t	sp_size
3292 )
3293 {
3294 	sp_ext_node_t	*extent_listp;
3295 	int		number_of_possible_sps;
3296 	boolean_t	succeeded;
3297 
3298 	extent_listp = NULL;
3299 	number_of_possible_sps = 0;
3300 	if (sp_size > 0) {
3301 		succeeded = meta_sp_get_extent_list_for_drive(mdsetnamep,
3302 					mddrivenamep, &extent_listp);
3303 	} else {
3304 		succeeded = B_FALSE;
3305 	}
3306 	while (succeeded == B_TRUE) {
3307 		/*
3308 		 * Keep allocating space from the extent list
3309 		 * for soft partitions of the desired size until
3310 		 * there's not enough free space left in the list
3311 		 * for another soft partition of that size.
3312 		 * Add one to the number of possible soft partitions
3313 		 * for each soft partition for which there is
3314 		 * enough free space left.
3315 		 *
3316 		 * Since it's a drive, not a metadevice, make no
3317 		 * assumptions about alignment.
3318 		 */
3319 		succeeded = meta_sp_enough_space(ONE_SOFT_PARTITION,
3320 		    sp_size, &extent_listp, SP_UNALIGNED);
3321 		if (succeeded == B_TRUE) {
3322 			number_of_possible_sps++;
3323 		}
3324 	}
3325 	if (extent_listp != NULL) {
3326 		meta_sp_list_free(&extent_listp);
3327 	}
3328 	return (number_of_possible_sps);
3329 }
3330 
3331 /*
3332  * FUNCTION:	meta_sp_get_possible_sp_size()
3333  * INPUT:	mdsetnamep - a reference to the mdsetname_t structure
3334  *			     for the set containing the device for
3335  *			     which the possible soft partition size
3336  *			     is to be returned
3337  *		mdnamep - a reference to the mdname_t of the device
3338  *			  for which the possible soft partition size
3339  *			  is to be returned
3340  *		number_of_sps - the desired number of soft partitions
3341  * OUTPUT:	blkcnt_t return value
3342  * RETURNS:	blkcnt_t - the possible soft partition size in blocks
3343  * PURPOSE:	returns the maximum possible size of each of a given number of
3344  *		soft partitions of equal size that can be created on a device
3345  */
3346 blkcnt_t
3347 meta_sp_get_possible_sp_size(
3348 	mdsetname_t	*mdsetnamep,
3349 	mdname_t	*mdnamep,
3350 	int		number_of_sps
3351 )
3352 {
3353 	blkcnt_t	free_blocks;
3354 	blkcnt_t	sp_size;
3355 	boolean_t	succeeded;
3356 
3357 	sp_size = 0;
3358 	if (number_of_sps > 0) {
3359 		free_blocks = meta_sp_get_free_space(mdsetnamep, mdnamep);
3360 		sp_size = free_blocks / number_of_sps;
3361 		succeeded = meta_sp_can_create_sps(mdsetnamep, mdnamep,
3362 						number_of_sps, sp_size);
3363 		while ((succeeded == B_FALSE) && (sp_size > 0)) {
3364 			/*
3365 			 * To compensate for space that may have been
3366 			 * occupied by watermarks, reduce sp_size by a
3367 			 * number of blocks equal to the number of soft
3368 			 * partitions desired, and test again to see
3369 			 * whether the desired number of soft partitions
3370 			 * can be created.
3371 			 */
3372 			sp_size = sp_size - ((blkcnt_t)number_of_sps);
3373 			succeeded = meta_sp_can_create_sps(mdsetnamep, mdnamep,
3374 							number_of_sps, sp_size);
3375 		}
3376 		if (sp_size < 0) {
3377 			sp_size = 0;
3378 		}
3379 	}
3380 	return (sp_size);
3381 }
3382 
3383 /*
3384  * FUNCTION:	meta_sp_get_possible_sp_size_on_drive()
3385  * INPUT:	mdsetnamep - a reference to the mdsetname_t structure
3386  *			     for the set containing the drive for
3387  *			     which the possible soft partition size
3388  *			     is to be returned
3389  *		mddrivenamep - a reference to the mddrivename_t of the drive
3390  *			       for which the possible soft partition size
3391  *			       is to be returned
3392  *		number_of_sps - the desired number of soft partitions
3393  * OUTPUT:	blkcnt_t return value
3394  * RETURNS:	blkcnt_t - the possible soft partition size in blocks
3395  * PURPOSE:	returns the maximum possible size of each of a given number of
3396  *		soft partitions of equal size that can be created on a drive
3397  *              if the entire drive is soft partitioned
3398  */
3399 blkcnt_t
3400 meta_sp_get_possible_sp_size_on_drive(
3401 	mdsetname_t	*mdsetnamep,
3402 	mddrivename_t	*mddrivenamep,
3403 	int		number_of_sps
3404 )
3405 {
3406 	blkcnt_t	free_blocks;
3407 	blkcnt_t	sp_size;
3408 	boolean_t	succeeded;
3409 
3410 	sp_size = 0;
3411 	if (number_of_sps > 0) {
3412 		free_blocks = meta_sp_get_free_space_on_drive(mdsetnamep,
3413 								mddrivenamep);
3414 		sp_size = free_blocks / number_of_sps;
3415 		succeeded = meta_sp_can_create_sps_on_drive(mdsetnamep,
3416 						mddrivenamep,
3417 						number_of_sps, sp_size);
3418 		while ((succeeded == B_FALSE) && (sp_size > 0)) {
3419 			/*
3420 			 * To compensate for space that may have been
3421 			 * occupied by watermarks, reduce sp_size by a
3422 			 * number of blocks equal to the number of soft
3423 			 * partitions desired, and test again to see
3424 			 * whether the desired number of soft partitions
3425 			 * can be created.
3426 			 */
3427 			sp_size = sp_size - ((blkcnt_t)number_of_sps);
3428 			succeeded = meta_sp_can_create_sps_on_drive(mdsetnamep,
3429 							mddrivenamep,
3430 							number_of_sps, sp_size);
3431 		}
3432 		if (sp_size < 0) {
3433 			sp_size = 0;
3434 		}
3435 	}
3436 	return (sp_size);
3437 }
3438 
3439 /*
3440  * **************************************************************************
3441  *                  Unit Structure Manipulation Functions                   *
3442  * **************************************************************************
3443  */
3444 
3445 /*
3446  * FUNCTION:	meta_sp_fillextarray()
3447  * INPUT:	mp	- the unit structure to fill
3448  *		extlist	- the list of extents to fill with
3449  * OUTPUT:	none
3450  * RETURNS:	void
3451  * PURPOSE:	fills in the unit structure extent list with the extents
3452  *		specified by extlist.  Only extents in extlist with the
3453  *		EXTFLG_UPDATE flag are changed in the unit structure,
3454  *		and the index into the unit structure is the sequence
3455  *		number in the extent list.  After all of the nodes have
3456  *		been updated the virtual offsets in the unit structure
3457  *		are updated to reflect the new lengths.
3458  */
3459 static void
3460 meta_sp_fillextarray(
3461 	mp_unit_t	*mp,
3462 	sp_ext_node_t	*extlist
3463 )
3464 {
3465 	int	i;
3466 	sp_ext_node_t	*ext;
3467 	sp_ext_offset_t	curvoff = 0LL;
3468 
3469 	assert(mp != NULL);
3470 
3471 	/* go through the allocation list and fill in our unit structure */
3472 	for (ext = extlist; ext != NULL; ext = ext->ext_next) {
3473 		if ((ext->ext_type == EXTTYP_ALLOC) &&
3474 		    (ext->ext_flags & EXTFLG_UPDATE) != 0) {
3475 			mp->un_ext[ext->ext_seq].un_poff =
3476 			    ext->ext_offset + MD_SP_WMSIZE;
3477 			mp->un_ext[ext->ext_seq].un_len =
3478 			    ext->ext_length - MD_SP_WMSIZE;
3479 		}
3480 	}
3481 
3482 	for (i = 0; i < mp->un_numexts; i++) {
3483 		assert(mp->un_ext[i].un_poff != 0);
3484 		assert(mp->un_ext[i].un_len  != 0);
3485 		mp->un_ext[i].un_voff = curvoff;
3486 		curvoff += mp->un_ext[i].un_len;
3487 	}
3488 }
3489 
3490 /*
3491  * FUNCTION:	meta_sp_createunit()
3492  * INPUT:	np	- the name of the device to create a unit structure for
3493  *		compnp	- the name of the device the soft partition is on
3494  *		extlist	- the extent list to populate the new unit with
3495  *		numexts	- the number of extents in the extent list
3496  *		len	- the total size of the soft partition (sectors)
3497  *		status	- the initial status of the unit structure
3498  * OUTPUT:	ep	- return error pointer
3499  * RETURNS:	mp_unit_t * - the new unit structure.
3500  * PURPOSE:	allocates and fills in a new soft partition unit
3501  *		structure to be passed to the soft partitioning driver
3502  *		for creation.
3503  */
3504 static mp_unit_t *
3505 meta_sp_createunit(
3506 	mdname_t	*np,
3507 	mdname_t	*compnp,
3508 	sp_ext_node_t	*extlist,
3509 	int		numexts,
3510 	sp_ext_length_t	len,
3511 	sp_status_t	status,
3512 	md_error_t	*ep
3513 )
3514 {
3515 	mp_unit_t	*mp;
3516 	uint_t		ms_size;
3517 
3518 	ms_size = (sizeof (*mp) - sizeof (mp->un_ext[0])) +
3519 	    (numexts * sizeof (mp->un_ext[0]));
3520 
3521 	mp = Zalloc(ms_size);
3522 
3523 	/* fill in fields in common unit structure */
3524 	mp->c.un_type = MD_METASP;
3525 	mp->c.un_size = ms_size;
3526 	MD_SID(mp) = meta_getminor(np->dev);
3527 	mp->c.un_total_blocks = len;
3528 	mp->c.un_actual_tb = len;
3529 
3530 	/* set up geometry */
3531 	(void) meta_sp_setgeom(np, compnp, mp, ep);
3532 
3533 	/* if we're building on metadevice we can't parent */
3534 	if (metaismeta(compnp))
3535 		MD_CAPAB(mp) = MD_CANT_PARENT;
3536 	else
3537 		MD_CAPAB(mp) = MD_CAN_PARENT;
3538 
3539 	/* fill soft partition-specific fields */
3540 	mp->un_dev = compnp->dev;
3541 	mp->un_key = compnp->key;
3542 
3543 	/* mdname_t start_blk field is not 64-bit! */
3544 	mp->un_start_blk = (sp_ext_offset_t)compnp->start_blk;
3545 	mp->un_status = status;
3546 	mp->un_numexts = numexts;
3547 	mp->un_length = len;
3548 
3549 	/* fill in the extent array */
3550 	meta_sp_fillextarray(mp, extlist);
3551 
3552 	return (mp);
3553 }
3554 
3555 /*
3556  * FUNCTION:	meta_sp_updateunit()
3557  * INPUT:	np       - name structure for the metadevice being updated
3558  *		old_un	 - the original unit structure that is being updated
3559  *		extlist	 - the extent list to populate the new unit with
3560  *		grow_len - the amount by which the partition is being grown
3561  *		numexts	 - the number of extents in the extent list
3562  *		ep       - return error pointer
3563  * OUTPUT:	none
3564  * RETURNS:	mp_unit_t * - the updated unit structure
3565  * PURPOSE:	allocates and fills in a new soft partition unit structure to
3566  *		be passed to the soft partitioning driver for creation.  The
3567  *		old unit structure is first copied in, and then the updated
3568  *		extents are changed in the new unit structure.  This is
3569  *		typically used when the size of an existing unit is changed.
3570  */
3571 static mp_unit_t *
3572 meta_sp_updateunit(
3573 	mdname_t	*np,
3574 	mp_unit_t	*old_un,
3575 	sp_ext_node_t	*extlist,
3576 	sp_ext_length_t	grow_len,
3577 	int		numexts,
3578 	md_error_t	*ep
3579 )
3580 {
3581 	mp_unit_t	*new_un;
3582 	sp_ext_length_t	new_len;
3583 	uint_t		new_size;
3584 
3585 	assert(old_un != NULL);
3586 	assert(extlist != NULL);
3587 
3588 	/* allocate new unit structure and copy in old unit */
3589 	new_size = (sizeof (*old_un) - sizeof (old_un->un_ext[0])) +
3590 	    ((old_un->un_numexts + numexts) * sizeof (old_un->un_ext[0]));
3591 	new_len = old_un->un_length + grow_len;
3592 	new_un = Zalloc(new_size);
3593 	bcopy(old_un, new_un, old_un->c.un_size);
3594 
3595 	/* update size and geometry information */
3596 	new_un->c.un_size = new_size;
3597 	new_un->un_length = new_len;
3598 	new_un->c.un_total_blocks = new_len;
3599 	new_un->c.un_actual_tb = new_len;
3600 	if (meta_adjust_geom((md_unit_t *)new_un, np,
3601 	    old_un->c.un_wr_reinstruct, old_un->c.un_rd_reinstruct,
3602 	    0, ep) != 0) {
3603 		Free(new_un);
3604 		return (NULL);
3605 	}
3606 
3607 	/* update extent information */
3608 	new_un->un_numexts += numexts;
3609 
3610 	meta_sp_fillextarray(new_un, extlist);
3611 
3612 	return (new_un);
3613 }
3614 
3615 /*
3616  * FUNCTION:	meta_get_sp()
3617  * INPUT:	sp	- the set name for the device to get
3618  *		np	- the name of the device to get
3619  * OUTPUT:	ep	- return error pointer
3620  * RETURNS:	md_sp_t * - the XDR unit structure for the soft partition
3621  * PURPOSE:	interface to the rest of libmeta for fetching a unit structure
3622  *		for the named device.  Just a wrapper for meta_get_sp_common().
3623  */
3624 md_sp_t *
3625 meta_get_sp(
3626 	mdsetname_t	*sp,
3627 	mdname_t	*np,
3628 	md_error_t	*ep
3629 )
3630 {
3631 	return (meta_get_sp_common(sp, np, 0, ep));
3632 }
3633 
3634 /*
3635  * FUNCTION:	meta_get_sp_common()
3636  * INPUT:	sp	- the set name for the device to get
3637  *		np	- the name of the device to get
3638  *		fast	- whether to use the cache or not (NOT IMPLEMENTED!)
3639  * OUTPUT:	ep	- return error pointer
3640  * RETURNS:	md_sp_t * - the XDR unit structure for the soft partition,
3641  *			    NULL if np is not a soft partition
3642  * PURPOSE:	common routine for fetching a soft partition unit structure
3643  */
3644 md_sp_t *
3645 meta_get_sp_common(
3646 	mdsetname_t	*sp,
3647 	mdname_t	*np,
3648 	int		fast,
3649 	md_error_t	*ep
3650 )
3651 {
3652 	mddrivename_t	*dnp = np->drivenamep;
3653 	char		*miscname;
3654 	mp_unit_t	*mp;
3655 	md_sp_t		*msp;
3656 	int		i;
3657 
3658 	/* must have set */
3659 	assert(sp != NULL);
3660 
3661 	/* short circuit */
3662 	if (dnp->unitp != NULL) {
3663 		if (dnp->unitp->type != MD_METASP)
3664 			return (NULL);
3665 		return ((md_sp_t *)dnp->unitp);
3666 	}
3667 	/* get miscname and unit */
3668 	if ((miscname = metagetmiscname(np, ep)) == NULL)
3669 		return (NULL);
3670 
3671 	if (strcmp(miscname, MD_SP) != 0) {
3672 		(void) mdmderror(ep, MDE_NOT_SP, 0, np->cname);
3673 		return (NULL);
3674 	}
3675 
3676 	if ((mp = (mp_unit_t *)meta_get_mdunit(sp, np, ep)) == NULL)
3677 		return (NULL);
3678 
3679 	assert(mp->c.un_type == MD_METASP);
3680 
3681 	/* allocate soft partition */
3682 	msp = Zalloc(sizeof (*msp));
3683 
3684 	/* get the common information */
3685 	msp->common.namep = np;
3686 	msp->common.type = mp->c.un_type;
3687 	msp->common.state = mp->c.un_status;
3688 	msp->common.capabilities = mp->c.un_capabilities;
3689 	msp->common.parent = mp->c.un_parent;
3690 	msp->common.size = mp->c.un_total_blocks;
3691 	msp->common.user_flags = mp->c.un_user_flags;
3692 	msp->common.revision = mp->c.un_revision;
3693 
3694 	/* get soft partition information */
3695 	if ((msp->compnamep = metakeyname(&sp, mp->un_key, fast, ep)) == NULL)
3696 		goto out;
3697 
3698 	/*
3699 	 * Fill in the key and the start block.  Note that the start
3700 	 * block in the unit structure is 64 bits but the name pointer
3701 	 * only supports 32 bits.
3702 	 */
3703 	msp->compnamep->key = mp->un_key;
3704 	msp->compnamep->start_blk = mp->un_start_blk;
3705 
3706 	/* fill in status field */
3707 	msp->status = mp->un_status;
3708 
3709 	/* allocate the extents */
3710 	msp->ext.ext_val = Zalloc(mp->un_numexts * sizeof (*msp->ext.ext_val));
3711 	msp->ext.ext_len = mp->un_numexts;
3712 
3713 	/* do the extents for this soft partition */
3714 	for (i = 0; i < mp->un_numexts; i++) {
3715 		struct mp_ext	*mde = &mp->un_ext[i];
3716 		md_sp_ext_t	*extp = &msp->ext.ext_val[i];
3717 
3718 		extp->voff = mde->un_voff;
3719 		extp->poff = mde->un_poff;
3720 		extp->len = mde->un_len;
3721 	}
3722 
3723 	/* cleanup, return success */
3724 	Free(mp);
3725 	dnp->unitp = (md_common_t *)msp;
3726 	return (msp);
3727 
3728 out:
3729 	/* clean up and return error */
3730 	Free(mp);
3731 	Free(msp);
3732 	return (NULL);
3733 }
3734 
3735 
3736 /*
3737  * FUNCTION:	meta_init_sp()
3738  * INPUT:	spp	- the set name for the new device
3739  *		argc	- the remaining argument count for the metainit cmdline
3740  *		argv	- the remainder of the unparsed command line
3741  *		options	- global options parsed by metainit
3742  * OUTPUT:	ep	- return error pointer
3743  * RETURNS:	int	- -1 failure, 0 success
3744  * PURPOSE:	provides the command line parsing and name management overhead
3745  *		for creating a new soft partition.  Ultimately this calls
3746  *		meta_create_sp() which does the real work of allocating space
3747  *		for the new soft partition.
3748  */
3749 int
3750 meta_init_sp(
3751 	mdsetname_t	**spp,
3752 	int		argc,
3753 	char		*argv[],
3754 	mdcmdopts_t	options,
3755 	md_error_t	*ep
3756 )
3757 {
3758 	char		*compname = NULL;
3759 	mdname_t	*spcompnp = NULL;	/* name of component volume */
3760 	char		*devname = argv[0];	/* unit name */
3761 	mdname_t	*np = NULL;		/* name of soft partition */
3762 	md_sp_t		*msp = NULL;
3763 	int		c;
3764 	int		old_optind;
3765 	sp_ext_length_t	len = 0LL;
3766 	int		rval = -1;
3767 	uint_t		seq;
3768 	int		oflag;
3769 	int		failed;
3770 	mddrivename_t	*dnp = NULL;
3771 	sp_ext_length_t	alignment = 0LL;
3772 	sp_ext_node_t	*extlist = NULL;
3773 
3774 	assert(argc > 0);
3775 
3776 	/* expect sp name, -p, optional -e, compname, and size parameters */
3777 	/* grab soft partition name */
3778 	if ((np = metaname(spp, devname, META_DEVICE, ep)) == NULL)
3779 		goto out;
3780 
3781 	/* see if it exists already */
3782 	if (metagetmiscname(np, ep) != NULL) {
3783 		(void) mdmderror(ep, MDE_UNIT_ALREADY_SETUP,
3784 		    meta_getminor(np->dev), devname);
3785 		goto out;
3786 	} else if (! mdismderror(ep, MDE_UNIT_NOT_SETUP)) {
3787 		goto out;
3788 	} else {
3789 		mdclrerror(ep);
3790 	}
3791 	--argc, ++argv;
3792 
3793 	if (argc == 0)
3794 		goto syntax;
3795 
3796 	/* grab -p */
3797 	if (strcmp(argv[0], "-p") != 0)
3798 		goto syntax;
3799 	--argc, ++argv;
3800 
3801 	if (argc == 0)
3802 		goto syntax;
3803 
3804 	/* see if -e is there */
3805 	if (strcmp(argv[0], "-e") == 0) {
3806 		/* use the whole disk */
3807 		options |= MDCMD_USE_WHOLE_DISK;
3808 		--argc, ++argv;
3809 	}
3810 
3811 	if (argc == 0)
3812 		goto syntax;
3813 
3814 	/* get component name */
3815 	compname = Strdup(argv[0]);
3816 
3817 	if (options & MDCMD_USE_WHOLE_DISK) {
3818 		if ((dnp = metadrivename(spp, compname, ep)) == NULL) {
3819 			goto out;
3820 		}
3821 		if ((spcompnp = metaslicename(dnp, 0, ep)) == NULL) {
3822 			goto out;
3823 		}
3824 	} else if ((spcompnp = metaname(spp, compname, UNKNOWN, ep)) == NULL) {
3825 		goto out;
3826 	}
3827 	assert(*spp != NULL);
3828 
3829 	if (!(options & MDCMD_NOLOCK)) {
3830 		/* grab set lock */
3831 		if (meta_lock(*spp, TRUE, ep))
3832 			goto out;
3833 
3834 		if (meta_check_ownership(*spp, ep) != 0)
3835 			goto out;
3836 	}
3837 
3838 	/* allocate the soft partition */
3839 	msp = Zalloc(sizeof (*msp));
3840 
3841 	/* setup common */
3842 	msp->common.namep = np;
3843 	msp->common.type = MD_METASP;
3844 
3845 	compname = spcompnp->cname;
3846 
3847 	assert(spcompnp->rname != NULL);
3848 	--argc, ++argv;
3849 
3850 	if (argc == 0) {
3851 		goto syntax;
3852 	}
3853 
3854 	if (*argv[0] == '-') {
3855 		/*
3856 		 * parse any other command line options, this includes
3857 		 * the recovery options -o and -b. The special thing
3858 		 * with these options is that the len needs to be
3859 		 * kept track of otherwise when the geometry of the
3860 		 * "device" is built it will create an invalid geometry
3861 		 */
3862 		old_optind = optind = 0;
3863 		opterr = 0;
3864 		oflag = 0;
3865 		seq = 0;
3866 		failed = 0;
3867 		while ((c = getopt(argc, argv, "A:o:b:")) != -1) {
3868 			sp_ext_offset_t	offset;
3869 			sp_ext_length_t	length;
3870 			longlong_t	tmp_size;
3871 
3872 			switch (c) {
3873 			case 'A':	/* data alignment */
3874 				if (meta_sp_parsesizestring(optarg,
3875 					&alignment) == -1) {
3876 					failed = 1;
3877 				}
3878 				break;
3879 			case 'o':	/* offset in the partition */
3880 				if (oflag == 1) {
3881 					failed = 1;
3882 				} else {
3883 					tmp_size = atoll(optarg);
3884 					if (tmp_size <= 0) {
3885 						failed = 1;
3886 					} else {
3887 						oflag = 1;
3888 						options |= MDCMD_DIRECT;
3889 
3890 						offset = tmp_size;
3891 					}
3892 				}
3893 
3894 				break;
3895 			case 'b':	/* number of blocks */
3896 				if (oflag == 0) {
3897 					failed = 1;
3898 				} else {
3899 					tmp_size = atoll(optarg);
3900 					if (tmp_size <= 0) {
3901 						failed = 1;
3902 					} else {
3903 						oflag = 0;
3904 
3905 						length = tmp_size;
3906 
3907 						/* we have a pair of values */
3908 						meta_sp_list_insert(*spp, np,
3909 							&extlist, offset,
3910 							length, EXTTYP_ALLOC,
3911 							seq++, EXTFLG_UPDATE,
3912 							meta_sp_cmp_by_offset);
3913 						len += length;
3914 					}
3915 				}
3916 
3917 				break;
3918 			default:
3919 				argc -= old_optind;
3920 				argv += old_optind;
3921 				goto options;
3922 			}
3923 
3924 			if (failed) {
3925 				argc -= old_optind;
3926 				argv += old_optind;
3927 				goto syntax;
3928 			}
3929 
3930 			old_optind = optind;
3931 		}
3932 		argc -= optind;
3933 		argv += optind;
3934 
3935 		/*
3936 		 * Must have matching pairs of -o and -b flags
3937 		 */
3938 		if (oflag != 0)
3939 			goto syntax;
3940 
3941 		/*
3942 		 * Can't specify both layout (indicated indirectly by
3943 		 * len being set by thye -o/-b cases above) AND
3944 		 * alignment
3945 		 */
3946 		if ((len > 0LL) && (alignment > 0LL))
3947 			goto syntax;
3948 
3949 		/*
3950 		 * sanity check the allocation list
3951 		 */
3952 		if ((extlist != NULL) && meta_sp_list_overlaps(extlist))
3953 			goto syntax;
3954 	}
3955 
3956 	if (len == 0LL) {
3957 		if (argc == 0)
3958 			goto syntax;
3959 		if (meta_sp_parsesize(argv[0], &len) == -1)
3960 			goto syntax;
3961 		--argc, ++argv;
3962 	}
3963 
3964 	msp->ext.ext_val = Zalloc(sizeof (*msp->ext.ext_val));
3965 	msp->ext.ext_val->len = len;
3966 	msp->compnamep = spcompnp;
3967 
3968 	/* we should be at the end */
3969 	if (argc != 0)
3970 		goto syntax;
3971 
3972 	/* create soft partition */
3973 	if (meta_create_sp(*spp, msp, extlist, options, alignment, ep) != 0)
3974 		goto out;
3975 	rval = 0;
3976 
3977 	/* let em know */
3978 	if (options & MDCMD_PRINT) {
3979 		(void) printf(dgettext(TEXT_DOMAIN,
3980 		    "%s: Soft Partition is setup\n"),
3981 		    devname);
3982 		(void) fflush(stdout);
3983 	}
3984 	goto out;
3985 
3986 syntax:
3987 	/* syntax error */
3988 	rval = meta_cook_syntax(ep, MDE_SYNTAX, compname, argc, argv);
3989 	goto out;
3990 
3991 options:
3992 	/* options error */
3993 	rval = meta_cook_syntax(ep, MDE_OPTION, compname, argc, argv);
3994 	goto out;
3995 
3996 out:
3997 	if (msp != NULL) {
3998 		if (msp->ext.ext_val != NULL) {
3999 			Free(msp->ext.ext_val);
4000 		}
4001 		Free(msp);
4002 	}
4003 
4004 	return (rval);
4005 }
4006 
/*
 * FUNCTION:	meta_free_sp()
 * INPUT:	msp	- the soft partition unit to free
 * OUTPUT:	none
 * RETURNS:	void
 * PURPOSE:	provides an interface from the rest of libmeta for freeing a
 *		soft partition unit.  Only the md_sp_t itself is passed to
 *		Free(); nothing the structure points to is released here.
 */
void
meta_free_sp(md_sp_t *msp)
{
	Free(msp);
}
4020 
4021 /*
4022  * FUNCTION:	meta_sp_issp()
4023  * INPUT:	sp	- the set name to check
4024  *		np	- the name to check
4025  * OUTPUT:	ep	- return error pointer
4026  * RETURNS:	int	- 0 means sp,np is a soft partition
4027  *			  1 means sp,np is not a soft partition
4028  * PURPOSE:	determines whether the given device is a soft partition
4029  *		device.  This is called by other metadevice check routines.
4030  */
4031 int
4032 meta_sp_issp(
4033 	mdsetname_t	*sp,
4034 	mdname_t	*np,
4035 	md_error_t	*ep
4036 )
4037 {
4038 	if (meta_get_sp_common(sp, np, 0, ep) == NULL)
4039 		return (1);
4040 
4041 	return (0);
4042 }
4043 
4044 /*
4045  * FUNCTION:	meta_check_sp()
4046  * INPUT:	sp	- the set name to check
4047  *		msp	- the unit structure to check
4048  *		options	- creation options
4049  * OUTPUT:	repart_options - options to be passed to
4050  *				meta_repartition_drive()
4051  *		ep	- return error pointer
4052  * RETURNS:	int	-  0 ok to create on this component
4053  *			  -1 error or not ok to create on this component
4054  * PURPOSE:	Checks to determine whether the rules for creation of
4055  *		soft partitions allow creation of a soft partition on
4056  *		the device described by the mdname_t structure referred
4057  *		to by msp->compnamep.
4058  *
4059  *		NOTE: Does NOT check to determine whether the extents
4060  *		      described in the md_sp_t structure referred to by
4061  *		      msp will fit on the device described by the mdname_t
4062  *		      structure located at msp->compnamep.
4063  */
4064 static int
4065 meta_check_sp(
4066 	mdsetname_t	*sp,
4067 	md_sp_t		*msp,
4068 	mdcmdopts_t	options,
4069 	int		*repart_options,
4070 	md_error_t	*ep
4071 )
4072 {
4073 	md_common_t	*mdp;
4074 	mdname_t	*compnp = msp->compnamep;
4075 	uint_t		slice;
4076 	mddrivename_t	*dnp;
4077 	mdname_t	*slicenp;
4078 	mdvtoc_t	*vtocp;
4079 
4080 	/* make sure it is in the set */
4081 	if (meta_check_inset(sp, compnp, ep) != 0)
4082 		return (-1);
4083 
4084 	if ((options & MDCMD_USE_WHOLE_DISK) != 0) {
4085 		uint_t	rep_slice;
4086 
4087 		/*
4088 		 * check to make sure we can partition this drive.
4089 		 * we cannot continue if any of the following are
4090 		 * true:
4091 		 * The drive is a metadevice.
4092 		 * The drive contains a mounted slice.
4093 		 * The drive contains a slice being swapped to.
4094 		 * The drive contains slices which are part of other
4095 		 * metadevices.
4096 		 * The drive contains a metadb.
4097 		 */
4098 		if (metaismeta(compnp))
4099 			return (mddeverror(ep, MDE_IS_META, compnp->dev,
4100 			    compnp->cname));
4101 
4102 		assert(compnp->drivenamep != NULL);
4103 
4104 		/*
4105 		 * ensure that we have slice 0 since the disk will be
4106 		 * repartitioned in the USE_WHOLE_DISK case.  this check
4107 		 * is redundant unless the user incorrectly specifies a
4108 		 * a fully qualified drive AND slice name (i.e.,
4109 		 * /dev/dsk/cXtXdXsX), which will be incorrectly
4110 		 * recognized as a drive name by the metaname code.
4111 		 */
4112 
4113 		if ((vtocp = metagetvtoc(compnp, FALSE, &slice, ep)) == NULL)
4114 			return (-1);
4115 		if (slice != MD_SLICE0)
4116 			return (mderror(ep, MDE_NOT_DRIVENAME, compnp->cname));
4117 
4118 		dnp = compnp->drivenamep;
4119 		if (meta_replicaslice(dnp, &rep_slice, ep) != 0)
4120 			return (-1);
4121 
4122 		for (slice = 0; slice < vtocp->nparts; slice++) {
4123 
4124 			/* only check if the slice really exists */
4125 			if (vtocp->parts[slice].size == 0)
4126 				continue;
4127 
4128 			slicenp = metaslicename(dnp, slice, ep);
4129 			if (slicenp == NULL)
4130 				return (-1);
4131 
4132 			/* check to ensure that it is not already in use */
4133 			if (meta_check_inuse(sp,
4134 			    slicenp, MDCHK_INUSE, ep) != 0) {
4135 				return (-1);
4136 			}
4137 
4138 			/*
4139 			 * Up to this point, tests are applied to all
4140 			 * slices uniformly.
4141 			 */
4142 
4143 			if (slice == rep_slice) {
4144 				/*
4145 				 * Tests inside the body of this
4146 				 * conditional are applied only to
4147 				 * slice seven.
4148 				 */
4149 				if (meta_check_inmeta(sp, slicenp,
4150 				    options | MDCHK_ALLOW_MDDB |
4151 				    MDCHK_ALLOW_REPSLICE, 0, -1, ep) != 0)
4152 					return (-1);
4153 
4154 				/*
4155 				 * For slice seven, a metadb is NOT an
4156 				 * automatic failure. It merely means
4157 				 * that we're not allowed to muck
4158 				 * about with the partitioning of that
4159 				 * slice.  We indicate this by masking
4160 				 * in the MD_REPART_LEAVE_REP flag.
4161 				 */
4162 				if (metahasmddb(sp, slicenp, ep)) {
4163 					assert(repart_options !=
4164 					    NULL);
4165 					*repart_options |=
4166 					    MD_REPART_LEAVE_REP;
4167 				}
4168 
4169 				/*
4170 				 * Skip the remaining tests for slice
4171 				 * seven
4172 				 */
4173 				continue;
4174 			}
4175 
4176 			/*
4177 			 * Tests below this point will be applied to
4178 			 * all slices EXCEPT for the replica slice.
4179 			 */
4180 
4181 
4182 			/* check if component is in a metadevice */
4183 			if (meta_check_inmeta(sp, slicenp, options, 0,
4184 			    -1, ep) != 0)
4185 				return (-1);
4186 
4187 			/* check to see if component has a metadb */
4188 			if (metahasmddb(sp, slicenp, ep))
4189 				return (mddeverror(ep, MDE_HAS_MDDB,
4190 				    slicenp->dev, slicenp->cname));
4191 		}
4192 		/*
4193 		 * This should be all of the testing necessary when
4194 		 * the MDCMD_USE_WHOLE_DISK flag is set; the rest of
4195 		 * meta_check_sp() is oriented towards component
4196 		 * arguments instead of disks.
4197 		 */
4198 		goto meta_check_sp_ok;
4199 
4200 	}
4201 
4202 	/* check to ensure that it is not already in use */
4203 	if (meta_check_inuse(sp, compnp, MDCHK_INUSE, ep) != 0) {
4204 		return (-1);
4205 	}
4206 
4207 	if (!metaismeta(compnp)) {	/* handle non-metadevices */
4208 
4209 		/*
4210 		 * The component can have one or more soft partitions on it
4211 		 * already, but can't be part of any other type of metadevice,
4212 		 * so if it is used for a metadevice, but the metadevice
4213 		 * isn't a soft partition, return failure.
4214 		 */
4215 
4216 		if (meta_check_inmeta(sp, compnp, options, 0, -1, ep) != 0 &&
4217 		    meta_check_insp(sp, compnp, 0, -1, ep) == 0) {
4218 			return (-1);
4219 		}
4220 	} else {			/* handle metadevices */
4221 		/* get underlying unit & check capabilities */
4222 		if ((mdp = meta_get_unit(sp, compnp, ep)) == NULL)
4223 			return (-1);
4224 
4225 		if ((! (mdp->capabilities & MD_CAN_PARENT)) ||
4226 		    (! (mdp->capabilities & MD_CAN_SP)))
4227 			return (mdmderror(ep, MDE_INVAL_UNIT,
4228 			    meta_getminor(compnp->dev), compnp->cname));
4229 	}
4230 
4231 meta_check_sp_ok:
4232 	mdclrerror(ep);
4233 	return (0);
4234 }
4235 
4236 /*
4237  * FUNCTION:	meta_create_sp()
4238  * INPUT:	sp	- the set name to create in
4239  *		msp	- the unit structure to create
4240  *		oblist	- an optional list of requested extents (-o/-b options)
4241  *		options	- creation options
4242  *		alignment - data alignment
4243  * OUTPUT:	ep	- return error pointer
4244  * RETURNS:	int	-  0 success, -1 error
4245  * PURPOSE:	does most of the work for creating a soft partition.  If
4246  *		metainit -p -e was used, first partition the drive.  Then
4247  *		create an extent list based on the existing soft partitions
4248  *		and assume all space not used by them is free.  Storage for
4249  *		the new soft partition is allocated from the free extents
4250  *		based on the length specified on the command line or the
4251  *		oblist passed in.  The unit structure is then committed and
4252  *		the watermarks are updated.  Finally, the status is changed to
4253  *		Okay and the process is complete.
4254  */
4255 static int
4256 meta_create_sp(
4257 	mdsetname_t	*sp,
4258 	md_sp_t		*msp,
4259 	sp_ext_node_t	*oblist,
4260 	mdcmdopts_t	options,
4261 	sp_ext_length_t	alignment,
4262 	md_error_t	*ep
4263 )
4264 {
4265 	mdname_t	*np = msp->common.namep;
4266 	mdname_t	*compnp = msp->compnamep;
4267 	mp_unit_t	*mp = NULL;
4268 	mdnamelist_t	*keynlp = NULL, *spnlp = NULL;
4269 	md_set_params_t	set_params;
4270 	int		rval = -1;
4271 	diskaddr_t	comp_size;
4272 	diskaddr_t	sp_start;
4273 	sp_ext_node_t	*extlist = NULL;
4274 	int		numexts = 0;	/* number of extents */
4275 	int		count = 0;
4276 	int		committed = 0;
4277 	int		repart_options = MD_REPART_FORCE;
4278 	int		create_flag = MD_CRO_32BIT;
4279 
4280 	md_set_desc	*sd;
4281 	mm_unit_t	*mm;
4282 	md_set_mmown_params_t	*ownpar = NULL;
4283 	int		comp_is_mirror = 0;
4284 
4285 	/* validate soft partition */
4286 	if (meta_check_sp(sp, msp, options, &repart_options, ep) != 0)
4287 		return (-1);
4288 
4289 	if ((options & MDCMD_USE_WHOLE_DISK) != 0) {
4290 		if ((options & MDCMD_DOIT) != 0) {
4291 			if (meta_repartition_drive(sp,
4292 			    compnp->drivenamep,
4293 			    repart_options,
4294 			    NULL, /* Don't return the VTOC */
4295 			    ep) != 0)
4296 
4297 				return (-1);
4298 		} else {
4299 			/*
4300 			 * If -n and -e are both specified, it doesn't make
4301 			 * sense to continue without actually partitioning
4302 			 * the drive.
4303 			 */
4304 			return (0);
4305 		}
4306 	}
4307 
4308 	/* populate the start_blk field of the component name */
4309 	if ((sp_start = meta_sp_get_start(sp, compnp, ep)) ==
4310 	    MD_DISKADDR_ERROR) {
4311 		rval = -1;
4312 		goto out;
4313 	}
4314 
4315 	if (options & MDCMD_DOIT) {
4316 		/* store name in namespace */
4317 		if (add_key_name(sp, compnp, &keynlp, ep) != 0) {
4318 			rval = -1;
4319 			goto out;
4320 		}
4321 	}
4322 
4323 	/*
4324 	 * Get a list of the soft partitions that currently reside on
4325 	 * the component.  We should ALWAYS force reload the cache,
4326 	 * because if this is a single creation, there will not BE a
4327 	 * cached list, and if we're using the md.tab, we must rebuild
4328 	 * the list because it won't contain the previous (if any)
4329 	 * soft partition.
4330 	 */
4331 	count = meta_sp_get_by_component(sp, compnp, &spnlp, 1, ep);
4332 	if (count < 0) {
4333 		/* error occured */
4334 		rval = -1;
4335 		goto out;
4336 	}
4337 
4338 	/*
4339 	 * get the size of the underlying device.  if the size is smaller
4340 	 * than or equal to the watermark size, we know there isn't
4341 	 * enough space.
4342 	 */
4343 	if ((comp_size = metagetsize(compnp, ep)) == MD_DISKADDR_ERROR) {
4344 		rval = -1;
4345 		goto out;
4346 	} else if (comp_size <= MD_SP_WMSIZE) {
4347 		(void) mdmderror(ep, MDE_SP_NOSPACE, 0, compnp->cname);
4348 		rval = -1;
4349 		goto out;
4350 	}
4351 	/*
4352 	 * seed extlist with reserved space at the beginning of the volume and
4353 	 * enough space for the end watermark.  The end watermark always gets
4354 	 * updated, but if the underlying device changes size it may not be
4355 	 * pointed to until the extent before it is updated.  Since the
4356 	 * end of the reserved space is where the first watermark starts,
4357 	 * the reserved extent should never be marked for updating.
4358 	 */
4359 
4360 	meta_sp_list_insert(NULL, NULL, &extlist,
4361 	    0ULL, sp_start, EXTTYP_RESERVED, 0, 0, meta_sp_cmp_by_offset);
4362 	meta_sp_list_insert(NULL, NULL, &extlist,
4363 	    (sp_ext_offset_t)(comp_size - MD_SP_WMSIZE), MD_SP_WMSIZE,
4364 	    EXTTYP_END, 0, EXTFLG_UPDATE, meta_sp_cmp_by_offset);
4365 
4366 	if (meta_sp_extlist_from_namelist(sp, spnlp, &extlist, ep) == -1) {
4367 		rval = -1;
4368 		goto out;
4369 	}
4370 
4371 	metafreenamelist(spnlp);
4372 
4373 	if (getenv(META_SP_DEBUG)) {
4374 		meta_sp_debug("meta_create_sp: list of used extents:\n");
4375 		meta_sp_list_dump(extlist);
4376 	}
4377 
4378 	meta_sp_list_freefill(&extlist, metagetsize(compnp, ep));
4379 
4380 	/* get extent list from -o/-b options or from free space */
4381 	if (options & MDCMD_DIRECT) {
4382 		if (getenv(META_SP_DEBUG)) {
4383 			meta_sp_debug("meta_create_sp: Dumping -o/-b list:\n");
4384 			meta_sp_list_dump(oblist);
4385 		}
4386 
4387 		numexts = meta_sp_alloc_by_list(sp, np, &extlist, oblist);
4388 		if (numexts == -1) {
4389 			(void) mdmderror(ep, MDE_SP_OVERLAP, 0, np->cname);
4390 			rval = -1;
4391 			goto out;
4392 		}
4393 	} else {
4394 		numexts = meta_sp_alloc_by_len(sp, np, &extlist,
4395 		    &msp->ext.ext_val->len, 0LL, (alignment > 0) ? alignment :
4396 		    meta_sp_get_default_alignment(sp, compnp, ep));
4397 		if (numexts == -1) {
4398 			(void) mdmderror(ep, MDE_SP_NOSPACE, 0, np->cname);
4399 			rval = -1;
4400 			goto out;
4401 		}
4402 	}
4403 
4404 	assert(extlist != NULL);
4405 
4406 	/* create soft partition */
4407 	mp = meta_sp_createunit(msp->common.namep, msp->compnamep,
4408 	    extlist, numexts, msp->ext.ext_val->len, MD_SP_CREATEPEND, ep);
4409 
4410 	create_flag = meta_check_devicesize(mp->c.un_total_blocks);
4411 
4412 	/* if we're not doing anything (metainit -n), return success */
4413 	if (! (options & MDCMD_DOIT)) {
4414 		rval = 0;	/* success */
4415 		goto out;
4416 	}
4417 
4418 	(void) memset(&set_params, 0, sizeof (set_params));
4419 
4420 	if (create_flag == MD_CRO_64BIT) {
4421 		mp->c.un_revision |= MD_64BIT_META_DEV;
4422 		set_params.options = MD_CRO_64BIT;
4423 	} else {
4424 		mp->c.un_revision &= ~MD_64BIT_META_DEV;
4425 		set_params.options = MD_CRO_32BIT;
4426 	}
4427 
4428 	if (getenv(META_SP_DEBUG)) {
4429 		meta_sp_debug("meta_create_sp: printing unit structure\n");
4430 		meta_sp_printunit(mp);
4431 	}
4432 
4433 	/*
4434 	 * Check to see if we're trying to create a partition on a mirror. If so
4435 	 * we may have to enforce an ownership change before writing the
4436 	 * watermark out.
4437 	 */
4438 	if (metaismeta(compnp)) {
4439 		char *miscname;
4440 
4441 		miscname = metagetmiscname(compnp, ep);
4442 		if (miscname != NULL)
4443 			comp_is_mirror = (strcmp(miscname, MD_MIRROR) == 0);
4444 		else
4445 			comp_is_mirror = 0;
4446 	} else {
4447 		comp_is_mirror = 0;
4448 	}
4449 
4450 	/*
4451 	 * For a multi-node environment we have to ensure that the master
4452 	 * node owns an underlying mirror before we issue the MD_IOCSET ioctl.
4453 	 * If the master does not own the device we will deadlock as the
4454 	 * implicit write of the watermarks (in sp_ioctl.c) will cause an
4455 	 * ownership change that will block as the MD_IOCSET is still in
4456 	 * progress. To close this window we force an owner change to occur
4457 	 * before issuing the MD_IOCSET. We cannot simply open the device and
4458 	 * write to it as this will only work for the first soft-partition
4459 	 * creation.
4460 	 */
4461 
4462 	if (comp_is_mirror && !metaislocalset(sp)) {
4463 
4464 		if ((sd = metaget_setdesc(sp, ep)) == NULL) {
4465 			rval = -1;
4466 			goto out;
4467 		}
4468 		if (MD_MNSET_DESC(sd) && sd->sd_mn_am_i_master) {
4469 			mm = (mm_unit_t *)meta_get_unit(sp, compnp, ep);
4470 			if (mm == NULL) {
4471 				rval = -1;
4472 				goto out;
4473 			} else {
4474 				rval = meta_mn_change_owner(&ownpar, sp->setno,
4475 					meta_getminor(compnp->dev),
4476 					sd->sd_mn_mynode->nd_nodeid,
4477 					MD_MN_MM_PREVENT_CHANGE |
4478 					    MD_MN_MM_SPAWN_THREAD);
4479 				if (rval == -1)
4480 					goto out;
4481 			}
4482 		}
4483 	}
4484 
4485 	set_params.mnum = MD_SID(mp);
4486 	set_params.size = mp->c.un_size;
4487 	set_params.mdp = (uintptr_t)mp;
4488 	MD_SETDRIVERNAME(&set_params, MD_SP, MD_MIN2SET(set_params.mnum));
4489 
4490 	/* first phase of commit. */
4491 	if (metaioctl(MD_IOCSET, &set_params, &set_params.mde,
4492 	    np->cname) != 0) {
4493 		(void) mdstealerror(ep, &set_params.mde);
4494 		rval = -1;
4495 		goto out;
4496 	}
4497 
4498 	/* we've successfully committed the record */
4499 	committed = 1;
4500 
4501 	/* write watermarks */
4502 	if (meta_sp_update_wm(sp, msp, extlist, ep) < 0) {
4503 		rval = -1;
4504 		goto out;
4505 	}
4506 
4507 	/*
4508 	 * Allow mirror ownership to change. If we don't succeed in this
4509 	 * ioctl it isn't fatal, but the cluster will probably hang fairly
4510 	 * soon as the mirror owner won't change. However, we have
4511 	 * successfully written the watermarks out to the device so the
4512 	 * softpart creation has succeeded
4513 	 */
4514 	if (ownpar) {
4515 		(void) meta_mn_change_owner(&ownpar, sp->setno, ownpar->d.mnum,
4516 		    ownpar->d.owner,
4517 		    MD_MN_MM_ALLOW_CHANGE | MD_MN_MM_SPAWN_THREAD);
4518 	}
4519 
4520 	/* second phase of commit, set status to MD_SP_OK */
4521 	if (meta_sp_setstatus(sp, &(MD_SID(mp)), 1, MD_SP_OK, ep) < 0) {
4522 		rval = -1;
4523 		goto out;
4524 	}
4525 	rval = 0;
4526 out:
4527 	Free(mp);
4528 	if (ownpar)
4529 		Free(ownpar);
4530 
4531 	if (extlist != NULL)
4532 		meta_sp_list_free(&extlist);
4533 
4534 	if (rval != 0 && keynlp != NULL && committed != 1)
4535 		(void) del_key_names(sp, keynlp, NULL);
4536 
4537 	metafreenamelist(keynlp);
4538 
4539 	return (rval);
4540 }
4541 
4542 /*
4543  * **************************************************************************
4544  *                      Reset (metaclear) Functions                         *
4545  * **************************************************************************
4546  */
4547 
4548 /*
4549  * FUNCTION:	meta_sp_reset_common()
4550  * INPUT:	sp	- the set name of the device to reset
4551  *		np	- the name of the device to reset
4552  *		msp	- the unit structure to reset
4553  *		options	- metaclear options
4554  * OUTPUT:	ep	- return error pointer
4555  * RETURNS:	int	-  0 success, -1 error
4556  * PURPOSE:	"resets", or more accurately deletes, the soft partition
4557  *		specified.  First the state is set to "deleting" and then the
4558  *		watermarks are all cleared out.  Once the watermarks have been
4559  *		updated, the unit structure is deleted from the metadb.
4560  */
4561 static int
4562 meta_sp_reset_common(
4563 	mdsetname_t	*sp,
4564 	mdname_t	*np,
4565 	md_sp_t		*msp,
4566 	md_sp_reset_t	reset_params,
4567 	mdcmdopts_t	options,
4568 	md_error_t	*ep
4569 )
4570 {
4571 	char	*miscname;
4572 	int	rval = -1;
4573 	int	is_open = 0;
4574 
4575 	/* make sure that nobody owns us */
4576 	if (MD_HAS_PARENT(msp->common.parent))
4577 		return (mdmderror(ep, MDE_IN_USE, meta_getminor(np->dev),
4578 					np->cname));
4579 
4580 	/* make sure that the soft partition isn't open */
4581 	if ((is_open = meta_isopen(sp, np, ep, options)) < 0)
4582 		return (-1);
4583 	else if (is_open)
4584 		return (mdmderror(ep, MDE_IS_OPEN, meta_getminor(np->dev),
4585 					np->cname));
4586 
4587 	/* get miscname */
4588 	if ((miscname = metagetmiscname(np, ep)) == NULL)
4589 		return (-1);
4590 
4591 	/* fill in reset params */
4592 	MD_SETDRIVERNAME(&reset_params, miscname, sp->setno);
4593 	reset_params.mnum = meta_getminor(np->dev);
4594 	reset_params.force = (options & MDCMD_FORCE) ? 1 : 0;
4595 
4596 	/*
4597 	 * clear soft partition - phase one.
4598 	 * place the soft partition into the "delete pending" state.
4599 	 */
4600 	if (meta_sp_setstatus(sp, &reset_params.mnum, 1, MD_SP_DELPEND, ep) < 0)
4601 		return (-1);
4602 
4603 	/*
4604 	 * Now clear the watermarks.  If the force flag is specified,
4605 	 * ignore any errors writing the watermarks and delete the unit
4606 	 * structure anyway.  An error may leave the on-disk format in a
4607 	 * corrupt state.  If force is not specified and we fail here,
4608 	 * the soft partition will remain in the "delete pending" state.
4609 	 */
4610 	if ((meta_sp_clear_wm(sp, msp, ep) < 0) &&
4611 	    ((options & MDCMD_FORCE) == 0))
4612 		goto out;
4613 
4614 	/*
4615 	 * clear soft partition - phase two.
4616 	 * the driver removes the soft partition from the metadb and
4617 	 * zeros out incore version.
4618 	 */
4619 	if (metaioctl(MD_IOCRESET, &reset_params,
4620 	    &reset_params.mde, np->cname) != 0) {
4621 		(void) mdstealerror(ep, &reset_params.mde);
4622 		goto out;
4623 	}
4624 	rval = 0;	/* success */
4625 
4626 	if (options & MDCMD_PRINT) {
4627 		(void) printf(dgettext(TEXT_DOMAIN,
4628 		    "%s: Soft Partition is cleared\n"),
4629 		    np->cname);
4630 		(void) fflush(stdout);
4631 	}
4632 
4633 	/*
4634 	 * if told to recurse and on a metadevice, then attempt to
4635 	 * clear the subdevices.  Indicate failure if the clear fails.
4636 	 */
4637 	if ((options & MDCMD_RECURSE) &&
4638 	    (metaismeta(msp->compnamep)) &&
4639 	    (meta_reset_by_name(sp, msp->compnamep, options, ep) != 0))
4640 		rval = -1;
4641 
4642 out:
4643 	meta_invalidate_name(np);
4644 	return (rval);
4645 }
4646 
4647 /*
4648  * FUNCTION:	meta_sp_reset()
4649  * INPUT:	sp	- the set name of the device to reset
4650  *		np	- the name of the device to reset
4651  *		options	- metaclear options
4652  * OUTPUT:	ep	- return error pointer
4653  * RETURNS:	int	-  0 success, -1 error
4654  * PURPOSE:	provides the entry point to the rest of libmeta for deleting a
4655  *		soft partition.  If np is NULL, then soft partitions are
4656  *		all deleted at the current level and then recursively deleted.
4657  *		Otherwise, if a name is specified either directly or as a
4658  *		result of a recursive operation, it deletes only that name.
4659  *		Since something sitting under a soft partition may be parented
4660  *		to it, we have to reparent that other device to another soft
4661  *		partition on the same component if we're deleting the one it's
4662  *		parented to.
4663  */
4664 int
4665 meta_sp_reset(
4666 	mdsetname_t	*sp,
4667 	mdname_t	*np,
4668 	mdcmdopts_t	options,
4669 	md_error_t	*ep
4670 )
4671 {
4672 	md_sp_t		*msp;
4673 	int		rval = -1;
4674 	mdnamelist_t	*spnlp = NULL, *nlp = NULL;
4675 	md_sp_reset_t	reset_params;
4676 	int		num_sp;
4677 
4678 	assert(sp != NULL);
4679 
4680 	/* reset/delete all soft paritions */
4681 	if (np == NULL) {
4682 		/*
4683 		 * meta_reset_all sets MDCMD_RECURSE, but this behavior
4684 		 * is incorrect for soft partitions.  We want to clear
4685 		 * all soft partitions at a particular level in the
4686 		 * metadevice stack before moving to the next level.
4687 		 * Thus, we clear MDCMD_RECURSE from the options.
4688 		 */
4689 		options &= ~MDCMD_RECURSE;
4690 
4691 		/* for each soft partition */
4692 		rval = 0;
4693 		if (meta_get_sp_names(sp, &spnlp, 0, ep) < 0)
4694 			rval = -1;
4695 
4696 		for (nlp = spnlp; (nlp != NULL); nlp = nlp->next) {
4697 			np = nlp->namep;
4698 			if ((msp = meta_get_sp(sp, np, ep)) == NULL) {
4699 				rval = -1;
4700 				break;
4701 			}
4702 			/*
4703 			 * meta_reset_all calls us twice to get soft
4704 			 * partitions at the top and bottom of the stack.
4705 			 * thus, if we have a parent, we'll get deleted
4706 			 * on the next call.
4707 			 */
4708 			if (MD_HAS_PARENT(msp->common.parent))
4709 				continue;
4710 			/*
4711 			 * If this is a multi-node set, we send a series
4712 			 * of individual metaclear commands.
4713 			 */
4714 			if (meta_is_mn_set(sp, ep)) {
4715 				if (meta_mn_send_metaclear_command(sp,
4716 				    np->cname, options, 0, ep) != 0) {
4717 					rval = -1;
4718 					break;
4719 				}
4720 			} else {
4721 				if (meta_sp_reset(sp, np, options, ep) != 0) {
4722 					rval = -1;
4723 					break;
4724 				}
4725 			}
4726 		}
4727 		/* cleanup return status */
4728 		metafreenamelist(spnlp);
4729 		return (rval);
4730 	}
4731 
4732 	/* check the name */
4733 	if (metachkmeta(np, ep) != 0)
4734 		return (-1);
4735 
4736 	/* get the unit structure */
4737 	if ((msp = meta_get_sp(sp, np, ep)) == NULL)
4738 		return (-1);
4739 
4740 	/* clear out reset parameters */
4741 	(void) memset(&reset_params, 0, sizeof (reset_params));
4742 
4743 	/* if our child is a metadevice, we need to deparent/reparent it */
4744 	if (metaismeta(msp->compnamep)) {
4745 		/* get sp's on this component */
4746 		if ((num_sp = meta_sp_get_by_component(sp, msp->compnamep,
4747 		    &spnlp, 1, ep)) <= 0)
4748 			/* no sp's on this device.  error! */
4749 			return (-1);
4750 		else if (num_sp == 1)
4751 			/* last sp on this device, so we deparent */
4752 			reset_params.new_parent = MD_NO_PARENT;
4753 		else {
4754 			/* have to reparent this metadevice */
4755 			for (nlp = spnlp; nlp != NULL; nlp = nlp->next) {
4756 				if (meta_getminor(nlp->namep->dev) ==
4757 					meta_getminor(np->dev))
4758 					continue;
4759 				/*
4760 				 * this isn't the softpart we are deleting,
4761 				 * so use this device as the new parent.
4762 				 */
4763 				reset_params.new_parent =
4764 				    meta_getminor(nlp->namep->dev);
4765 				break;
4766 			}
4767 		}
4768 		metafreenamelist(spnlp);
4769 	}
4770 
4771 	if (meta_sp_reset_common(sp, np, msp, reset_params, options, ep) != 0)
4772 		return (-1);
4773 
4774 	return (0);
4775 }
4776 
4777 /*
4778  * FUNCTION:	meta_sp_reset_component()
4779  * INPUT:	sp	- the set name of the device to reset
4780  *		name	- the string name of the device to reset
4781  *		options	- metaclear options
4782  * OUTPUT:	ep	- return error pointer
4783  * RETURNS:	int	-  0 success, -1 error
4784  * PURPOSE:	provides the ability to delete all soft partitions on a
4785  *		specified device (metaclear -p).  It first gets all of the
4786  *		soft partitions on the component and then deletes each one
4787  *		individually.
4788  */
4789 int
4790 meta_sp_reset_component(
4791 	mdsetname_t	*sp,
4792 	char		*name,
4793 	mdcmdopts_t	options,
4794 	md_error_t	*ep
4795 )
4796 {
4797 	mdname_t	*compnp, *np;
4798 	mdnamelist_t	*spnlp = NULL;
4799 	mdnamelist_t	*nlp = NULL;
4800 	md_sp_t		*msp;
4801 	int		count;
4802 	md_sp_reset_t	reset_params;
4803 
4804 	if ((compnp = metaname(&sp, name, UNKNOWN, ep)) == NULL)
4805 		return (-1);
4806 
4807 	/* If we're starting out with no soft partitions, it's an error */
4808 	count = meta_sp_get_by_component(sp, compnp, &spnlp, 1, ep);
4809 	if (count == 0)
4810 		return (mdmderror(ep, MDE_SP_NOSP, 0, compnp->cname));
4811 	else if (count < 0)
4812 		return (-1);
4813 
4814 	/*
4815 	 * clear all soft partitions on this component.
4816 	 * NOTE: we reparent underlying metadevices as we go so that
4817 	 * things stay sane.  Also, if we encounter an error, we stop
4818 	 * and go no further in case recovery might be needed.
4819 	 */
4820 	for (nlp = spnlp; nlp != NULL; nlp = nlp->next) {
4821 		/* clear out reset parameters */
4822 		(void) memset(&reset_params, 0, sizeof (reset_params));
4823 
4824 		/* check the name */
4825 		np = nlp->namep;
4826 
4827 		if (metachkmeta(np, ep) != 0) {
4828 			metafreenamelist(spnlp);
4829 			return (-1);
4830 		}
4831 
4832 		/* get the unit structure */
4833 		if ((msp = meta_get_sp(sp, np, ep)) == NULL) {
4834 			metafreenamelist(spnlp);
4835 			return (-1);
4836 		}
4837 
4838 		/* have to deparent/reparent metadevices */
4839 		if (metaismeta(compnp)) {
4840 			if (nlp->next == NULL)
4841 				reset_params.new_parent = MD_NO_PARENT;
4842 			else
4843 				reset_params.new_parent =
4844 				    meta_getminor(spnlp->next->namep->dev);
4845 		}
4846 
4847 		/* clear soft partition */
4848 		if (meta_sp_reset_common(sp, np, msp, reset_params,
4849 		    options, ep) < 0) {
4850 			metafreenamelist(spnlp);
4851 			return (-1);
4852 		}
4853 	}
4854 	metafreenamelist(spnlp);
4855 	return (0);
4856 }
4857 
4858 /*
4859  * **************************************************************************
4860  *                      Grow (metattach) Functions                          *
4861  * **************************************************************************
4862  */
4863 
4864 /*
4865  * FUNCTION:	meta_sp_attach()
4866  * INPUT:	sp	- the set name of the device to attach to
4867  *		np	- the name of the device to attach to
4868  *		addsize	- the unparsed string holding the amount of space to add
4869  *		options	- metattach options
4870  *		alignment - data alignment
4871  * OUTPUT:	ep	- return error pointer
4872  * RETURNS:	int	-  0 success, -1 error
4873  * PURPOSE:	grows a soft partition by reading in the existing unit
4874  *		structure and setting its state to Growing, allocating more
4875  *		space (similar to meta_create_sp()), updating the watermarks,
4876  *		and then writing out the new unit structure in the Okay state.
4877  */
4878 int
4879 meta_sp_attach(
4880 	mdsetname_t	*sp,
4881 	mdname_t	*np,
4882 	char		*addsize,
4883 	mdcmdopts_t	options,
4884 	sp_ext_length_t	alignment,
4885 	md_error_t	*ep
4886 )
4887 {
4888 	md_grow_params_t	grow_params;
4889 	sp_ext_length_t		grow_len;	/* amount to grow */
4890 	mp_unit_t		*mp, *new_un;
4891 	mdname_t		*compnp = NULL;
4892 
4893 	sp_ext_node_t		*extlist = NULL;
4894 	int			numexts;
4895 	mdnamelist_t		*spnlp = NULL;
4896 	int			count;
4897 	md_sp_t			*msp;
4898 	daddr_t			start_block;
4899 
4900 	/* should have the same set */
4901 	assert(sp != NULL);
4902 	assert(sp->setno == MD_MIN2SET(meta_getminor(np->dev)));
4903 
4904 	/* check name */
4905 	if (metachkmeta(np, ep) != 0)
4906 		return (-1);
4907 
4908 	if (meta_sp_parsesize(addsize, &grow_len) == -1) {
4909 		return (mdmderror(ep, MDE_SP_BAD_LENGTH, 0, np->cname));
4910 	}
4911 
4912 	if ((mp = (mp_unit_t *)meta_get_mdunit(sp, np, ep)) == NULL)
4913 		return (-1);
4914 
4915 	/* make sure we don't have a parent */
4916 	if (MD_HAS_PARENT(mp->c.un_parent)) {
4917 		Free(mp);
4918 		return (mdmderror(ep, MDE_INVAL_UNIT, 0, np->cname));
4919 	}
4920 
4921 	if (getenv(META_SP_DEBUG)) {
4922 		meta_sp_debug("meta_sp_attach: Unit structure before new "
4923 		    "space:\n");
4924 		meta_sp_printunit(mp);
4925 	}
4926 
4927 	/*
4928 	 * NOTE: the fast option to metakeyname is 0 as opposed to 1
4929 	 * If this was not the case we would suffer the following
4930 	 * assertion failure:
4931 	 * Assertion failed: type1 != MDT_FAST_META && type1 != MDT_FAST_COMP
4932 	 * file meta_check.x, line 315
4933 	 * I guess this is because we have not "seen" this drive before
4934 	 * and hence hit the failure - this is of course the attach routine
4935 	 */
4936 	if ((compnp = metakeyname(&sp, mp->un_key, 0, ep)) == NULL) {
4937 		Free(mp);
4938 		return (-1);
4939 	}
4940 
4941 	/* metakeyname does not fill in the key. */
4942 	compnp->key = mp->un_key;
4943 
4944 	/* work out the space on the component that we are dealing with */
4945 	count = meta_sp_get_by_component(sp, compnp, &spnlp, 0, ep);
4946 
4947 	/*
4948 	 * see if the component has been soft partitioned yet, or if an
4949 	 * error occurred.
4950 	 */
4951 	if (count == 0) {
4952 		Free(mp);
4953 		return (mdmderror(ep, MDE_NOT_SP, 0, np->cname));
4954 	} else if (count < 0) {
4955 		Free(mp);
4956 		return (-1);
4957 	}
4958 
4959 	/*
4960 	 * seed extlist with reserved space at the beginning of the volume and
4961 	 * enough space for the end watermark.  The end watermark always gets
4962 	 * updated, but if the underlying device changes size it may not be
4963 	 * pointed to until the extent before it is updated.  Since the
4964 	 * end of the reserved space is where the first watermark starts,
4965 	 * the reserved extent should never be marked for updating.
4966 	 */
4967 	if ((start_block = meta_sp_get_start(sp, compnp, ep)) ==
4968 	    MD_DISKADDR_ERROR) {
4969 		Free(mp);
4970 		return (-1);
4971 	}
4972 
4973 	meta_sp_list_insert(NULL, NULL, &extlist, 0ULL, start_block,
4974 	    EXTTYP_RESERVED, 0, 0, meta_sp_cmp_by_offset);
4975 	meta_sp_list_insert(NULL, NULL, &extlist,
4976 	    metagetsize(compnp, ep) - MD_SP_WMSIZE, MD_SP_WMSIZE,
4977 	    EXTTYP_END, 0, EXTFLG_UPDATE, meta_sp_cmp_by_offset);
4978 
4979 	if (meta_sp_extlist_from_namelist(sp, spnlp, &extlist, ep) == -1) {
4980 		Free(mp);
4981 		return (-1);
4982 	}
4983 
4984 	metafreenamelist(spnlp);
4985 
4986 	if (getenv(META_SP_DEBUG)) {
4987 		meta_sp_debug("meta_sp_attach: list of used extents:\n");
4988 		meta_sp_list_dump(extlist);
4989 	}
4990 
4991 	meta_sp_list_freefill(&extlist, metagetsize(compnp, ep));
4992 
4993 	assert(mp->un_numexts >= 1);
4994 	numexts = meta_sp_alloc_by_len(sp, np, &extlist, &grow_len,
4995 	    mp->un_ext[mp->un_numexts - 1].un_poff,
4996 	    (alignment > 0) ? alignment :
4997 	    meta_sp_get_default_alignment(sp, compnp, ep));
4998 
4999 	if (numexts == -1) {
5000 		Free(mp);
5001 		return (mdmderror(ep, MDE_SP_NOSPACE, 0, np->cname));
5002 	}
5003 
5004 	/* allocate new unit structure and copy in old unit */
5005 	if ((new_un = meta_sp_updateunit(np, mp, extlist,
5006 	    grow_len, numexts, ep)) == NULL) {
5007 		Free(mp);
5008 		return (-1);
5009 	}
5010 	Free(mp);
5011 
5012 	/* If running in dryrun mode (-n option), we're done here */
5013 	if ((options & MDCMD_DOIT) == 0) {
5014 		if (options & MDCMD_PRINT) {
5015 			(void) printf(dgettext(TEXT_DOMAIN,
5016 			    "%s: Soft Partition would grow\n"),
5017 			    np->cname);
5018 			(void) fflush(stdout);
5019 		}
5020 		return (0);
5021 	}
5022 
5023 	if (getenv(META_SP_DEBUG)) {
5024 		meta_sp_debug("meta_sp_attach: updated unit structure:\n");
5025 		meta_sp_printunit(new_un);
5026 	}
5027 
5028 	assert(new_un != NULL);
5029 
5030 	(void) memset(&grow_params, 0, sizeof (grow_params));
5031 	if (new_un->c.un_total_blocks > MD_MAX_BLKS_FOR_SMALL_DEVS) {
5032 		grow_params.options = MD_CRO_64BIT;
5033 		new_un->c.un_revision |= MD_64BIT_META_DEV;
5034 	} else {
5035 		grow_params.options = MD_CRO_32BIT;
5036 		new_un->c.un_revision &= ~MD_64BIT_META_DEV;
5037 	}
5038 	grow_params.mnum = MD_SID(new_un);
5039 	grow_params.size = new_un->c.un_size;
5040 	grow_params.mdp = (uintptr_t)new_un;
5041 	MD_SETDRIVERNAME(&grow_params, MD_SP, MD_MIN2SET(grow_params.mnum));
5042 
5043 	if (metaioctl(MD_IOCGROW, &grow_params, &grow_params.mde,
5044 	    np->cname) != 0) {
5045 		(void) mdstealerror(ep, &grow_params.mde);
5046 		return (-1);
5047 	}
5048 
5049 	/* update all watermarks */
5050 
5051 	if ((msp = meta_get_sp(sp, np, ep)) == NULL)
5052 		return (-1);
5053 	if (meta_sp_update_wm(sp, msp, extlist, ep) < 0)
5054 		return (-1);
5055 
5056 
5057 	/* second phase of commit, set status to MD_SP_OK */
5058 	if (meta_sp_setstatus(sp, &(MD_SID(new_un)), 1, MD_SP_OK, ep) < 0)
5059 		return (-1);
5060 
5061 	meta_invalidate_name(np);
5062 
5063 	if (options & MDCMD_PRINT) {
5064 		(void) printf(dgettext(TEXT_DOMAIN,
5065 		    "%s: Soft Partition has been grown\n"),
5066 		    np->cname);
5067 		(void) fflush(stdout);
5068 	}
5069 
5070 	return (0);
5071 }
5072 
5073 /*
5074  * **************************************************************************
5075  *                    Recovery (metarecover) Functions                      *
5076  * **************************************************************************
5077  */
5078 
5079 /*
5080  * FUNCTION:	meta_recover_sp()
5081  * INPUT:	sp	- the name of the set we are recovering on
5082  *		compnp	- name pointer for device we are recovering on
5083  *		argc	- argument count
5084  *		argv	- left over arguments not parsed by metarecover command
5085  *		options	- metarecover options
5086  * OUTPUT:	ep	- return error pointer
5087  * RETURNS:	int	- 0 - success, -1 - error
5088  * PURPOSE:	parse soft partitioning-specific metarecover options and
5089  *		dispatch to the appropriate function to handle recovery.
5090  */
5091 int
5092 meta_recover_sp(
5093 	mdsetname_t	*sp,
5094 	mdname_t	*compnp,
5095 	int		argc,
5096 	char		*argv[],
5097 	mdcmdopts_t	options,
5098 	md_error_t	*ep
5099 )
5100 {
5101 	md_set_desc	*sd;
5102 
5103 	if (argc > 1) {
5104 		(void) meta_cook_syntax(ep, MDE_SYNTAX, compnp->cname,
5105 		    argc, argv);
5106 		return (-1);
5107 	}
5108 
5109 	/*
5110 	 * For a MN set, this operation must be performed on the master
5111 	 * as it is responsible for maintaining the watermarks
5112 	 */
5113 	if (!metaislocalset(sp)) {
5114 		if ((sd = metaget_setdesc(sp, ep)) == NULL)
5115 			return (-1);
5116 		if (MD_MNSET_DESC(sd) && !sd->sd_mn_am_i_master) {
5117 			(void) mddserror(ep, MDE_DS_MASTER_ONLY, sp->setno,
5118 			    sd->sd_mn_master_nodenm, NULL, NULL);
5119 			return (-1);
5120 		}
5121 	}
5122 	if (argc == 0) {
5123 		/*
5124 		 * if no additional arguments are passed, metarecover should
5125 		 * validate both on-disk and metadb structures as well as
5126 		 * checking that both are consistent with each other
5127 		 */
5128 		if (meta_sp_validate_wm(sp, compnp, options, ep) < 0)
5129 			return (-1);
5130 		if (meta_sp_validate_unit(sp, compnp, options, ep) < 0)
5131 			return (-1);
5132 		if (meta_sp_validate_wm_and_unit(sp, compnp, options, ep) < 0)
5133 			return (-1);
5134 	} else if (strcmp(argv[0], "-d") == 0) {
5135 		/*
5136 		 * Ensure that there is no existing valid record for this
5137 		 * soft-partition. If there is we have nothing to do.
5138 		 */
5139 		if (meta_sp_validate_unit(sp, compnp, options, ep) == 0)
5140 			return (-1);
5141 		/* validate and recover from on-disk structures */
5142 		if (meta_sp_validate_wm(sp, compnp, options, ep) < 0)
5143 			return (-1);
5144 		if (meta_sp_recover_from_wm(sp, compnp, options, ep) < 0)
5145 			return (-1);
5146 	} else if (strcmp(argv[0], "-m") == 0) {
5147 		/* validate and recover from metadb structures */
5148 		if (meta_sp_validate_unit(sp, compnp, options, ep) < 0)
5149 			return (-1);
5150 		if (meta_sp_recover_from_unit(sp, compnp, options, ep) < 0)
5151 			return (-1);
5152 	} else {
5153 		/* syntax error */
5154 		(void) meta_cook_syntax(ep, MDE_SYNTAX, compnp->cname,
5155 		    argc, argv);
5156 		return (-1);
5157 	}
5158 
5159 	return (0);
5160 }
5161 
5162 /*
5163  * FUNCTION:	meta_sp_display_exthdr()
5164  * INPUT:	none
5165  * OUTPUT:	none
5166  * RETURNS:	void
5167  * PURPOSE:	print header line for sp_ext_node_t information.  to be used
5168  *		in conjunction with meta_sp_display_ext().
5169  */
5170 static void
5171 meta_sp_display_exthdr(void)
5172 {
5173 	(void) printf("%20s %5s %7s %20s %20s\n",
5174 	    dgettext(TEXT_DOMAIN, "Name"),
5175 	    dgettext(TEXT_DOMAIN, "Seq#"),
5176 	    dgettext(TEXT_DOMAIN, "Type"),
5177 	    dgettext(TEXT_DOMAIN, "Offset"),
5178 	    dgettext(TEXT_DOMAIN, "Length"));
5179 }
5180 
5181 
5182 /*
5183  * FUNCTION:	meta_sp_display_ext()
5184  * INPUT:	ext	- extent to display
5185  * OUTPUT:	none
5186  * RETURNS:	void
5187  * PURPOSE:	print selected fields from sp_ext_node_t.
5188  */
5189 static void
5190 meta_sp_display_ext(sp_ext_node_t *ext)
5191 {
5192 	/* print extent information */
5193 	if (ext->ext_namep != NULL)
5194 		(void) printf("%20s ", ext->ext_namep->cname);
5195 	else
5196 		(void) printf("%20s ", "NONE");
5197 
5198 	(void) printf("%5u ", ext->ext_seq);
5199 
5200 	switch (ext->ext_type) {
5201 	case EXTTYP_ALLOC:
5202 		(void) printf("%7s ", "ALLOC");
5203 		break;
5204 	case EXTTYP_FREE:
5205 		(void) printf("%7s ", "FREE");
5206 		break;
5207 	case EXTTYP_RESERVED:
5208 		(void) printf("%7s ", "RESV");
5209 		break;
5210 	case EXTTYP_END:
5211 		(void) printf("%7s ", "END");
5212 		break;
5213 	default:
5214 		(void) printf("%7s ", "INVLD");
5215 		break;
5216 	}
5217 
5218 	(void) printf("%20llu %20llu\n", ext->ext_offset, ext->ext_length);
5219 }
5220 
5221 
5222 /*
5223  * FUNCTION:	meta_sp_checkseq()
5224  * INPUT:	extlist	- list of extents to be checked
5225  * OUTPUT:	none
5226  * RETURNS:	int	- 0 - success, -1 - error
5227  * PURPOSE:	check soft partition sequence numbers.  this function assumes
5228  *		that a list of extents representing 1 or more soft partitions
5229  *		is passed in sorted in sequence number order.  within a
5230  *		single soft partition, there may not be any missing or
5231  *		duplicate sequence numbers.
5232  */
5233 static int
5234 meta_sp_checkseq(sp_ext_node_t *extlist)
5235 {
5236 	sp_ext_node_t *ext;
5237 
5238 	assert(extlist != NULL);
5239 
5240 	for (ext = extlist;
5241 	    ext->ext_next != NULL && ext->ext_next->ext_type == EXTTYP_ALLOC;
5242 	    ext = ext->ext_next) {
5243 		if (ext->ext_next->ext_namep != NULL &&
5244 		    strcmp(ext->ext_next->ext_namep->cname,
5245 			ext->ext_namep->cname) != 0)
5246 				continue;
5247 
5248 		if (ext->ext_next->ext_seq != ext->ext_seq + 1) {
5249 			(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5250 			    "%s: sequence numbers are "
5251 			    "incorrect: %d should be %d\n"),
5252 			    ext->ext_next->ext_namep->cname,
5253 			    ext->ext_next->ext_seq, ext->ext_seq + 1);
5254 			return (-1);
5255 		}
5256 	}
5257 	return (0);
5258 }
5259 
5260 
5261 /*
5262  * FUNCTION:	meta_sp_resolve_name_conflict()
5263  * INPUT:	sp	- name of set we're are recovering in.
5264  *		old_np	- name pointer of soft partition we found on disk.
5265  * OUTPUT:	new_np	- name pointer for new soft partition name.
5266  *		ep	- error pointer returned.
5267  * RETURNS:	int	- 0 - name not replace, 1 - name replaced, -1 - error
5268  * PURPOSE:	Check to see if the name of one of the soft partitions we found
5269  *		on disk already exists in the metadb.  If so, prompt for a new
5270  *		name.  In addition, we keep a static array of names that
5271  *		will be recovered from this device since these names don't
5272  *		exist in the configuration at this point but cannot be
5273  *		recovered more than once.
5274  */
5275 static int
5276 meta_sp_resolve_name_conflict(
5277 	mdsetname_t	*sp,
5278 	mdname_t	*old_np,
5279 	mdname_t	**new_np,
5280 	md_error_t	*ep
5281 )
5282 {
5283 	char		yesno[255];
5284 	char		*yes;
5285 	char		newname[MD_SP_MAX_DEVNAME_PLUS_1];
5286 	int		nunits;
5287 	static int	*used_names = NULL;
5288 
5289 	assert(old_np != NULL);
5290 
5291 	if (used_names == NULL) {
5292 		if ((nunits = meta_get_nunits(ep)) < 0)
5293 			return (-1);
5294 		used_names = Zalloc(nunits * sizeof (int));
5295 	}
5296 
5297 	/* see if it exists already */
5298 	if (used_names[MD_MIN2UNIT(meta_getminor(old_np->dev))] == 0 &&
5299 	    metagetmiscname(old_np, ep) == NULL) {
5300 		if (! mdismderror(ep, MDE_UNIT_NOT_SETUP))
5301 			return (-1);
5302 		else {
5303 			used_names[MD_MIN2UNIT(meta_getminor(old_np->dev))] = 1;
5304 			mdclrerror(ep);
5305 			return (0);
5306 		}
5307 	}
5308 
5309 	/* name exists, ask the user for a new one */
5310 	(void) printf(dgettext(TEXT_DOMAIN,
5311 	    "WARNING: A soft partition named %s was found in the extent\n"
5312 	    "headers, but this name already exists in the metadb "
5313 	    "configuration.\n"
5314 	    "In order to continue recovery you must supply\n"
5315 	    "a new name for this soft partition.\n"), old_np->cname);
5316 	(void) printf(dgettext(TEXT_DOMAIN,
5317 	    "Would you like to continue and supply a new name? (yes/no) "));
5318 
5319 	(void) fflush(stdout);
5320 	if ((fgets(yesno, sizeof (yesno), stdin) == NULL) ||
5321 	    (strlen(yesno) == 1))
5322 		(void) snprintf(yesno, sizeof (yesno), "%s\n",
5323 		    dgettext(TEXT_DOMAIN, "no"));
5324 	yes = dgettext(TEXT_DOMAIN, "yes");
5325 	if (strncasecmp(yesno, yes, strlen(yesno) - 1) != 0) {
5326 		return (-1);
5327 	}
5328 
5329 	(void) fflush(stdin);
5330 
5331 	/* get the new name */
5332 	for (;;) {
5333 		(void) printf(dgettext(TEXT_DOMAIN, "Please enter a new name "
5334 		    "for this soft partition (dXXXX) "));
5335 		(void) fflush(stdout);
5336 		if (fgets(newname, MD_SP_MAX_DEVNAME_PLUS_1, stdin) == NULL)
5337 			(void) strcpy(newname, "");
5338 
5339 		/* remove newline character */
5340 		if (newname[strlen(newname) - 1] == '\n')
5341 			newname[strlen(newname) - 1] = '\0';
5342 
5343 		if (!(is_metaname(newname)) ||
5344 		    (meta_init_make_device(&sp, newname, ep) <= 0)) {
5345 			(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5346 			    "Invalid metadevice name\n"));
5347 			(void) fflush(stderr);
5348 			continue;
5349 		}
5350 
5351 		if ((*new_np = metaname(&sp, newname,
5352 		    META_DEVICE, ep)) == NULL) {
5353 			(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5354 			    "Invalid metadevice name\n"));
5355 			(void) fflush(stderr);
5356 			continue;
5357 		}
5358 
5359 		assert(MD_MIN2UNIT(meta_getminor((*new_np)->dev)) < nunits);
5360 		/* make sure the name isn't already being used */
5361 		if (used_names[MD_MIN2UNIT(meta_getminor((*new_np)->dev))] ||
5362 		    metagetmiscname(*new_np, ep) != NULL) {
5363 			(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5364 			    "That name already exists\n"));
5365 			continue;
5366 		} else if (! mdismderror(ep, MDE_UNIT_NOT_SETUP))
5367 			return (-1);
5368 
5369 		break;
5370 	}
5371 
5372 	/* got a new name, place in used array and return */
5373 	used_names[MD_MIN2UNIT(meta_getminor((*new_np)->dev))] = 1;
5374 	mdclrerror(ep);
5375 	return (1);
5376 }
5377 
5378 /*
5379  * FUNCTION:	meta_sp_validate_wm()
5380  * INPUT:	sp	- set name we are recovering in
5381  *		compnp	- name pointer for device we are recovering from
5382  *		options	- metarecover options
5383  * OUTPUT:	ep	- error pointer returned
5384  * RETURNS:	int	- 0 - success, -1 - error
5385  * PURPOSE:	validate and display watermark configuration.  walk the
5386  *		on-disk watermark structures and validate the information
5387  *		found within.  since a watermark configuration is
5388  *		"self-defining", the act of traversing the watermarks
5389  *		is part of the validation process.
5390  */
5391 static int
5392 meta_sp_validate_wm(
5393 	mdsetname_t	*sp,
5394 	mdname_t	*compnp,
5395 	mdcmdopts_t	options,
5396 	md_error_t	*ep
5397 )
5398 {
5399 	sp_ext_node_t	*extlist = NULL;
5400 	sp_ext_node_t	*ext;
5401 	int		num_sps = 0;
5402 	int		rval;
5403 
5404 	if ((options & MDCMD_VERBOSE) != 0)
5405 		(void) printf(dgettext(TEXT_DOMAIN,
5406 		    "Verifying on-disk structures on %s.\n"),
5407 		    compnp->cname);
5408 
5409 	/*
5410 	 * for each watermark, build an ext_node, place on list.
5411 	 */
5412 	rval = meta_sp_extlist_from_wm(sp, compnp, &extlist,
5413 	    meta_sp_cmp_by_nameseq, ep);
5414 
5415 	if ((options & MDCMD_VERBOSE) != 0) {
5416 		/* print out what we found */
5417 		if (extlist == NULL)
5418 			(void) printf(dgettext(TEXT_DOMAIN,
5419 			    "No extent headers found on %s.\n"),
5420 			    compnp->cname);
5421 		else {
5422 			(void) printf(dgettext(TEXT_DOMAIN,
5423 			    "The following extent headers were found on %s.\n"),
5424 			    compnp->cname);
5425 			meta_sp_display_exthdr();
5426 		}
5427 		for (ext = extlist; ext != NULL; ext = ext->ext_next)
5428 			meta_sp_display_ext(ext);
5429 	}
5430 
5431 	if (rval < 0) {
5432 		(void) printf(dgettext(TEXT_DOMAIN,
5433 		    "%s: On-disk structures invalid or "
5434 		    "no soft partitions found.\n"),
5435 		    compnp->cname);
5436 		return (-1);
5437 	}
5438 
5439 	assert(extlist != NULL);
5440 
5441 	/* count number of soft partitions */
5442 	for (ext = extlist;
5443 	    ext != NULL && ext->ext_type == EXTTYP_ALLOC;
5444 	    ext = ext->ext_next) {
5445 		if (ext->ext_next != NULL &&
5446 		    ext->ext_next->ext_namep != NULL &&
5447 		    strcmp(ext->ext_next->ext_namep->cname,
5448 			ext->ext_namep->cname) == 0)
5449 				continue;
5450 		num_sps++;
5451 	}
5452 
5453 	if ((options & MDCMD_VERBOSE) != 0)
5454 		(void) printf(dgettext(TEXT_DOMAIN,
5455 		    "Found %d soft partition(s) on %s.\n"), num_sps,
5456 		    compnp->cname);
5457 
5458 	if (num_sps == 0) {
5459 		(void) printf(dgettext(TEXT_DOMAIN,
5460 		    "%s: No soft partitions.\n"), compnp->cname);
5461 		return (mdmderror(ep, MDE_RECOVER_FAILED, 0, compnp->cname));
5462 	}
5463 
5464 	/* check sequence numbers */
5465 	if ((options & MDCMD_VERBOSE) != 0)
5466 		(void) printf(dgettext(TEXT_DOMAIN,
5467 		    "Checking sequence numbers.\n"));
5468 
5469 	if (meta_sp_checkseq(extlist) != 0)
5470 		return (mdmderror(ep, MDE_RECOVER_FAILED, 0, compnp->cname));
5471 
5472 	return (0);
5473 }
5474 
5475 /*
5476  * FUNCTION:	meta_sp_validate_unit()
5477  * INPUT:	sp	- name of set we are recovering in
5478  *		compnp	- name of component we are recovering from
5479  *		options	- metarecover options
5480  * OUTPUT:	ep	- error pointer returned
5481  * RETURNS:	int	- 0 - success, -1 - error
5482  * PURPOSE:	validate and display metadb configuration.  begin by getting
5483  *		all soft partitions built on the specified component.  get
5484  *		the unit structure for each one and validate the fields within.
5485  */
5486 static int
5487 meta_sp_validate_unit(
5488 	mdsetname_t	*sp,
5489 	mdname_t	*compnp,
5490 	mdcmdopts_t	options,
5491 	md_error_t	*ep
5492 )
5493 {
5494 	md_sp_t		*msp;
5495 	mdnamelist_t	*spnlp = NULL;
5496 	mdnamelist_t	*namep = NULL;
5497 	int		count;
5498 	uint_t		extn;
5499 	sp_ext_length_t	size;
5500 
5501 	if ((options & MDCMD_VERBOSE) != 0)
5502 		(void) printf(dgettext(TEXT_DOMAIN,
5503 		    "%s: Validating soft partition metadb entries.\n"),
5504 		    compnp->cname);
5505 
5506 	if ((size = metagetsize(compnp, ep)) == MD_DISKADDR_ERROR)
5507 		return (-1);
5508 
5509 	/* get all soft partitions on component */
5510 	count = meta_sp_get_by_component(sp, compnp, &spnlp, 0, ep);
5511 
5512 	if (count == 0) {
5513 		(void) printf(dgettext(TEXT_DOMAIN,
5514 		    "%s: No soft partitions.\n"), compnp->cname);
5515 		return (mdmderror(ep, MDE_RECOVER_FAILED, 0, compnp->cname));
5516 	} else if (count < 0) {
5517 		return (-1);
5518 	}
5519 
5520 	/* Now go through the soft partitions and check each one */
5521 	for (namep = spnlp; namep != NULL; namep = namep->next) {
5522 		mdname_t	*curnp = namep->namep;
5523 		sp_ext_offset_t	curvoff;
5524 
5525 		/* get the unit structure */
5526 		if ((msp = meta_get_sp_common(sp, curnp, 0, ep)) == NULL)
5527 			return (-1);
5528 
5529 		/* verify generic unit structure parameters */
5530 		if ((options & MDCMD_VERBOSE) != 0)
5531 			(void) printf(dgettext(TEXT_DOMAIN,
5532 			    "\nVerifying device %s.\n"),
5533 			    curnp->cname);
5534 
5535 		/*
5536 		 * MD_SP_LAST is an invalid state and is always the
5537 		 * highest numbered.
5538 		 */
5539 		if (msp->status >= MD_SP_LAST) {
5540 			(void) printf(dgettext(TEXT_DOMAIN,
5541 			    "%s: status value %u is out of range.\n"),
5542 			    curnp->cname, msp->status);
5543 			return (mdmderror(ep, MDE_RECOVER_FAILED,
5544 			    0, curnp->cname));
5545 		} else if ((options & MDCMD_VERBOSE) != 0) {
5546 			uint_t	tstate = 0;
5547 
5548 			if (metaismeta(msp->compnamep)) {
5549 				if (meta_get_tstate(msp->common.namep->dev,
5550 				    &tstate, ep) != 0)
5551 					return (-1);
5552 			}
5553 			(void) printf(dgettext(TEXT_DOMAIN,
5554 			    "%s: Status \"%s\" is valid.\n"),
5555 			    curnp->cname, meta_sp_status_to_name(msp->status,
5556 			    tstate & MD_DEV_ERRORED));
5557 		}
5558 
5559 		/* Now verify each extent */
5560 		if ((options & MDCMD_VERBOSE) != 0)
5561 			(void) printf("%14s %21s %21s %21s\n",
5562 			    dgettext(TEXT_DOMAIN, "Extent Number"),
5563 			    dgettext(TEXT_DOMAIN, "Virtual Offset"),
5564 			    dgettext(TEXT_DOMAIN, "Physical Offset"),
5565 			    dgettext(TEXT_DOMAIN, "Length"));
5566 
5567 		curvoff = 0ULL;
5568 		for (extn = 0; extn < msp->ext.ext_len; extn++) {
5569 			md_sp_ext_t	*extp = &msp->ext.ext_val[extn];
5570 
5571 			if ((options & MDCMD_VERBOSE) != 0)
5572 				(void) printf("%14u %21llu %21llu %21llu\n",
5573 				    extn, extp->voff, extp->poff, extp->len);
5574 
5575 			if (extp->voff != curvoff) {
5576 				(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5577 				    "%s: virtual offset for extent %u "
5578 				    "is inconsistent, expected %llu, "
5579 				    "got %llu.\n"), curnp->cname, extn,
5580 				    curvoff, extp->voff);
5581 				return (mdmderror(ep, MDE_RECOVER_FAILED,
5582 				    0, compnp->cname));
5583 			}
5584 
5585 			/* make sure extent does not drop off the end */
5586 			if ((extp->poff + extp->len) == size) {
5587 				(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5588 				    "%s: extent %u at offset %llu, "
5589 				    "length %llu exceeds the size of the "
5590 				    "device, %llu.\n"), curnp->cname,
5591 				    extn, extp->poff, extp->len, size);
5592 				return (mdmderror(ep, MDE_RECOVER_FAILED,
5593 				    0, compnp->cname));
5594 			}
5595 
5596 			curvoff += extp->len;
5597 		}
5598 	}
5599 	if (options & MDCMD_PRINT) {
5600 		(void) printf(dgettext(TEXT_DOMAIN,
5601 		    "%s: Soft Partition metadb configuration is valid\n"),
5602 		    compnp->cname);
5603 	}
5604 	return (0);
5605 }
5606 
5607 /*
5608  * FUNCTION:	meta_sp_validate_wm_and_unit()
5609  * INPUT:	sp	- name of set we are recovering in
5610  *		compnp	- name of device we are recovering from
5611  *		options	- metarecover options
5612  * OUTPUT:	ep	- error pointer returned
5613  * RETURNS:	int	- 0 - success, -1 error
5614  * PURPOSE:	cross-validate and display watermarks and metadb records.
5615  *		get both the unit structures for the soft partitions built
5616  *		on the specified component and the watermarks found on that
5617  *		component and check to make sure they are consistent with
5618  *		each other.
5619  */
5620 static int
5621 meta_sp_validate_wm_and_unit(
5622 	mdsetname_t	*sp,
5623 	mdname_t	*np,
5624 	mdcmdopts_t	options,
5625 	md_error_t	*ep
5626 )
5627 {
5628 	sp_ext_node_t	*wmlist = NULL;
5629 	sp_ext_node_t	*unitlist = NULL;
5630 	sp_ext_node_t	*unitext;
5631 	sp_ext_node_t	*wmext;
5632 	sp_ext_offset_t	tmpunitoff;
5633 	mdnamelist_t	*spnlp = NULL;
5634 	int		count;
5635 	int		rval = 0;
5636 	int		verbose = (options & MDCMD_VERBOSE);
5637 
5638 	/* get unit structure list */
5639 	count = meta_sp_get_by_component(sp, np, &spnlp, 0, ep);
5640 	if (count <= 0)
5641 		return (-1);
5642 
5643 	meta_sp_list_insert(NULL, NULL, &unitlist,
5644 	    metagetsize(np, ep) - MD_SP_WMSIZE, MD_SP_WMSIZE,
5645 	    EXTTYP_END, 0, EXTFLG_UPDATE, meta_sp_cmp_by_offset);
5646 
5647 	if (meta_sp_extlist_from_namelist(sp, spnlp, &unitlist, ep) == -1) {
5648 		metafreenamelist(spnlp);
5649 		return (-1);
5650 	}
5651 
5652 	metafreenamelist(spnlp);
5653 
5654 	meta_sp_list_freefill(&unitlist, metagetsize(np, ep));
5655 
5656 	if (meta_sp_extlist_from_wm(sp, np, &wmlist,
5657 	    meta_sp_cmp_by_offset, ep) < 0) {
5658 		meta_sp_list_free(&unitlist);
5659 		return (-1);
5660 	}
5661 
5662 	if (getenv(META_SP_DEBUG)) {
5663 		meta_sp_debug("meta_sp_validate_wm_and_unit: unit list:\n");
5664 		meta_sp_list_dump(unitlist);
5665 		meta_sp_debug("meta_sp_validate_wm_and_unit: wm list:\n");
5666 		meta_sp_list_dump(wmlist);
5667 	}
5668 
5669 	/*
5670 	 * step through both lists and compare allocated nodes.  Free
5671 	 * nodes and end watermarks may differ between the two but
5672 	 * that's generally ok, and if they're wrong will typically
5673 	 * cause misplaced allocated extents.
5674 	 */
5675 	if (verbose)
5676 		(void) printf(dgettext(TEXT_DOMAIN, "\n%s: Verifying metadb "
5677 		    "allocations match extent headers.\n"), np->cname);
5678 
5679 	unitext = unitlist;
5680 	wmext = wmlist;
5681 	while ((wmext != NULL) && (unitext != NULL)) {
5682 		/* find next allocated extents in each list */
5683 		while (wmext != NULL && wmext->ext_type != EXTTYP_ALLOC)
5684 			wmext = wmext->ext_next;
5685 
5686 		while (unitext != NULL && unitext->ext_type != EXTTYP_ALLOC)
5687 			unitext = unitext->ext_next;
5688 
5689 		if (wmext == NULL || unitext == NULL)
5690 			break;
5691 
5692 		if (verbose) {
5693 			(void) printf(dgettext(TEXT_DOMAIN,
5694 			    "Metadb extent:\n"));
5695 			meta_sp_display_exthdr();
5696 			meta_sp_display_ext(unitext);
5697 			(void) printf(dgettext(TEXT_DOMAIN,
5698 			    "Extent header extent:\n"));
5699 			meta_sp_display_exthdr();
5700 			meta_sp_display_ext(wmext);
5701 			(void) printf("\n");
5702 		}
5703 
5704 		if (meta_sp_validate_exts(np, wmext, unitext, ep) < 0)
5705 			rval = -1;
5706 
5707 		/*
5708 		 * if the offsets aren't equal, only increment the
5709 		 * lowest one in hopes of getting the lists back in sync.
5710 		 */
5711 		tmpunitoff = unitext->ext_offset;
5712 		if (unitext->ext_offset <= wmext->ext_offset)
5713 			unitext = unitext->ext_next;
5714 		if (wmext->ext_offset <= tmpunitoff)
5715 			wmext = wmext->ext_next;
5716 	}
5717 
5718 	/*
5719 	 * if both lists aren't at the end then there are extra
5720 	 * allocated nodes in one of them.
5721 	 */
5722 	if (wmext != NULL) {
5723 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5724 		    "%s: extent headers contain allocations not in "
5725 		    "the metadb\n\n"), np->cname);
5726 		rval = -1;
5727 	}
5728 
5729 	if (unitext != NULL) {
5730 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5731 		    "%s: metadb contains allocations not in the extent "
5732 		    "headers\n\n"), np->cname);
5733 		rval = -1;
5734 	}
5735 
5736 	if (options & MDCMD_PRINT) {
5737 		if (rval == 0) {
5738 			(void) printf(dgettext(TEXT_DOMAIN,
5739 			    "%s: Soft Partition metadb matches extent "
5740 			    "header configuration\n"), np->cname);
5741 		} else {
5742 			(void) printf(dgettext(TEXT_DOMAIN,
5743 			    "%s: Soft Partition metadb does not match extent "
5744 			    "header configuration\n"), np->cname);
5745 		}
5746 	}
5747 
5748 	return (rval);
5749 }
5750 
5751 /*
5752  * FUNCTION:	meta_sp_validate_exts()
5753  * INPUT:	compnp	- name pointer for device we are recovering from
5754  *		wmext	- extent node representing watermark
5755  *		unitext	- extent node from unit structure
5756  * OUTPUT:	ep	- return error pointer
5757  * RETURNS:	int	- 0 - succes, mdmderror return code - error
5758  * PURPOSE:	Takes two extent nodes and checks them against each other.
5759  *		offset, length, sequence number, set, and name are compared.
5760  */
5761 static int
5762 meta_sp_validate_exts(
5763 	mdname_t	*compnp,
5764 	sp_ext_node_t	*wmext,
5765 	sp_ext_node_t	*unitext,
5766 	md_error_t	*ep
5767 )
5768 {
5769 	if (wmext->ext_offset != unitext->ext_offset) {
5770 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5771 		    "%s: unit structure and extent header offsets differ.\n"),
5772 		    compnp->cname);
5773 		return (mdmderror(ep, MDE_RECOVER_FAILED, 0, compnp->cname));
5774 	}
5775 
5776 	if (wmext->ext_length != unitext->ext_length) {
5777 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5778 		    "%s: unit structure and extent header lengths differ.\n"),
5779 		    compnp->cname);
5780 		return (mdmderror(ep, MDE_RECOVER_FAILED, 0, compnp->cname));
5781 	}
5782 
5783 	if (wmext->ext_seq != unitext->ext_seq) {
5784 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5785 		    "%s: unit structure and extent header sequence numbers "
5786 		    "differ.\n"), compnp->cname);
5787 		return (mdmderror(ep, MDE_RECOVER_FAILED, 0, compnp->cname));
5788 	}
5789 
5790 	if (wmext->ext_type != unitext->ext_type) {
5791 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5792 		    "%s: unit structure and extent header types differ.\n"),
5793 		    compnp->cname);
5794 		return (mdmderror(ep, MDE_RECOVER_FAILED, 0, compnp->cname));
5795 	}
5796 
5797 	/*
5798 	 * If one has a set pointer and the other doesn't, error.
5799 	 * If both extents have setnames, then make sure they match
5800 	 * If both are NULL, it's ok, they match.
5801 	 */
5802 	if ((unitext->ext_setp == NULL) ^ (wmext->ext_setp == NULL)) {
5803 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5804 		    "%s: unit structure and extent header set values "
5805 		    "differ.\n"), compnp->cname);
5806 		return (mdmderror(ep, MDE_RECOVER_FAILED, 0, compnp->cname));
5807 	}
5808 
5809 	if (unitext->ext_setp != NULL) {
5810 		if (strcmp(unitext->ext_setp->setname,
5811 		    wmext->ext_setp->setname) != 0) {
5812 			(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5813 			    "%s: unit structure and extent header set names "
5814 			    "differ.\n"), compnp->cname);
5815 			return (mdmderror(ep, MDE_RECOVER_FAILED,
5816 			    0, compnp->cname));
5817 		}
5818 	}
5819 
5820 	/*
5821 	 * If one has a name pointer and the other doesn't, error.
5822 	 * If both extents have names, then make sure they match
5823 	 * If both are NULL, it's ok, they match.
5824 	 */
5825 	if ((unitext->ext_namep == NULL) ^ (wmext->ext_namep == NULL)) {
5826 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5827 		    "%s: unit structure and extent header name values "
5828 		    "differ.\n"), compnp->cname);
5829 		return (mdmderror(ep, MDE_RECOVER_FAILED, 0, compnp->cname));
5830 	}
5831 
5832 	if (unitext->ext_namep != NULL) {
5833 		if (strcmp(wmext->ext_namep->cname,
5834 		    unitext->ext_namep->cname) != 0) {
5835 			(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5836 			    "%s: unit structure and extent header names "
5837 			    "differ.\n"), compnp->cname);
5838 			return (mdmderror(ep, MDE_RECOVER_FAILED,
5839 			    0, compnp->cname));
5840 		}
5841 	}
5842 
5843 	return (0);
5844 }
5845 
5846 /*
5847  * FUNCTION:	update_sp_status()
5848  * INPUT:	sp	- name of set we are recovering in
5849  *		minors	- pointer to an array of soft partition minor numbers
5850  *		num_sps	- number of minor numbers in array
5851  *		status	- new status to be applied to all soft parts in array
5852  *		mn_set	- set if current set is a multi-node set
5853  * OUTPUT:	ep	- return error pointer
5854  * RETURNS:	int	- 0 - success, -1 - error
5855  * PURPOSE:	update  status of soft partitions to new status. minors is an
5856  *		array of minor numbers to apply the new status to.
5857  *		If mn_set is set, a message is sent to all nodes in the
5858  *		cluster to update the status locally.
5859  */
5860 static int
5861 update_sp_status(
5862 	mdsetname_t	*sp,
5863 	minor_t		*minors,
5864 	int		num_sps,
5865 	sp_status_t	status,
5866 	bool_t		mn_set,
5867 	md_error_t	*ep
5868 )
5869 {
5870 	int	i;
5871 	int	err = 0;
5872 
5873 	if (mn_set) {
5874 		md_mn_msg_sp_setstat_t	sp_setstat_params;
5875 		int			result;
5876 		md_mn_result_t		*resp = NULL;
5877 
5878 		for (i = 0; i < num_sps; i++) {
5879 			sp_setstat_params.sp_setstat_mnum = minors[i];
5880 			sp_setstat_params.sp_setstat_status = status;
5881 
5882 			result = mdmn_send_message(sp->setno,
5883 			    MD_MN_MSG_SP_SETSTAT, MD_MSGF_DEFAULT_FLAGS,
5884 			    (char *)&sp_setstat_params,
5885 			    sizeof (sp_setstat_params),
5886 			    &resp, ep);
5887 			if (resp != NULL) {
5888 				if (resp->mmr_exitval != 0)
5889 					err = -1;
5890 				free_result(resp);
5891 			}
5892 			if (result != 0) {
5893 				err = -1;
5894 			}
5895 		}
5896 	} else {
5897 		if (meta_sp_setstatus(sp, minors, num_sps, status, ep) < 0)
5898 			err = -1;
5899 	}
5900 	if (err < 0) {
5901 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5902 		    "Error updating status on recovered soft "
5903 		    "partitions.\n"));
5904 	}
5905 	return (err);
5906 }
5907 
5908 /*
5909  * FUNCTION:	meta_sp_recover_from_wm()
5910  * INPUT:	sp	- name of set we are recovering in
5911  *		compnp	- name pointer for component we are recovering from
5912  *		options	- metarecover options
5913  * OUTPUT:	ep	- return error pointer
5914  * RETURNS:	int	- 0 - success, -1 - error
5915  * PURPOSE:	update metadb records to match watermarks.  begin by getting
5916  *		an extlist representing all soft partitions on the component.
5917  *		then build a unit structure for each soft partition.
5918  *		notify user of changes, then commit each soft partition to
5919  *		the metadb one at a time in the "recovering" state.  update
5920  *		any watermarks that may need it	(to reflect possible name
5921  *		changes), and, finally, set the status of all recovered
5922  *		partitions to the "OK" state at once.
5923  */
5924 static int
5925 meta_sp_recover_from_wm(
5926 	mdsetname_t	*sp,
5927 	mdname_t	*compnp,
5928 	mdcmdopts_t	options,
5929 	md_error_t	*ep
5930 )
5931 {
5932 	sp_ext_node_t		*extlist = NULL;
5933 	sp_ext_node_t		*sp_list = NULL;
5934 	sp_ext_node_t		*update_list = NULL;
5935 	sp_ext_node_t		*ext;
5936 	sp_ext_node_t		*sp_ext;
5937 	mp_unit_t		*mp;
5938 	mp_unit_t		**un_array;
5939 	int			numexts = 0, num_sps = 0, i = 0;
5940 	int			err = 0;
5941 	int			not_recovered = 0;
5942 	int			committed = 0;
5943 	sp_ext_length_t		sp_length = 0LL;
5944 	mdnamelist_t		*keynlp = NULL;
5945 	mdname_t		*np;
5946 	mdname_t		*new_np;
5947 	int			new_name;
5948 	md_set_params_t		set_params;
5949 	minor_t			*minors = NULL;
5950 	char			yesno[255];
5951 	char			*yes;
5952 	bool_t			mn_set = 0;
5953 	md_set_desc		*sd;
5954 	mm_unit_t		*mm;
5955 	md_set_mmown_params_t	*ownpar = NULL;
5956 	int			comp_is_mirror = 0;
5957 
5958 	/*
5959 	 * if this component appears in another metadevice already, do
5960 	 * NOT recover from it.
5961 	 */
5962 	if (meta_check_inmeta(sp, compnp, options, 0, -1, ep) != 0)
5963 		return (-1);
5964 
5965 	/* set flag if dealing with a MN set */
5966 	if (!metaislocalset(sp)) {
5967 		if ((sd = metaget_setdesc(sp, ep)) == NULL) {
5968 			return (-1);
5969 		}
5970 		if (MD_MNSET_DESC(sd))
5971 			mn_set = 1;
5972 	}
5973 	/*
5974 	 * for each watermark, build an ext_node, place on list.
5975 	 */
5976 	if (meta_sp_extlist_from_wm(sp, compnp, &extlist,
5977 	    meta_sp_cmp_by_nameseq, ep) < 0)
5978 		return (mdmderror(ep, MDE_RECOVER_FAILED, 0, compnp->cname));
5979 
5980 	assert(extlist != NULL);
5981 
5982 	/* count number of soft partitions */
5983 	for (ext = extlist;
5984 	    ext != NULL && ext->ext_type == EXTTYP_ALLOC;
5985 	    ext = ext->ext_next) {
5986 		if (ext->ext_next != NULL &&
5987 		    ext->ext_next->ext_namep != NULL &&
5988 		    strcmp(ext->ext_next->ext_namep->cname,
5989 			ext->ext_namep->cname) == 0)
5990 				continue;
5991 		num_sps++;
5992 	}
5993 
5994 	/* allocate array of unit structure pointers */
5995 	un_array = Zalloc(num_sps * sizeof (mp_unit_t *));
5996 
5997 	/*
5998 	 * build unit structures from list of ext_nodes.
5999 	 */
6000 	for (ext = extlist;
6001 	    ext != NULL && ext->ext_type == EXTTYP_ALLOC;
6002 	    ext = ext->ext_next) {
6003 		meta_sp_list_insert(ext->ext_setp, ext->ext_namep,
6004 		    &sp_list, ext->ext_offset, ext->ext_length,
6005 		    ext->ext_type, ext->ext_seq, ext->ext_flags,
6006 		    meta_sp_cmp_by_nameseq);
6007 
6008 		numexts++;
6009 		sp_length += ext->ext_length - MD_SP_WMSIZE;
6010 
6011 		if (ext->ext_next != NULL &&
6012 		    ext->ext_next->ext_namep != NULL &&
6013 		    strcmp(ext->ext_next->ext_namep->cname,
6014 			ext->ext_namep->cname) == 0)
6015 				continue;
6016 
6017 		/*
6018 		 * if we made it here, we are at a soft partition
6019 		 * boundary in the list.
6020 		 */
6021 		if (getenv(META_SP_DEBUG)) {
6022 			meta_sp_debug("meta_recover_from_wm: dumping wm "
6023 			    "list:\n");
6024 			meta_sp_list_dump(sp_list);
6025 		}
6026 
6027 		assert(sp_list != NULL);
6028 		assert(sp_list->ext_namep != NULL);
6029 
6030 		if ((new_name = meta_sp_resolve_name_conflict(sp,
6031 		    sp_list->ext_namep, &new_np, ep)) < 0) {
6032 			err = 1;
6033 			goto out;
6034 		} else if (new_name) {
6035 			for (sp_ext = sp_list;
6036 			    sp_ext != NULL;
6037 			    sp_ext = sp_ext->ext_next) {
6038 				/*
6039 				 * insert into the update list for
6040 				 * watermark update.
6041 				 */
6042 				meta_sp_list_insert(sp_ext->ext_setp,
6043 				    new_np, &update_list, sp_ext->ext_offset,
6044 				    sp_ext->ext_length, sp_ext->ext_type,
6045 				    sp_ext->ext_seq, EXTFLG_UPDATE,
6046 				    meta_sp_cmp_by_offset);
6047 			}
6048 
6049 		}
6050 		if (options & MDCMD_DOIT) {
6051 			/* store name in namespace */
6052 			if (mn_set) {
6053 				/* send message to all nodes to return key */
6054 				md_mn_msg_addkeyname_t	*send_params;
6055 				int			result;
6056 				md_mn_result_t		*resp = NULL;
6057 				int			message_size;
6058 
6059 				message_size =  sizeof (*send_params) +
6060 				    strlen(compnp->cname) + 1;
6061 				send_params = Zalloc(message_size);
6062 				send_params->addkeyname_setno = sp->setno;
6063 				(void) strcpy(&send_params->addkeyname_name[0],
6064 				    compnp->cname);
6065 				result = mdmn_send_message(sp->setno,
6066 				    MD_MN_MSG_ADDKEYNAME, MD_MSGF_DEFAULT_FLAGS,
6067 				    (char *)send_params, message_size, &resp,
6068 				    ep);
6069 				Free(send_params);
6070 				if (resp != NULL) {
6071 					if (resp->mmr_exitval >= 0) {
6072 						compnp->key =
6073 						    (mdkey_t)resp->mmr_exitval;
6074 					} else {
6075 						err = 1;
6076 						free_result(resp);
6077 						goto out;
6078 					}
6079 					free_result(resp);
6080 				}
6081 				if (result != 0) {
6082 					err = 1;
6083 					goto out;
6084 				}
6085 				(void) metanamelist_append(&keynlp, compnp);
6086 			} else {
6087 				if (add_key_name(sp, compnp, &keynlp,
6088 				    ep) != 0) {
6089 					err = 1;
6090 					goto out;
6091 				}
6092 			}
6093 		}
6094 
6095 		/* create the unit structure */
6096 		if ((mp = meta_sp_createunit(
6097 		    (new_name) ? new_np : sp_list->ext_namep, compnp,
6098 		    sp_list, numexts, sp_length, MD_SP_RECOVER, ep)) == NULL) {
6099 			err = 1;
6100 			goto out;
6101 		}
6102 
6103 		if (getenv(META_SP_DEBUG)) {
6104 			meta_sp_debug("meta_sp_recover_from_wm: "
6105 			    "printing newly created unit structure");
6106 			meta_sp_printunit(mp);
6107 		}
6108 
6109 		/* place in unit structure array */
6110 		un_array[i++] = mp;
6111 
6112 		/* free sp_list */
6113 		meta_sp_list_free(&sp_list);
6114 		sp_list = NULL;
6115 		numexts = 0;
6116 		sp_length = 0LL;
6117 	}
6118 
6119 	/* display configuration updates */
6120 	(void) printf(dgettext(TEXT_DOMAIN,
6121 	    "The following soft partitions were found and will be added to\n"
6122 	    "your metadevice configuration.\n"));
6123 	(void) printf("%5s %15s %18s\n",
6124 	    dgettext(TEXT_DOMAIN, "Name"),
6125 	    dgettext(TEXT_DOMAIN, "Size"),
6126 	    dgettext(TEXT_DOMAIN, "No. of Extents"));
6127 	for (i = 0; i < num_sps; i++) {
6128 		(void) printf("%5s%lu %15llu %9d\n", "d",
6129 		    MD_MIN2UNIT(MD_SID(un_array[i])),
6130 		    un_array[i]->un_length, un_array[i]->un_numexts);
6131 	}
6132 
6133 	if (!(options & MDCMD_DOIT)) {
6134 		not_recovered = 1;
6135 		goto out;
6136 	}
6137 
6138 	/* ask user for confirmation */
6139 	(void) printf(dgettext(TEXT_DOMAIN,
6140 	    "WARNING: You are about to add one or more soft partition\n"
6141 	    "metadevices to your metadevice configuration.  If there\n"
6142 	    "appears to be an error in the soft partition(s) displayed\n"
6143 	    "above, do NOT proceed with this recovery operation.\n"));
6144 	(void) printf(dgettext(TEXT_DOMAIN,
6145 	    "Are you sure you want to do this (yes/no)? "));
6146 
6147 	(void) fflush(stdout);
6148 	if ((fgets(yesno, sizeof (yesno), stdin) == NULL) ||
6149 	    (strlen(yesno) == 1))
6150 		(void) snprintf(yesno, sizeof (yesno), "%s\n",
6151 		    dgettext(TEXT_DOMAIN, "no"));
6152 	yes = dgettext(TEXT_DOMAIN, "yes");
6153 	if (strncasecmp(yesno, yes, strlen(yesno) - 1) != 0) {
6154 		not_recovered = 1;
6155 		goto out;
6156 	}
6157 
6158 	/* commit records one at a time */
6159 	for (i = 0; i < num_sps; i++) {
6160 		(void) memset(&set_params, 0, sizeof (set_params));
6161 		set_params.mnum = MD_SID(un_array[i]);
6162 		set_params.size = (un_array[i])->c.un_size;
6163 		set_params.mdp = (uintptr_t)(un_array[i]);
6164 		set_params.options =
6165 				meta_check_devicesize(un_array[i]->un_length);
6166 		if (set_params.options == MD_CRO_64BIT) {
6167 			un_array[i]->c.un_revision |= MD_64BIT_META_DEV;
6168 		} else {
6169 			un_array[i]->c.un_revision &= ~MD_64BIT_META_DEV;
6170 		}
6171 		MD_SETDRIVERNAME(&set_params, MD_SP,
6172 		    MD_MIN2SET(set_params.mnum));
6173 
6174 		np = metamnumname(&sp, MD_SID(un_array[i]), 0, ep);
6175 
6176 		/*
6177 		 * If this is an MN set, send the MD_IOCSET ioctl to all nodes
6178 		 */
6179 		if (mn_set) {
6180 			md_mn_msg_iocset_t	send_params;
6181 			int			result;
6182 			md_mn_result_t		*resp = NULL;
6183 			int			mess_size;
6184 
6185 			/*
6186 			 * Calculate message size. md_mn_msg_iocset_t only
6187 			 * contains one extent, so increment the size to
6188 			 * include all extents
6189 			 */
6190 			mess_size = sizeof (send_params) -
6191 			    sizeof (mp_ext_t) +
6192 			    (un_array[i]->un_numexts * sizeof (mp_ext_t));
6193 
6194 			send_params.iocset_params = set_params;
6195 			(void) memcpy(&send_params.unit, un_array[i],
6196 			    sizeof (*un_array[i]) - sizeof (mp_ext_t) +
6197 			    (un_array[i]->un_numexts * sizeof (mp_ext_t)));
6198 			result = mdmn_send_message(sp->setno,
6199 			    MD_MN_MSG_IOCSET, MD_MSGF_DEFAULT_FLAGS,
6200 			    (char *)&send_params, mess_size, &resp,
6201 			    ep);
6202 			if (resp != NULL) {
6203 				if (resp->mmr_exitval != 0)
6204 					err = 1;
6205 				free_result(resp);
6206 			}
6207 			if (result != 0) {
6208 				err = 1;
6209 			}
6210 		} else {
6211 			if (metaioctl(MD_IOCSET, &set_params, &set_params.mde,
6212 			    np->cname) != 0) {
6213 				err = 1;
6214 			}
6215 		}
6216 
6217 		if (err == 1) {
6218 			(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
6219 			    "%s: Error committing record to metadb.\n"),
6220 			    np->cname);
6221 			goto out;
6222 		}
6223 
6224 		/* note that we've committed a record */
6225 		if (!committed)
6226 			committed = 1;
6227 
6228 		/* update any watermarks that need it */
6229 		if (update_list != NULL) {
6230 			md_sp_t *msp;
6231 
6232 			/*
6233 			 * Check to see if we're trying to create a partition
6234 			 * on a mirror. If so we may have to enforce an
6235 			 * ownership change before writing the watermark out.
6236 			 */
6237 			if (metaismeta(compnp)) {
6238 				char *miscname;
6239 
6240 				miscname = metagetmiscname(compnp, ep);
6241 				if (miscname != NULL)
6242 					comp_is_mirror = (strcmp(miscname,
6243 					    MD_MIRROR) == 0);
6244 				else
6245 					comp_is_mirror = 0;
6246 			}
6247 			/*
6248 			 * If this is a MN set and the component is a mirror,
6249 			 * change ownership to this node in order to write the
6250 			 * watermarks
6251 			 */
6252 			if (mn_set && comp_is_mirror) {
6253 				mm = (mm_unit_t *)meta_get_unit(sp, compnp, ep);
6254 				if (mm == NULL) {
6255 					err = 1;
6256 					goto out;
6257 				} else {
6258 					err = meta_mn_change_owner(&ownpar,
6259 						sp->setno,
6260 						meta_getminor(compnp->dev),
6261 						sd->sd_mn_mynode->nd_nodeid,
6262 						MD_MN_MM_PREVENT_CHANGE |
6263 						    MD_MN_MM_SPAWN_THREAD);
6264 					if (err != 0)
6265 						goto out;
6266 				}
6267 			}
6268 
6269 			if ((msp = meta_get_sp(sp, np, ep)) == NULL) {
6270 				err = 1;
6271 				(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
6272 				    "%s: Error updating extent headers.\n"),
6273 				    np->cname);
6274 				goto out;
6275 			}
6276 			if (meta_sp_update_wm(sp, msp, update_list, ep) < 0) {
6277 				err = 1;
6278 				(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
6279 				    "%s: Error updating extent headers "
6280 				    "on disk.\n"), np->cname);
6281 				goto out;
6282 			}
6283 		}
6284 		/*
6285 		 * If we have changed ownership earlier and prevented any
6286 		 * ownership changes, we can now allow ownership changes
6287 		 * again.
6288 		 */
6289 		if (ownpar) {
6290 			(void) meta_mn_change_owner(&ownpar, sp->setno,
6291 			    ownpar->d.mnum,
6292 			    ownpar->d.owner,
6293 			    MD_MN_MM_ALLOW_CHANGE | MD_MN_MM_SPAWN_THREAD);
6294 		}
6295 	}
6296 
6297 	/* update status of all soft partitions to OK */
6298 	minors = Zalloc(num_sps * sizeof (minor_t));
6299 	for (i = 0; i < num_sps; i++)
6300 		minors[i] = MD_SID(un_array[i]);
6301 
6302 	err = update_sp_status(sp, minors, num_sps, MD_SP_OK, mn_set, ep);
6303 	if (err != 0)
6304 		goto out;
6305 
6306 	if (options & MDCMD_PRINT)
6307 		(void) printf(dgettext(TEXT_DOMAIN, "%s: "
6308 		    "Soft Partitions recovered from device.\n"),
6309 		    compnp->cname);
6310 out:
6311 	/* free memory */
6312 	if (extlist != NULL)
6313 		meta_sp_list_free(&extlist);
6314 	if (sp_list != NULL)
6315 		meta_sp_list_free(&sp_list);
6316 	if (update_list != NULL)
6317 		meta_sp_list_free(&update_list);
6318 	if (un_array != NULL)	{
6319 		for (i = 0; i < num_sps; i++)
6320 			Free(un_array[i]);
6321 		Free(un_array);
6322 	}
6323 	if (minors != NULL)
6324 		Free(minors);
6325 	if (ownpar != NULL)
6326 		Free(ownpar);
6327 	(void) fflush(stdout);
6328 
6329 	if ((keynlp != NULL) && (committed != 1)) {
6330 		/*
6331 		 * if we haven't committed any softparts, either because of an
6332 		 * error or because the user decided not to proceed, delete
6333 		 * namelist key for the component
6334 		 */
6335 		if (mn_set) {
6336 			mdnamelist_t	*p;
6337 
6338 			for (p = keynlp; (p != NULL); p = p->next) {
6339 				mdname_t		*np = p->namep;
6340 				md_mn_msg_delkeyname_t	send_params;
6341 				md_mn_result_t		*resp = NULL;
6342 
6343 				send_params.delkeyname_dev = np->dev;
6344 				send_params.delkeyname_setno = sp->setno;
6345 				send_params.delkeyname_key = np->key;
6346 				(void) mdmn_send_message(sp->setno,
6347 				    MD_MN_MSG_DELKEYNAME, MD_MSGF_DEFAULT_FLAGS,
6348 				    (char *)&send_params, sizeof (send_params),
6349 				    &resp, ep);
6350 				if (resp != NULL) {
6351 					free_result(resp);
6352 				}
6353 			}
6354 		} else {
6355 			(void) del_key_names(sp, keynlp, NULL);
6356 		}
6357 	}
6358 
6359 	metafreenamelist(keynlp);
6360 
6361 	if (err)
6362 		return (mdmderror(ep, MDE_RECOVER_FAILED, 0, compnp->cname));
6363 
6364 	if (not_recovered)
6365 		if (options & MDCMD_PRINT)
6366 			(void) printf(dgettext(TEXT_DOMAIN, "%s: "
6367 			    "Soft Partitions NOT recovered from device.\n"),
6368 			    compnp->cname);
6369 	return (0);
6370 }
6371 
6372 /*
6373  * FUNCTION:	meta_sp_recover_from_unit()
6374  * INPUT:	sp	- name of set we are recovering in
6375  *		compnp	- name of component we are recovering from
6376  *		options	- metarecover options
6377  * OUTPUT:	ep	- return error pointer
6378  * RETURNS:	int	- 0 - success, -1 - error
6379  * PURPOSE:	update watermarks to match metadb records.  begin by getting
6380  *		a namelist representing all soft partitions on the specified
6381  *		component.  then, build an extlist representing the soft
6382  *		partitions, filling in the freespace extents.  notify user
6383  *		of changes, place all soft partitions into the "recovering"
6384  *		state and update the watermarks.  finally, return all soft
6385  *		partitions to the "OK" state.
6386  */
6387 static int
6388 meta_sp_recover_from_unit(
6389 	mdsetname_t	*sp,
6390 	mdname_t	*compnp,
6391 	mdcmdopts_t	options,
6392 	md_error_t	*ep
6393 )
6394 {
6395 	mdnamelist_t	*spnlp = NULL;
6396 	mdnamelist_t	*nlp = NULL;
6397 	sp_ext_node_t	*ext = NULL;
6398 	sp_ext_node_t	*extlist = NULL;
6399 	int		count;
6400 	char		yesno[255];
6401 	char		*yes;
6402 	int		rval = 0;
6403 	minor_t		*minors = NULL;
6404 	int		i;
6405 	md_sp_t		*msp;
6406 	md_set_desc	*sd;
6407 	bool_t		mn_set = 0;
6408 	daddr_t		start_block;
6409 
6410 	count = meta_sp_get_by_component(sp, compnp, &spnlp, 0, ep);
6411 	if (count <= 0)
6412 		return (-1);
6413 
6414 	/* set flag if dealing with a MN set */
6415 	if (!metaislocalset(sp)) {
6416 		if ((sd = metaget_setdesc(sp, ep)) == NULL) {
6417 			return (-1);
6418 		}
6419 		if (MD_MNSET_DESC(sd))
6420 			mn_set = 1;
6421 	}
6422 	/*
6423 	 * Save the XDR unit structure for one of the soft partitions;
6424 	 * we'll use this later to provide metadevice context to
6425 	 * update the watermarks so the device can be resolved by
6426 	 * devid instead of dev_t.
6427 	 */
6428 	if ((msp = meta_get_sp(sp, spnlp->namep, ep)) == NULL) {
6429 		metafreenamelist(spnlp);
6430 		return (-1);
6431 	}
6432 
6433 	if ((start_block = meta_sp_get_start(sp, compnp, ep)) ==
6434 	    MD_DISKADDR_ERROR) {
6435 		return (-1);
6436 	}
6437 
6438 	meta_sp_list_insert(NULL, NULL, &extlist, 0ULL, start_block,
6439 	    EXTTYP_RESERVED, 0, 0, meta_sp_cmp_by_offset);
6440 	meta_sp_list_insert(NULL, NULL, &extlist,
6441 	    metagetsize(compnp, ep) - MD_SP_WMSIZE, MD_SP_WMSIZE,
6442 	    EXTTYP_END, 0, EXTFLG_UPDATE, meta_sp_cmp_by_offset);
6443 
6444 	if (meta_sp_extlist_from_namelist(sp, spnlp, &extlist, ep) == -1) {
6445 		metafreenamelist(spnlp);
6446 		return (-1);
6447 	}
6448 
6449 	assert(extlist != NULL);
6450 	if ((options & MDCMD_VERBOSE) != 0) {
6451 		(void) printf(dgettext(TEXT_DOMAIN,
6452 		    "Updating extent headers on device %s from metadb.\n\n"),
6453 		    compnp->cname);
6454 		(void) printf(dgettext(TEXT_DOMAIN,
6455 		    "The following extent headers will be written:\n"));
6456 		meta_sp_display_exthdr();
6457 	}
6458 
6459 	meta_sp_list_freefill(&extlist, metagetsize(compnp, ep));
6460 
6461 	for (ext = extlist; ext != NULL; ext = ext->ext_next) {
6462 
6463 		/* mark every node for updating except the reserved space */
6464 		if (ext->ext_type != EXTTYP_RESERVED) {
6465 			ext->ext_flags |= EXTFLG_UPDATE;
6466 
6467 			/* print extent information */
6468 			if ((options & MDCMD_VERBOSE) != 0)
6469 				meta_sp_display_ext(ext);
6470 		}
6471 	}
6472 
6473 	/* request verification and then update all watermarks */
6474 	if ((options & MDCMD_DOIT) != 0) {
6475 
6476 		(void) printf(dgettext(TEXT_DOMAIN,
6477 		    "\nWARNING: You are about to overwrite portions of %s\n"
6478 		    "with soft partition metadata. The extent headers will be\n"
6479 		    "written to match the existing metadb configuration.  If\n"
6480 		    "the device was not previously setup with this\n"
6481 		    "configuration, data loss may result.\n\n"),
6482 		    compnp->cname);
6483 		(void) printf(dgettext(TEXT_DOMAIN,
6484 		    "Are you sure you want to do this (yes/no)? "));
6485 
6486 		(void) fflush(stdout);
6487 		if ((fgets(yesno, sizeof (yesno), stdin) == NULL) ||
6488 		    (strlen(yesno) == 1))
6489 			(void) snprintf(yesno, sizeof (yesno),
6490 			    "%s\n", dgettext(TEXT_DOMAIN, "no"));
6491 		yes = dgettext(TEXT_DOMAIN, "yes");
6492 		if (strncasecmp(yesno, yes, strlen(yesno) - 1) == 0) {
6493 			/* place soft partitions into recovering state */
6494 			minors = Zalloc(count * sizeof (minor_t));
6495 			for (nlp = spnlp, i = 0;
6496 			    nlp != NULL && i < count;
6497 			    nlp = nlp->next, i++) {
6498 				assert(nlp->namep != NULL);
6499 				minors[i] = meta_getminor(nlp->namep->dev);
6500 			}
6501 			if (update_sp_status(sp, minors, count,
6502 			    MD_SP_RECOVER, mn_set, ep) != 0) {
6503 				rval = -1;
6504 				goto out;
6505 			}
6506 
6507 			/* update the watermarks */
6508 			if (meta_sp_update_wm(sp, msp, extlist, ep) < 0) {
6509 				rval = -1;
6510 				goto out;
6511 			}
6512 
6513 			if (options & MDCMD_PRINT) {
6514 				(void) printf(dgettext(TEXT_DOMAIN, "%s: "
6515 				    "Soft Partitions recovered from metadb\n"),
6516 				    compnp->cname);
6517 			}
6518 
6519 			/* return soft partitions to the OK state */
6520 			if (update_sp_status(sp, minors, count,
6521 			    MD_SP_OK, mn_set, ep) != 0) {
6522 				rval = -1;
6523 				goto out;
6524 			}
6525 
6526 			rval = 0;
6527 			goto out;
6528 		}
6529 	}
6530 
6531 	if (options & MDCMD_PRINT) {
6532 		(void) printf(dgettext(TEXT_DOMAIN,
6533 		    "%s: Soft Partitions NOT recovered from metadb\n"),
6534 		    compnp->cname);
6535 	}
6536 
6537 out:
6538 	if (minors != NULL)
6539 		Free(minors);
6540 	metafreenamelist(spnlp);
6541 	meta_sp_list_free(&extlist);
6542 	(void) fflush(stdout);
6543 	return (rval);
6544 }
6545 
6546 
6547 /*
6548  * FUNCTION:	meta_sp_update_abr()
6549  * INPUT:	sp	- name of set we are recovering in
6550  * OUTPUT:	ep	- return error pointer
6551  * RETURNS:	int	- 0 - success, -1 - error
6552  * PURPOSE:	update the ABR state for all soft partitions in the set. This
6553  *		is called when joining a set. It sends a message to the master
6554  *		node for each soft partition to get the value of tstate and
6555  *		then sets ABR ,if required, by opening the sp, setting ABR
6556  *		and then closing the sp. This approach is taken rather that
6557  *		just issuing the MD_MN_SET_CAP ioctl, in order to deal with
6558  *		the case when we have another node simultaneously unsetting ABR.
6559  */
6560 int
6561 meta_sp_update_abr(
6562 	mdsetname_t	*sp,
6563 	md_error_t	*ep
6564 )
6565 {
6566 	mdnamelist_t	*devnlp = NULL;
6567 	mdnamelist_t	*p;
6568 	mdname_t	*devnp = NULL;
6569 	md_unit_t	*un;
6570 	char		fname[MAXPATHLEN];
6571 	int		mnum, fd;
6572 	volcap_t	vc;
6573 	uint_t		tstate;
6574 
6575 
6576 	if (meta_get_sp_names(sp, &devnlp, 0, ep) < 0) {
6577 		return (-1);
6578 	}
6579 
6580 	/* Exit if no soft partitions in this set */
6581 	if (devnlp == NULL)
6582 		return (0);
6583 
6584 	/* For each soft partition */
6585 	for (p = devnlp; (p != NULL); p = p->next) {
6586 		devnp = p->namep;
6587 
6588 		/* check if this is a top level metadevice */
6589 		if ((un = meta_get_mdunit(sp, devnp, ep)) == NULL)
6590 			goto out;
6591 		if (MD_HAS_PARENT(MD_PARENT(un))) {
6592 			Free(un);
6593 			continue;
6594 		}
6595 		Free(un);
6596 
6597 		/* Get tstate from Master */
6598 		if (meta_mn_send_get_tstate(devnp->dev, &tstate, ep) != 0) {
6599 			mdname_t	*np;
6600 			np = metamnumname(&sp, meta_getminor(devnp->dev), 0,
6601 			    ep);
6602 			if (np) {
6603 				md_perror(dgettext(TEXT_DOMAIN,
6604 				    "Unable to get tstate for %s"), np->cname);
6605 			}
6606 			continue;
6607 		}
6608 		/* If not set on the master, nothing to do */
6609 		if (!(tstate & MD_ABR_CAP))
6610 			continue;
6611 
6612 		mnum = meta_getminor(devnp->dev);
6613 		(void) snprintf(fname, MAXPATHLEN, "/dev/md/%s/rdsk/d%u",
6614 		    sp->setname, (unsigned)MD_MIN2UNIT(mnum));
6615 		if ((fd = open(fname, O_RDWR, 0)) < 0) {
6616 			md_perror(dgettext(TEXT_DOMAIN,
6617 			    "Could not open device %s"), fname);
6618 			continue;
6619 		}
6620 
6621 		/* Set ABR state */
6622 		vc.vc_info = 0;
6623 		vc.vc_set = 0;
6624 		if (ioctl(fd, DKIOCGETVOLCAP, &vc) < 0) {
6625 			(void) close(fd);
6626 			continue;
6627 		}
6628 
6629 		vc.vc_set = DKV_ABR_CAP;
6630 		if (ioctl(fd, DKIOCSETVOLCAP, &vc) < 0) {
6631 			(void) close(fd);
6632 			goto out;
6633 		}
6634 
6635 		(void) close(fd);
6636 	}
6637 	metafreenamelist(devnlp);
6638 	return (0);
6639 out:
6640 	metafreenamelist(devnlp);
6641 	return (-1);
6642 }
6643 
6644 /*
6645  * FUNCTION:	meta_mn_sp_update_abr()
6646  * INPUT:	arg	- Given set.
6647  * PURPOSE:	update the ABR state for all soft partitions in the set by
6648  *		forking a process to call meta_sp_update_abr()
6649  *		This function is only called via rpc.metad when adding a node
6650  *		to a set, ie this node is beong joined to the set by another
6651  *		node.
6652  */
6653 void *
6654 meta_mn_sp_update_abr(void *arg)
6655 {
6656 	set_t		setno = *((set_t *)arg);
6657 	mdsetname_t	*sp;
6658 	md_error_t	mde = mdnullerror;
6659 	int		fval;
6660 
6661 	/* should have a set */
6662 	assert(setno != NULL);
6663 
6664 	if ((sp = metasetnosetname(setno, &mde)) == NULL) {
6665 		mde_perror(&mde, "");
6666 		return (NULL);
6667 	}
6668 
6669 	if (!(meta_is_mn_set(sp, &mde))) {
6670 		mde_perror(&mde, "");
6671 		return (NULL);
6672 	}
6673 
6674 	/* fork a process */
6675 	if ((fval = md_daemonize(sp, &mde)) != 0) {
6676 		/*
6677 		 * md_daemonize will fork off a process.  The is the
6678 		 * parent or error.
6679 		 */
6680 		if (fval > 0) {
6681 			return (NULL);
6682 		}
6683 		mde_perror(&mde, "");
6684 		return (NULL);
6685 	}
6686 	/*
6687 	 * Child process should never return back to rpc.metad, but
6688 	 * should exit.
6689 	 * Flush all internally cached data inherited from parent process
6690 	 * since cached data will be cleared when parent process RPC request
6691 	 * has completed (which is possibly before this child process
6692 	 * can complete).
6693 	 * Child process can retrieve and cache its own copy of data from
6694 	 * rpc.metad that won't be changed by the parent process.
6695 	 *
6696 	 * Reset md_in_daemon since this child will be a client of rpc.metad
6697 	 * not part of the rpc.metad daemon itself.
6698 	 * md_in_daemon is used by rpc.metad so that libmeta can tell if
6699 	 * this thread is rpc.metad or any other thread.  (If this thread
6700 	 * was rpc.metad it could use some short circuit code to get data
6701 	 * directly from rpc.metad instead of doing an RPC call to rpc.metad).
6702 	 */
6703 	md_in_daemon = 0;
6704 	metaflushsetname(sp);
6705 	sr_cache_flush_setno(setno);
6706 	if ((sp = metasetnosetname(setno, &mde)) == NULL) {
6707 		mde_perror(&mde, "");
6708 		md_exit(sp, 1);
6709 	}
6710 
6711 
6712 	/*
6713 	 * Closing stdin/out/err here.
6714 	 */
6715 	(void) close(0);
6716 	(void) close(1);
6717 	(void) close(2);
6718 	assert(fval == 0);
6719 
6720 	(void) meta_sp_update_abr(sp, &mde);
6721 
6722 	md_exit(sp, 0);
6723 	/*NOTREACHED*/
6724 	return (NULL);
6725 }
6726