xref: /titanic_41/usr/src/lib/lvm/libmeta/common/meta_sp.c (revision e3e793b1c4791518f806cd8a8a4fafe1a675ba52)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 /*
29  * Just in case we're not in a build environment, make sure that
30  * TEXT_DOMAIN gets set to something.
31  */
32 #if !defined(TEXT_DOMAIN)
33 #define	TEXT_DOMAIN "SYS_TEST"
34 #endif
35 
36 /*
37  * soft partition operations
38  *
39  * Soft Partitions provide a virtual disk mechanism which is used to
40  * divide a large volume into many small pieces, each appearing as a
41  * separate device.  A soft partition consists of a series of extents,
42  * each having an offset and a length.  The extents are logically
43  * contiguous, so where the first extent leaves off the second extent
44  * picks up.  Which extent a given "virtual offset" belongs to is
45  * dependent on the size of all the previous extents in the soft
46  * partition.
47  *
48  * Soft partitions are represented in memory by an extent node
49  * (sp_ext_node_t) which contains all of the information necessary to
50  * create a unit structure and update the on-disk format, called
51  * "watermarks".  These extent nodes are typically kept in a doubly
52  * linked list and are manipulated by list manipulation routines.  A
53  * list of extents may represent all of the soft partitions on a volume,
54  * a single soft partition, or perhaps just a set of extents that need
55  * to be updated.  Extent lists may be sorted by offset or by name/seq#,
56  * depending on which compare function is used.  Most of the routines
57  * require the list be sorted by offset to work, and that's the typical
58  * configuration.
59  *
60  * In order to do an allocation, knowledge of all soft partitions on the
61  * volume is required.  Then free space is determined from the space
62  * that is not allocated, and new allocations can be made from the free
63  * space.  Once the new allocations are made, a unit structure is created
64  * and the watermarks are updated.  The status is then changed to "okay"
65  * on the unit structure to commit the transaction.  If updating the
66  * watermarks fails, the unit structure is in an intermediate state and
67  * the driver will not allow access to the device.
68  *
69  * A typical sequence of events is:
70  *     1. Fetch the list of names for all soft partitions on a volume
71  *         meta_sp_get_by_component()
72  *     2. Construct an extent list from the name list
73  *         meta_sp_extlist_from_namelist()
74  *     3. Fill the gaps in the extent list with free extents
75  *         meta_sp_list_freefill()
76  *     4. Allocate from the free extents
77  *         meta_sp_alloc_by_len()
78  *         meta_sp_alloc_by_list()
79  *     5. Create the unit structure from the extent list
80  *         meta_sp_createunit()
81  *         meta_sp_updateunit()
82  *     6. Write out the watermarks
83  *         meta_sp_update_wm()
84  *     7. Set the status to "Okay"
85  *         meta_sp_setstatus()
86  *
87  */
88 
89 #include <stdio.h>
90 #include <meta.h>
91 #include "meta_repartition.h"
92 #include <sys/lvm/md_sp.h>
93 #include <sys/lvm/md_crc.h>
94 #include <strings.h>
95 #include <sys/lvm/md_mirror.h>
96 #include <sys/bitmap.h>
97 
98 extern int	md_in_daemon;
99 
/*
 * In-memory description of a single extent.  Nodes are chained through
 * ext_next/ext_prev into a doubly linked list; the order of the list is
 * decided by the compare function passed to meta_sp_list_insert(), which
 * is also the routine that allocates and fills in these nodes.
 */
100 typedef struct sp_ext_node {
101 	struct sp_ext_node	*ext_next;	/* next element, NULL at the tail */
102 	struct sp_ext_node	*ext_prev;	/* previous element, NULL at head */
103 	sp_ext_type_t		ext_type;	/* type of extent */
104 	sp_ext_offset_t		ext_offset;	/* starting offset, in sectors */
105 	sp_ext_length_t		ext_length;	/* length of this node, in sectors */
106 	uint_t			ext_flags;	/* extent flags (EXTFLG_*) */
107 	uint32_t		ext_seq;	/* watermark seq no */
108 	mdname_t		*ext_namep;	/* name pointer (may be NULL) */
109 	mdsetname_t		*ext_setp;	/* set pointer */
110 } sp_ext_node_t;
111 
112 /*
 * extent flags, stored in sp_ext_node_t.ext_flags.  EXTFLG_UPDATE marks
 * an extent that needs to be updated (see the "flags" argument of
 * meta_sp_list_insert()).
 */
113 #define	EXTFLG_UPDATE	(1)
114 
115 /*
 * Extent node compare function for list sorting.  Called as
 * compare(node_in_list, new_node); a result < 0 means the first argument
 * sorts before the second, 0 means they are equal and > 0 means the first
 * sorts after the second.
 */
116 typedef int (*ext_cmpfunc_t)(sp_ext_node_t *, sp_ext_node_t *);
117 
118 
119 /*
 * Function Prototypes
 *
 * Forward declarations for the routines implemented in this file.  All
 * of them are file-local (static) with the single exception of
 * meta_sp_parsesize(), which has external linkage.
 */
120 
121 /* Debugging Functions */
122 static void meta_sp_debug(char *format, ...);
123 static void meta_sp_printunit(mp_unit_t *mp);
124 
125 /* Misc Support Functions */
126 int meta_sp_parsesize(char *s, sp_ext_length_t *szp);
127 static int meta_sp_parsesizestring(char *s, sp_ext_length_t *szp);
128 static int meta_sp_setgeom(mdname_t *np, mdname_t *compnp, mp_unit_t *mp,
129 	md_error_t *ep);
130 static int meta_sp_get_by_component(mdsetname_t *sp, mdname_t *compnp,
131     mdnamelist_t **nlpp, int force, md_error_t *ep);
132 static sp_ext_length_t meta_sp_get_default_alignment(mdsetname_t *sp,
133     mdname_t *compnp, md_error_t *ep);
134 
135 /* Extent List Manipulation Functions (operate on sp_ext_node_t lists) */
136 static int meta_sp_cmp_by_nameseq(sp_ext_node_t *e1, sp_ext_node_t *e2);
137 static int meta_sp_cmp_by_offset(sp_ext_node_t *e1, sp_ext_node_t *e2);
138 static void meta_sp_list_insert(mdsetname_t *sp, mdname_t *np,
139     sp_ext_node_t **head, sp_ext_offset_t offset, sp_ext_length_t length,
140     sp_ext_type_t type, uint_t seq, uint_t flags, ext_cmpfunc_t compare);
141 static void meta_sp_list_free(sp_ext_node_t **head);
142 static void meta_sp_list_remove(sp_ext_node_t **head, sp_ext_node_t *ext);
143 static sp_ext_length_t meta_sp_list_size(sp_ext_node_t *head,
144     sp_ext_type_t exttype, int exclude_wm);
145 static sp_ext_node_t *meta_sp_list_find(sp_ext_node_t *head,
146     sp_ext_offset_t offset);
147 static void meta_sp_list_freefill(sp_ext_node_t **extlist,
148     sp_ext_length_t size);
149 static void meta_sp_list_dump(sp_ext_node_t *head);
150 static int meta_sp_list_overlaps(sp_ext_node_t *head);
151 
152 /* Extent List Query Functions */
153 static boolean_t meta_sp_enough_space(int desired_number_of_sps,
154 	blkcnt_t desired_sp_size, sp_ext_node_t **extent_listpp,
155 	sp_ext_length_t alignment);
156 static boolean_t meta_sp_get_extent_list(mdsetname_t *mdsetnamep,
157 	mdname_t *device_mdnamep, sp_ext_node_t **extent_listpp,
158 	md_error_t *ep);
159 static boolean_t meta_sp_get_extent_list_for_drive(mdsetname_t *mdsetnamep,
160 	mddrivename_t *mddrivenamep, sp_ext_node_t **extent_listpp);
161 
162 
163 /* Extent Allocation Functions */
164 static void meta_sp_alloc_by_ext(mdsetname_t *sp, mdname_t *np,
165     sp_ext_node_t **extlist, sp_ext_node_t *free_ext,
166     sp_ext_offset_t alloc_offset, sp_ext_length_t alloc_length, uint_t seq);
167 static int meta_sp_alloc_by_len(mdsetname_t *sp, mdname_t *np,
168     sp_ext_node_t **extlist, sp_ext_length_t *lp,
169     sp_ext_offset_t last_off, sp_ext_length_t alignment);
170 static int meta_sp_alloc_by_list(mdsetname_t *sp, mdname_t *np,
171     sp_ext_node_t **extlist, sp_ext_node_t *oblist);
172 
173 /* Extent List Population Functions */
174 static int meta_sp_extlist_from_namelist(mdsetname_t *sp, mdnamelist_t *spnlp,
175     sp_ext_node_t **extlist, md_error_t *ep);
176 static int meta_sp_extlist_from_wm(mdsetname_t *sp, mdname_t *compnp,
177     sp_ext_node_t **extlist, ext_cmpfunc_t compare, md_error_t *ep);
178 
179 /* Print (metastat) Functions */
180 static int meta_sp_short_print(md_sp_t *msp, char *fname, FILE *fp,
181     mdprtopts_t options, md_error_t *ep);
182 static char *meta_sp_status_to_name(xsp_status_t xsp_status, uint_t tstate);
183 static int meta_sp_report(mdsetname_t *sp, md_sp_t *msp, mdnamelist_t **nlpp,
184     char *fname, FILE *fp, mdprtopts_t options, md_error_t *ep);
185 
186 /* Watermark Manipulation Functions */
187 static int meta_sp_update_wm(mdsetname_t *sp, md_sp_t *msp,
188     sp_ext_node_t *extlist, md_error_t *ep);
189 static int meta_sp_clear_wm(mdsetname_t *sp, md_sp_t *msp, md_error_t *ep);
190 static int meta_sp_read_wm(mdsetname_t *sp, mdname_t *compnp,
191     mp_watermark_t *wm, sp_ext_offset_t offset,  md_error_t *ep);
192 static diskaddr_t meta_sp_get_start(mdsetname_t *sp, mdname_t *compnp,
193     md_error_t *ep);
194 
195 /* Unit Structure Manipulation Functions */
196 static void meta_sp_fillextarray(mp_unit_t *mp, sp_ext_node_t *extlist);
197 static mp_unit_t *meta_sp_createunit(mdname_t *np, mdname_t *compnp,
198     sp_ext_node_t *extlist, int numexts, sp_ext_length_t len,
199     sp_status_t status, md_error_t *ep);
200 static mp_unit_t *meta_sp_updateunit(mdname_t *np,  mp_unit_t *old_un,
201     sp_ext_node_t *extlist, sp_ext_length_t grow_len, int numexts,
202     md_error_t *ep);
203 static int meta_create_sp(mdsetname_t *sp, md_sp_t *msp, sp_ext_node_t *oblist,
204     mdcmdopts_t options, sp_ext_length_t alignment, md_error_t *ep);
205 static int meta_check_sp(mdsetname_t *sp, md_sp_t *msp, mdcmdopts_t options,
206     int *repart_options, md_error_t *ep);
207 
208 /* Reset (metaclear) Functions */
209 static int meta_sp_reset_common(mdsetname_t *sp, mdname_t *np, md_sp_t *msp,
210     md_sp_reset_t reset_params, mdcmdopts_t options, md_error_t *ep);
211 
212 /* Recovery (metarecover) Functions */
213 static void meta_sp_display_exthdr(void);
214 static void meta_sp_display_ext(sp_ext_node_t *ext);
215 static int meta_sp_checkseq(sp_ext_node_t *extlist);
216 static int meta_sp_resolve_name_conflict(mdsetname_t *, mdname_t *,
217     mdname_t **, md_error_t *);
218 static int meta_sp_validate_wm(mdsetname_t *sp, mdname_t *np,
219     mdcmdopts_t options, md_error_t *ep);
220 static int meta_sp_validate_unit(mdsetname_t *sp, mdname_t *compnp,
221     mdcmdopts_t options, md_error_t *ep);
222 static int meta_sp_validate_wm_and_unit(mdsetname_t *sp, mdname_t *np,
223     mdcmdopts_t options, md_error_t *ep);
224 static int meta_sp_validate_exts(mdname_t *np, sp_ext_node_t *wmext,
225     sp_ext_node_t *unitext, md_error_t *ep);
226 static int meta_sp_recover_from_wm(mdsetname_t *sp, mdname_t *compnp,
227     mdcmdopts_t options, md_error_t *ep);
228 static int meta_sp_recover_from_unit(mdsetname_t *sp, mdname_t *np,
229     mdcmdopts_t options, md_error_t *ep);
230 
231 /*
232  * Private Constants
233  */
234 
/*
 * Named values for arguments that would otherwise be bare literals.
 * NOTE(review): the users of most of these are outside this part of the
 * file; the notes below that say "presumably" should be confirmed there.
 */
/* presumably the "force" argument of meta_sp_get_by_component() */
235 static const int FORCE_RELOAD_CACHE = 1;
/* presumably "no extent flags" for meta_sp_list_insert() */
236 static const uint_t NO_FLAGS = 0;
237 static const sp_ext_offset_t NO_OFFSET = 0ULL;
238 static const uint_t NO_SEQUENCE_NUMBER = 0;
239 static const int ONE_SOFT_PARTITION = 1;
240 
/* bitmap with one bit per unit, sized for MD_MAXUNITS units */
241 static unsigned long sp_parent_printed[BT_BITOUL(MD_MAXUNITS)];
242 
243 #define	TEST_SOFT_PARTITION_NAMEP NULL
244 #define	TEST_SETNAMEP NULL
245 
/* presumably values for the exclude_wm argument of meta_sp_list_size() */
246 #define	EXCLUDE_WM	(1)
247 #define	INCLUDE_WM	(0)
248 
/* alignment value meaning "no alignment requirement" */
249 #define	SP_UNALIGNED	(0LL)
250 
251 /*
252  * **************************************************************************
253  *                          Debugging Functions                             *
254  * **************************************************************************
255  */
256 
257 /*PRINTFLIKE1*/
258 static void
259 meta_sp_debug(char *format, ...)
260 {
261 	static int debug;
262 	static int debug_set = 0;
263 	va_list ap;
264 
265 	if (!debug_set) {
266 		debug = getenv(META_SP_DEBUG) ? 1 : 0;
267 		debug_set = 1;
268 	}
269 
270 	if (debug) {
271 		va_start(ap, format);
272 		(void) vfprintf(stderr, format, ap);
273 		va_end(ap);
274 	}
275 }
276 
277 static void
278 meta_sp_printunit(mp_unit_t *mp)
279 {
280 	int i;
281 
282 	if (mp == NULL)
283 		return;
284 
285 	/* print the common fields we know about */
286 	(void) fprintf(stderr, "\tmp->c.un_type: %d\n", mp->c.un_type);
287 	(void) fprintf(stderr, "\tmp->c.un_size: %u\n", mp->c.un_size);
288 	(void) fprintf(stderr, "\tmp->c.un_self_id: %lu\n", MD_SID(mp));
289 
290 	/* sp-specific fields */
291 	(void) fprintf(stderr, "\tmp->un_status: %u\n", mp->un_status);
292 	(void) fprintf(stderr, "\tmp->un_numexts: %u\n", mp->un_numexts);
293 	(void) fprintf(stderr, "\tmp->un_length: %llu\n", mp->un_length);
294 	(void) fprintf(stderr, "\tmp->un_dev(32): 0x%llx\n", mp->un_dev);
295 	(void) fprintf(stderr, "\tmp->un_dev(64): 0x%llx\n", mp->un_dev);
296 	(void) fprintf(stderr, "\tmp->un_key: %d\n", mp->un_key);
297 
298 	/* print extent information */
299 	(void) fprintf(stderr, "\tExt#\tvoff\t\tpoff\t\tLen\n");
300 	for (i = 0; i < mp->un_numexts; i++) {
301 		(void) fprintf(stderr, "\t%d\t%llu\t\t%llu\t\t%llu\n", i,
302 		    mp->un_ext[i].un_voff, mp->un_ext[i].un_poff,
303 		    mp->un_ext[i].un_len);
304 	}
305 }
306 
307 /*
308  * FUNCTION:    meta_sp_parsesize()
309  * INPUT:       s       - the string to parse
310  * OUTPUT:      *szp    - disk block count (0 for "all")
311  * RETURNS:     -1 for error, 0 for success
312  * PURPOSE:     parses the command line parameter that specifies the
313  *              requested size of a soft partition.  The input string
314  *              is either the literal "all" or a numeric value
315  *              followed by a single character, b for disk blocks, k
316  *              for kilobytes, m for megabytes, g for gigabytes, or t
317  *              for terabytes.  p for petabytes and e for exabytes
318  *              have been added as undocumented features for future
319  *              expansion.  For example, 100m is 100 megabytes, while
320  *              50g is 50 gigabytes.  All values are rounded up to the
321  *              nearest block size.
322  */
323 int
324 meta_sp_parsesize(char *s, sp_ext_length_t *szp)
325 {
326 	if (s == NULL || szp == NULL) {
327 		return (-1);
328 	}
329 
330 	/* Check for literal "all" */
331 	if (strcasecmp(s, "all") == 0) {
332 		*szp = 0;
333 		return (0);
334 	}
335 
336 	return (meta_sp_parsesizestring(s, szp));
337 }
338 
339 /*
340  * FUNCTION:	meta_sp_parsesizestring()
341  * INPUT:	s	- the string to parse
342  * OUTPUT:	*szp	- disk block count
343  * RETURNS:	-1 for error, 0 for success
344  * PURPOSE:	parses a string that specifies size. The input string is a
345  *		numeric value followed by a single character, b for disk blocks,
346  *		k for kilobytes, m for megabytes, g for gigabytes, or t for
347  *		terabytes.  p for petabytes and e for exabytes have been added
348  *		as undocumented features for future expansion.  For example,
349  *		100m is 100 megabytes, while 50g is 50 gigabytes.  All values
350  *		are rounded up to the nearest block size.
351  */
352 static int
353 meta_sp_parsesizestring(char *s, sp_ext_length_t *szp)
354 {
355 	sp_ext_length_t	len = 0;
356 	char		len_type[2];
357 
358 	if (s == NULL || szp == NULL) {
359 		return (-1);
360 	}
361 
362 	/*
363 	 * make sure block offset does not overflow 2^64 bytes.
364 	 */
365 	if ((sscanf(s, "%llu%1[BbKkMmGgTt]", &len, len_type) != 2) ||
366 	    (len == 0LL) ||
367 	    (len > (1LL << (64 - DEV_BSHIFT))))
368 		return (-1);
369 
370 	switch (len_type[0]) {
371 	case 'B':
372 	case 'b':
373 		len = lbtodb(roundup(len * DEV_BSIZE, DEV_BSIZE));
374 		break;
375 	case 'K':
376 	case 'k':
377 		len = lbtodb(roundup(len * 1024ULL, DEV_BSIZE));
378 		break;
379 	case 'M':
380 	case 'm':
381 		len = lbtodb(roundup(len * 1024ULL*1024ULL, DEV_BSIZE));
382 		break;
383 	case 'g':
384 	case 'G':
385 		len = lbtodb(roundup(len * 1024ULL*1024ULL*1024ULL, DEV_BSIZE));
386 		break;
387 	case 't':
388 	case 'T':
389 		len = lbtodb(roundup(len * 1024ULL*1024ULL*1024ULL*1024ULL,
390 		    DEV_BSIZE));
391 		break;
392 	case 'p':
393 	case 'P':
394 		len = lbtodb(roundup(
395 		    len * 1024ULL*1024ULL*1024ULL*1024ULL*1024ULL,
396 		    DEV_BSIZE));
397 		break;
398 	case 'e':
399 	case 'E':
400 		len = lbtodb(roundup(
401 		    len * 1024ULL*1024ULL*1024ULL*1024ULL*1024ULL*1024ULL,
402 		    DEV_BSIZE));
403 		break;
404 	default:
405 		/* error */
406 		return (-1);
407 	}
408 
409 	*szp = len;
410 	return (0);
411 }
412 
413 /*
414  * FUNCTION:	meta_sp_setgeom()
415  * INPUT:	np      - the underlying device to setup geometry for
416  *		compnp	- the underlying device to setup geometry for
417  *		mp	- the unit structure to set the geometry for
418  * OUTPUT:	ep	- return error pointer
419  * RETURNS:	int	- -1 if error, 0 otherwise
420  * PURPOSE:	establishes geometry information for a device
421  */
422 static int
423 meta_sp_setgeom(
424 	mdname_t	*np,
425 	mdname_t	*compnp,
426 	mp_unit_t	*mp,
427 	md_error_t	*ep
428 )
429 {
430 	mdgeom_t	*geomp;
431 	uint_t		round_cyl = 0;
432 
433 	if ((geomp = metagetgeom(compnp, ep)) == NULL)
434 		return (-1);
435 	if (meta_setup_geom((md_unit_t *)mp, np, geomp, geomp->write_reinstruct,
436 	    geomp->read_reinstruct, round_cyl, ep) != 0)
437 		return (-1);
438 
439 	return (0);
440 }
441 
442 /*
443  * FUNCTION:	meta_sp_setstatus()
444  * INPUT:	sp	- the set name for the devices to set the status on
445  *		minors	- an array of minor numbers of devices to set status on
446  *		num_units - number of entries in the array
447  *		status	- status value to set all units to
448  * OUTPUT:	ep	- return error pointer
449  * RETURNS:	int	- -1 if error, 0 success
450  * PURPOSE:	sets the status of one or more soft partitions to the
451  *		requested value
452  */
453 int
454 meta_sp_setstatus(
455 	mdsetname_t	*sp,
456 	minor_t		*minors,
457 	int		num_units,
458 	sp_status_t	status,
459 	md_error_t	*ep
460 )
461 {
462 	md_sp_statusset_t	status_params;
463 
464 	assert(minors != NULL);
465 
466 	/* update status of all soft partitions to the status passed in */
467 	(void) memset(&status_params, 0, sizeof (status_params));
468 	status_params.num_units = num_units;
469 	status_params.new_status = status;
470 	status_params.size = num_units * sizeof (minor_t);
471 	status_params.minors = (uintptr_t)minors;
472 	MD_SETDRIVERNAME(&status_params, MD_SP, sp->setno);
473 	if (metaioctl(MD_IOC_SPSTATUS, &status_params, &status_params.mde,
474 	    NULL) != 0) {
475 		(void) mdstealerror(ep, &status_params.mde);
476 		return (-1);
477 	}
478 	return (0);
479 }
480 
481 /*
482  * FUNCTION:	meta_get_sp_names()
483  * INPUT:	sp	- the set name to get soft partitions from
484  *		options	- options from the command line
485  * OUTPUT:	nlpp	- list of all soft partition names
486  *		ep	- return error pointer
487  * RETURNS:	int	- -1 if error, 0 success
488  * PURPOSE:	returns a list of all soft partitions in the metadb
489  *		for all devices in the specified set
490  */
491 int
492 meta_get_sp_names(
493 	mdsetname_t	*sp,
494 	mdnamelist_t	**nlpp,
495 	int		options,
496 	md_error_t	*ep
497 )
498 {
499 	return (meta_get_names(MD_SP, sp, nlpp, options, ep));
500 }
501 
502 /*
503  * FUNCTION:	meta_get_by_component()
504  * INPUT:	sp	- the set name to get soft partitions from
505  *		compnp	- the name of the device containing the soft
506  *			  partitions that will be returned
507  *		force	- 0 - reads cached namelist if available,
508  *			  1 - reloads cached namelist, frees old namelist
509  * OUTPUT:	nlpp	- list of all soft partition names
510  *		ep	- return error pointer
511  * RETURNS:	int	- -1 error, otherwise the number of soft partitions
512  *			  found on the component (0 = none found).
513  * PURPOSE:	returns a list of all soft partitions on a given device
514  *		from the metadb information
515  */
516 static int
517 meta_sp_get_by_component(
518 	mdsetname_t	*sp,
519 	mdname_t	*compnp,
520 	mdnamelist_t	**nlpp,
521 	int		force,
522 	md_error_t	*ep
523 )
524 {
525 	static mdnamelist_t	*cached_list = NULL;	/* cached namelist */
526 	static int		cached_count = 0;	/* cached count */
527 	mdnamelist_t		*spnlp = NULL;		/* all sp names */
528 	mdnamelist_t		*namep;			/* list iterator */
529 	mdnamelist_t		**tailpp = nlpp;	/* namelist tail */
530 	mdnamelist_t		**cachetailpp;		/* cache tail */
531 	md_sp_t			*msp;			/* unit structure */
532 	int			count = 0;		/* count of sp's */
533 	int			err;
534 	mdname_t		*curnp;
535 
536 	if ((cached_list != NULL) && (!force)) {
537 		/* return a copy of the cached list */
538 		for (namep = cached_list; namep != NULL; namep = namep->next)
539 			tailpp = meta_namelist_append_wrapper(tailpp,
540 			    namep->namep);
541 		return (cached_count);
542 	}
543 
544 	/* free the cache and reset values to zeros to prepare for a new list */
545 	metafreenamelist(cached_list);
546 	cached_count = 0;
547 	cached_list = NULL;
548 	cachetailpp = &cached_list;
549 	*nlpp = NULL;
550 
551 	/* get all the softpartitions first of all */
552 	if (meta_get_sp_names(sp, &spnlp, 0, ep) < 0)
553 		return (-1);
554 
555 	/*
556 	 * Now for each sp, see if it resides on the component we
557 	 * are interested in, if so then add it to our list
558 	 */
559 	for (namep = spnlp; namep != NULL; namep = namep->next) {
560 		curnp = namep->namep;
561 
562 		/* get the unit structure */
563 		if ((msp = meta_get_sp_common(sp, curnp, 0, ep)) == NULL)
564 			continue;
565 
566 		/*
567 		 * If the current soft partition is not on the same
568 		 * component, continue the search.  If it is on the same
569 		 * component, add it to our namelist.
570 		 */
571 		err = meta_check_samedrive(compnp, msp->compnamep, ep);
572 		if (err <= 0) {
573 			/* not on the same device, check the next one */
574 			continue;
575 		}
576 
577 		/* it's on the same drive */
578 
579 		/*
580 		 * Check for overlapping partitions if the component is not
581 		 * a metadevice.
582 		 */
583 		if (!metaismeta(msp->compnamep)) {
584 			/*
585 			 * if they're on the same drive, neither
586 			 * should be a metadevice if one isn't
587 			 */
588 			assert(!metaismeta(compnp));
589 
590 			if (meta_check_overlap(msp->compnamep->cname,
591 			    compnp, 0, -1, msp->compnamep, 0, -1, ep) == 0)
592 				continue;
593 
594 			/* in this case it's not an error for them to overlap */
595 			mdclrerror(ep);
596 		}
597 
598 		/* Component is on the same device, add to the used list */
599 		tailpp = meta_namelist_append_wrapper(tailpp, curnp);
600 		cachetailpp = meta_namelist_append_wrapper(cachetailpp,
601 		    curnp);
602 
603 		++count;
604 		++cached_count;
605 	}
606 
607 	assert(count == cached_count);
608 	return (count);
609 
610 out:
611 	metafreenamelist(*nlpp);
612 	*nlpp = NULL;
613 	return (-1);
614 }
615 
616 /*
617  * FUNCTION:    meta_sp_get_default_alignment()
618  * INPUT:       sp      - the pertinent set name
619  *              compnp  - the name of the underlying component
620  * OUTPUT:      ep      - return error pointer
621  * RETURNS:     sp_ext_length_t =0: no default alignment
622  *                              >0: default alignment
623  * PURPOSE:     returns the default alignment for soft partitions to
624  *              be built on top of the specified component or
625  *              metadevice
626  */
627 static sp_ext_length_t
628 meta_sp_get_default_alignment(
629 	mdsetname_t	*sp,
630 	mdname_t	*compnp,
631 	md_error_t	*ep
632 )
633 {
634 	sp_ext_length_t	a = SP_UNALIGNED;
635 	char		*mname;
636 
637 	assert(compnp != NULL);
638 
639 	/*
640 	 * We treat raw devices as opaque, and assume nothing about
641 	 * their alignment requirements.
642 	 */
643 	if (!metaismeta(compnp))
644 		return (SP_UNALIGNED);
645 
646 	/*
647 	 * We already know it's a metadevice from the previous test;
648 	 * metagetmiscname() will tell us which metadevice type we
649 	 * have
650 	 */
651 	mname = metagetmiscname(compnp, ep);
652 	if (mname == NULL)
653 		goto out;
654 
655 	/*
656 	 * For a mirror, we want to deal with the stripe that is the
657 	 * primary side.  If it happens to be asymmetrically
658 	 * configured, there is no simple way to fake a universal
659 	 * alignment.  There's a chance that the least common
660 	 * denominator of the set of interlaces from all stripes of
661 	 * all submirrors would do it, but nobody that really cared
662 	 * that much about this issue would create an asymmetric
663 	 * config to start with.
664 	 *
665 	 * If the component underlying the soft partition is a mirror,
666 	 * then at the exit of this loop, compnp will have been
667 	 * updated to describe the first active submirror.
668 	 */
669 	if (strcmp(mname, MD_MIRROR) == 0) {
670 		md_mirror_t	*mp;
671 		int		smi;
672 		md_submirror_t	*smp;
673 
674 		mp = meta_get_mirror(sp, compnp, ep);
675 		if (mp == NULL)
676 			goto out;
677 
678 		for (smi = 0; smi < NMIRROR; smi++) {
679 
680 			smp = &mp->submirrors[smi];
681 			if (smp->state == SMS_UNUSED)
682 				continue;
683 
684 			compnp = smp->submirnamep;
685 			assert(compnp != NULL);
686 
687 			mname = metagetmiscname(compnp, ep);
688 			if (mname == NULL)
689 				goto out;
690 
691 			break;
692 		}
693 
694 		if (smi == NMIRROR)
695 			goto out;
696 	}
697 
698 	/*
699 	 * Handle stripes and submirrors identically; just return the
700 	 * interlace of the first row.
701 	 */
702 	if (strcmp(mname, MD_STRIPE) == 0) {
703 		md_stripe_t	*stp;
704 
705 		stp = meta_get_stripe(sp, compnp, ep);
706 		if (stp == NULL)
707 			goto out;
708 
709 		a = stp->rows.rows_val[0].interlace;
710 		goto out;
711 	}
712 
713 	/*
714 	 * Raid is even more straightforward; the interlace applies to
715 	 * the entire device.
716 	 */
717 	if (strcmp(mname, MD_RAID) == 0) {
718 		md_raid_t	*rp;
719 
720 		rp = meta_get_raid(sp, compnp, ep);
721 		if (rp == NULL)
722 			goto out;
723 
724 		a = rp->interlace;
725 		goto out;
726 	}
727 
728 	/*
729 	 * If we have arrived here with the alignment still not set,
730 	 * then we expect the error to have been set by one of the
731 	 * routines we called.  If neither is the case, something has
732 	 * really gone wrong above.  (Probably the submirror walk
733 	 * failed to produce a valid submirror, but that would be
734 	 * really bad...)
735 	 */
736 out:
737 	meta_sp_debug("meta_sp_get_default_alignment: miscname %s, "
738 	    "alignment %lld\n", (mname == NULL) ? "NULL" : mname, a);
739 
740 	if (getenv(META_SP_DEBUG) && !mdisok(ep)) {
741 		mde_perror(ep, NULL);
742 	}
743 
744 	assert((a > 0) || (!mdisok(ep)));
745 
746 	return (a);
747 }
748 
749 
750 
751 /*
752  * FUNCTION:	meta_check_insp()
753  * INPUT:	sp	- the set name for the device to check
754  *		np	- the name of the device to check
755  *		slblk	- the starting offset of the device to check
756  *		nblks	- the number of blocks in the device to check
757  * OUTPUT:	ep	- return error pointer
758  * RETURNS:	int	-  0 - device contains soft partitions
759  *			  -1 - device does not contain soft partitions
760  * PURPOSE:	determines whether a device contains any soft partitions
761  */
762 /* ARGSUSED */
763 int
764 meta_check_insp(
765 	mdsetname_t	*sp,
766 	mdname_t	*np,
767 	diskaddr_t	slblk,
768 	diskaddr_t	nblks,
769 	md_error_t	*ep
770 )
771 {
772 	mdnamelist_t	*spnlp = NULL;	/* soft partition name list */
773 	int		count;
774 	int		rval;
775 
776 	/* check set pointer */
777 	assert(sp != NULL);
778 
779 	/*
780 	 * Get a list of the soft partitions that currently reside on
781 	 * the component.  We should ALWAYS force reload the cache,
782 	 * because if we're using the md.tab, we must rebuild
783 	 * the list because it won't contain the previous (if any)
784 	 * soft partition.
785 	 */
786 	/* find all soft partitions on the component */
787 	count = meta_sp_get_by_component(sp, np, &spnlp, 1, ep);
788 
789 	if (count == -1) {
790 		rval = -1;
791 	} else if (count > 0) {
792 		rval = mduseerror(ep, MDE_ALREADY, np->dev,
793 		    spnlp->namep->cname, np->cname);
794 	} else {
795 		rval = 0;
796 	}
797 
798 	metafreenamelist(spnlp);
799 	return (rval);
800 }
801 
802 /*
803  * **************************************************************************
804  *                    Extent List Manipulation Functions                    *
805  * **************************************************************************
806  */
807 
808 /*
809  * FUNCTION:	meta_sp_cmp_by_nameseq()
810  * INPUT:	e1	- first node to compare
811  *		e2	- second node to compare
812  * OUTPUT:	none
813  * RETURNS:	int	- =0 - nodes are equal
814  *			  <0 - e1 should go before e2
815  *			  >0 - e1 should go after e2
816  * PURPOSE:	used for sorted list inserts to build a list sorted by
817  *		name first and sequence number second.
818  */
819 static int
820 meta_sp_cmp_by_nameseq(sp_ext_node_t *e1, sp_ext_node_t *e2)
821 {
822 	int rval;
823 
824 	if (e1->ext_namep == NULL)
825 		return (1);
826 	if (e2->ext_namep == NULL)
827 		return (-1);
828 	if ((rval = strcmp(e1->ext_namep->cname, e2->ext_namep->cname)) != 0)
829 		return (rval);
830 
831 	/* the names are equal, compare sequence numbers */
832 	if (e1->ext_seq > e2->ext_seq)
833 		return (1);
834 	if (e1->ext_seq < e2->ext_seq)
835 		return (-1);
836 	/* sequence numbers are also equal */
837 	return (0);
838 }
839 
840 /*
841  * FUNCTION:	meta_sp_cmp_by_offset()
842  * INPUT:	e1	- first node to compare
843  *		e2	- second node to compare
844  * OUTPUT:	none
845  * RETURNS:	int	- =0 - nodes are equal
846  *			  <0 - e1 should go before e2
847  *			  >0 - e1 should go after e2
848  * PURPOSE:	used for sorted list inserts to build a list sorted by offset
849  */
850 static int
851 meta_sp_cmp_by_offset(sp_ext_node_t *e1, sp_ext_node_t *e2)
852 {
853 	if (e1->ext_offset > e2->ext_offset)
854 		return (1);
855 	if (e1->ext_offset < e2->ext_offset)
856 		return (-1);
857 	/* offsets are equal */
858 	return (0);
859 }
860 
861 /*
862  * FUNCTION:	meta_sp_list_insert()
863  * INPUT:	sp	- the set name for the device the node belongs to
864  *		np	- the name of the device the node belongs to
865  *		head	- the head of the list, must be NULL for empty list
866  *		offset	- the physical offset of this extent in sectors
867  *		length	- the length of this extent in sectors
868  *		type	- the type of the extent being inserted
869  *		seq	- the sequence number of the extent being inserted
870  *		flags	- extent flags (eg. whether it needs to be updated)
871  *		compare	- the compare function to use
872  * OUTPUT:	head	- points to the new head if a node was inserted
873  *			  at the beginning
874  * RETURNS:	void
875  * PURPOSE:	inserts an extent node into a sorted doubly linked list.
876  *		The sort order is determined by the compare function.
877  *		Memory is allocated for the node in this function and it
878  *		is up to the caller to free it, possibly using
879  *		meta_sp_list_free().  If a node is inserted at the
880  *		beginning of the list, the head pointer is updated to
881  *		point to the new first node.
882  */
883 static void
884 meta_sp_list_insert(
885 	mdsetname_t	*sp,
886 	mdname_t	*np,
887 	sp_ext_node_t	**head,
888 	sp_ext_offset_t	offset,
889 	sp_ext_length_t	length,
890 	sp_ext_type_t	type,
891 	uint_t		seq,
892 	uint_t		flags,
893 	ext_cmpfunc_t	compare
894 )
895 {
896 	sp_ext_node_t	*newext;
897 	sp_ext_node_t	*curext;
898 
899 	assert(head != NULL);
900 
901 	/* Don't bother adding zero length nodes */
902 	if (length == 0ULL)
903 		return;
904 
905 	/* allocate and fill in new ext_node */
906 	newext = Zalloc(sizeof (sp_ext_node_t));
907 
908 	newext->ext_offset = offset;
909 	newext->ext_length = length;
910 	newext->ext_flags = flags;
911 	newext->ext_type = type;
912 	newext->ext_seq = seq;
913 	newext->ext_setp = sp;
914 	newext->ext_namep = np;
915 
916 	/* first node in the list */
917 	if (*head == NULL) {
918 		newext->ext_next = newext->ext_prev = NULL;
919 		*head = newext;
920 	} else if ((*compare)(*head, newext) >= 0) {
921 		/* the first node has a bigger offset, so insert before it */
922 		assert((*head)->ext_prev == NULL);
923 
924 		newext->ext_prev = NULL;
925 		newext->ext_next = *head;
926 		(*head)->ext_prev = newext;
927 		*head = newext;
928 	} else {
929 		/*
930 		 * find the next node whose offset is greater than
931 		 * the one we want to insert, or the end of the list.
932 		 */
933 		for (curext = *head;
934 		    (curext->ext_next != NULL) &&
935 		    ((*compare)(curext->ext_next, newext) < 0);
936 		    (curext = curext->ext_next))
937 			;
938 
939 		/* link the new node in after the current node */
940 		newext->ext_next = curext->ext_next;
941 		newext->ext_prev = curext;
942 
943 		if (curext->ext_next != NULL)
944 			curext->ext_next->ext_prev = newext;
945 
946 		curext->ext_next = newext;
947 	}
948 }
949 
950 /*
951  * FUNCTION:	meta_sp_list_free()
952  * INPUT:	head	- the head of the list, must be NULL for empty list
953  * OUTPUT:	head	- points to NULL on return
954  * RETURNS:	void
955  * PURPOSE:	walks a double linked extent list and frees each node
956  */
957 static void
958 meta_sp_list_free(sp_ext_node_t **head)
959 {
960 	sp_ext_node_t	*ext;
961 	sp_ext_node_t	*next;
962 
963 	assert(head != NULL);
964 
965 	ext = *head;
966 	while (ext) {
967 		next = ext->ext_next;
968 		Free(ext);
969 		ext = next;
970 	}
971 	*head = NULL;
972 }
973 
974 /*
975  * FUNCTION:	meta_sp_list_remove()
976  * INPUT:	head	- the head of the list, must be NULL for empty list
977  *		ext	- the extent to remove, must be a member of the list
978  * OUTPUT:	head	- points to the new head of the list
979  * RETURNS:	void
980  * PURPOSE:	unlinks the node specified by ext from the list and
981  *		frees it, possibly moving the head pointer forward if
982  *		the head is the node being removed.
983  */
984 static void
985 meta_sp_list_remove(sp_ext_node_t **head, sp_ext_node_t *ext)
986 {
987 	assert(head != NULL);
988 	assert(*head != NULL);
989 
990 	if (*head == ext)
991 		*head = ext->ext_next;
992 
993 	if (ext->ext_prev != NULL)
994 		ext->ext_prev->ext_next = ext->ext_next;
995 	if (ext->ext_next != NULL)
996 		ext->ext_next->ext_prev = ext->ext_prev;
997 	Free(ext);
998 }
999 
1000 /*
1001  * FUNCTION:	meta_sp_list_size()
1002  * INPUT:	head	- the head of the list, must be NULL for empty list
1003  *		exttype	- the type of the extents to sum
1004  *		exclude_wm - subtract space for extent headers from total
1005  * OUTPUT:	none
1006  * RETURNS:	sp_ext_length_t	- the sum of all of the lengths
1007  * PURPOSE:	sums the lengths of all extents in the list matching the
1008  *		specified type.  This could be used for computing the
1009  *		amount of free or used space, for example.
1010  */
1011 static sp_ext_length_t
1012 meta_sp_list_size(sp_ext_node_t *head, sp_ext_type_t exttype, int exclude_wm)
1013 {
1014 	sp_ext_node_t	*ext;
1015 	sp_ext_length_t	size = 0LL;
1016 
1017 	for (ext = head; ext != NULL; ext = ext->ext_next)
1018 		if (ext->ext_type == exttype)
1019 			size += ext->ext_length -
1020 			    ((exclude_wm) ? MD_SP_WMSIZE : 0);
1021 
1022 	return (size);
1023 }
1024 
1025 /*
1026  * FUNCTION:	meta_sp_list_find()
1027  * INPUT:	head	- the head of the list, must be NULL for empty list
1028  *		offset	- the offset contained by the node to find
1029  * OUTPUT:	none
1030  * RETURNS:	sp_ext_node_t *	- the node containing the requested offset
1031  *				  or NULL if no such nodes were found.
1032  * PURPOSE:	finds a node in a list containing the requested offset
1033  *		(inclusive).  If multiple nodes contain this offset then
1034  *		only the first will be returned, though typically these
1035  *		lists are managed with non-overlapping nodes.
1036  *
1037  *		*The list MUST be sorted by offset for this function to work.*
1038  */
1039 static sp_ext_node_t *
1040 meta_sp_list_find(
1041 	sp_ext_node_t	*head,
1042 	sp_ext_offset_t	offset
1043 )
1044 {
1045 	sp_ext_node_t	*ext;
1046 
1047 	for (ext = head; ext != NULL; ext = ext->ext_next) {
1048 		/* check if the offset lies within this extent */
1049 		if ((offset >= ext->ext_offset) &&
1050 		    (offset < ext->ext_offset + ext->ext_length)) {
1051 			/*
1052 			 * the requested extent should always be a
1053 			 * subset of an extent in the list.
1054 			 */
1055 			return (ext);
1056 		}
1057 	}
1058 	return (NULL);
1059 }
1060 
1061 /*
1062  * FUNCTION:	meta_sp_list_freefill()
1063  * INPUT:	head	- the head of the list, must be NULL for empty list
1064  *		size	- the size of the volume this extent list is
1065  *			  representing
1066  * OUTPUT:	head	- the new head of the list
1067  * RETURNS:	void
1068  * PURPOSE:	finds gaps in the extent list and fills them with a free
1069  *		node.  If there is a gap at the beginning the head
1070  *		pointer will be changed to point to the new free node.
1071  *		If there is free space at the end, the last free extent
1072  *		will extend all the way out to the size specified.
1073  *
1074  *		*The list MUST be sorted by offset for this function to work.*
1075  */
1076 static void
1077 meta_sp_list_freefill(
1078 	sp_ext_node_t	**head,
1079 	sp_ext_length_t	size
1080 )
1081 {
1082 	sp_ext_node_t	*ext;
1083 	sp_ext_offset_t	curoff = 0LL;
1084 
1085 	for (ext = *head; ext != NULL; ext = ext->ext_next) {
1086 		if (curoff < ext->ext_offset)
1087 			meta_sp_list_insert(NULL, NULL, head,
1088 			    curoff, ext->ext_offset - curoff,
1089 			    EXTTYP_FREE, 0, 0, meta_sp_cmp_by_offset);
1090 		curoff = ext->ext_offset + ext->ext_length;
1091 	}
1092 
1093 	/* pad inverse list out to the end */
1094 	if (curoff < size)
1095 		meta_sp_list_insert(NULL, NULL, head, curoff, size - curoff,
1096 		    EXTTYP_FREE, 0, 0, meta_sp_cmp_by_offset);
1097 
1098 	if (getenv(META_SP_DEBUG)) {
1099 		meta_sp_debug("meta_sp_list_freefill: Extent list with "
1100 		    "holes freefilled:\n");
1101 		meta_sp_list_dump(*head);
1102 	}
1103 }
1104 
1105 /*
1106  * FUNCTION:	meta_sp_list_dump()
1107  * INPUT:	head	- the head of the list, must be NULL for empty list
1108  * OUTPUT:	none
1109  * RETURNS:	void
1110  * PURPOSE:	dumps the entire extent list to stdout for easy debugging
1111  */
1112 static void
1113 meta_sp_list_dump(sp_ext_node_t *head)
1114 {
1115 	sp_ext_node_t	*ext;
1116 
1117 	meta_sp_debug("meta_sp_list_dump: dumping extent list:\n");
1118 	meta_sp_debug("%5s %10s %5s %7s %10s %10s %5s %10s %10s\n", "Name",
1119 	    "Addr", "Seq#", "Type", "Offset", "Length", "Flags", "Prev",
1120 	    "Next");
1121 	for (ext = head; ext != NULL; ext = ext->ext_next) {
1122 		if (ext->ext_namep != NULL)
1123 			meta_sp_debug("%5s", ext->ext_namep->cname);
1124 		else
1125 			meta_sp_debug("%5s", "NONE");
1126 
1127 		meta_sp_debug("%10p %5u ", (void *) ext, ext->ext_seq);
1128 		switch (ext->ext_type) {
1129 		case EXTTYP_ALLOC:
1130 			meta_sp_debug("%7s ", "ALLOC");
1131 			break;
1132 		case EXTTYP_FREE:
1133 			meta_sp_debug("%7s ", "FREE");
1134 			break;
1135 		case EXTTYP_END:
1136 			meta_sp_debug("%7s ", "END");
1137 			break;
1138 		case EXTTYP_RESERVED:
1139 			meta_sp_debug("%7s ", "RESV");
1140 			break;
1141 		default:
1142 			meta_sp_debug("%7s ", "INVLD");
1143 			break;
1144 		}
1145 
1146 		meta_sp_debug("%10llu %10llu %5u %10p %10p\n",
1147 		    ext->ext_offset, ext->ext_length,
1148 		    ext->ext_flags, (void *) ext->ext_prev,
1149 		    (void *) ext->ext_next);
1150 	}
1151 	meta_sp_debug("\n");
1152 }
1153 
1154 /*
1155  * FUNCTION:	meta_sp_list_overlaps()
1156  * INPUT:	head	- the head of the list, must be NULL for empty list
1157  * OUTPUT:	none
1158  * RETURNS:	int	- 1 if extents overlap, 0 if ok
1159  * PURPOSE:	checks a list for overlaps.  The list MUST be sorted by
1160  *		offset for this function to work properly.
1161  */
1162 static int
1163 meta_sp_list_overlaps(sp_ext_node_t *head)
1164 {
1165 	sp_ext_node_t	*ext;
1166 
1167 	for (ext = head; ext->ext_next != NULL; ext = ext->ext_next) {
1168 		if (ext->ext_offset + ext->ext_length >
1169 		    ext->ext_next->ext_offset)
1170 			return (1);
1171 	}
1172 	return (0);
1173 }
1174 
1175 /*
1176  * **************************************************************************
1177  *                        Extent Allocation Functions                       *
1178  * **************************************************************************
1179  */
1180 
1181 /*
1182  * FUNCTION:	meta_sp_alloc_by_ext()
1183  * INPUT:	sp	- the set name for the device the node belongs to
1184  *		np	- the name of the device the node belongs to
1185  *		head	- the head of the list, must be NULL for empty list
1186  *		free_ext	- the free extent being allocated from
1187  *		alloc_offset	- the offset of the allocation
1188  *		alloc_len	- the length of the allocation
1189  *		seq		- the sequence number of the allocation
1190  * OUTPUT:	head	- the new head pointer
1191  * RETURNS:	void
1192  * PURPOSE:	allocates a portion of the free extent free_ext.  The
1193  *		allocated portion starts at alloc_offset and is
1194  *		alloc_length long.  Both (alloc_offset) and (alloc_offset +
1195  *		alloc_length) must be contained within the free extent.
1196  *
1197  *		The free extent is split into as many as 3 pieces - a
1198  *		free extent containing [ free_offset .. alloc_offset ), an
1199  *		allocated extent containing the range [ alloc_offset ..
1200  *		alloc_end ], and another free extent containing the
1201  *		range ( alloc_end .. free_end ].  If either of the two
1202  *		new free extents would be zero length, they are not created.
1203  *
1204  *		Finally, the original free extent is removed.  All newly
1205  *		created extents have the EXTFLG_UPDATE flag set.
1206  */
1207 static void
1208 meta_sp_alloc_by_ext(
1209 	mdsetname_t	*sp,
1210 	mdname_t	*np,
1211 	sp_ext_node_t	**head,
1212 	sp_ext_node_t	*free_ext,
1213 	sp_ext_offset_t	alloc_offset,
1214 	sp_ext_length_t	alloc_length,
1215 	uint_t		seq
1216 )
1217 {
1218 	sp_ext_offset_t	free_offset = free_ext->ext_offset;
1219 	sp_ext_length_t	free_length = free_ext->ext_length;
1220 
1221 	sp_ext_offset_t	alloc_end = alloc_offset + alloc_length;
1222 	sp_ext_offset_t	free_end  = free_offset  + free_length;
1223 
1224 	/* allocated extent must be a subset of the free extent */
1225 	assert(free_offset <= alloc_offset);
1226 	assert(free_end >= alloc_end);
1227 
1228 	meta_sp_list_remove(head, free_ext);
1229 
1230 	if (free_offset < alloc_offset) {
1231 		meta_sp_list_insert(NULL, NULL, head, free_offset,
1232 		    (alloc_offset - free_offset), EXTTYP_FREE, 0,
1233 		    EXTFLG_UPDATE, meta_sp_cmp_by_offset);
1234 	}
1235 
1236 	if (free_end > alloc_end) {
1237 		meta_sp_list_insert(NULL, NULL, head, alloc_end,
1238 		    (free_end - alloc_end), EXTTYP_FREE, 0, EXTFLG_UPDATE,
1239 		    meta_sp_cmp_by_offset);
1240 	}
1241 
1242 	meta_sp_list_insert(sp, np, head, alloc_offset, alloc_length,
1243 	    EXTTYP_ALLOC, seq, EXTFLG_UPDATE, meta_sp_cmp_by_offset);
1244 
1245 	if (getenv(META_SP_DEBUG)) {
1246 		meta_sp_debug("meta_sp_alloc_by_ext: extent list:\n");
1247 		meta_sp_list_dump(*head);
1248 	}
1249 }
1250 
1251 /*
1252  * FUNCTION:	meta_sp_alloc_by_len()
1253  * INPUT:	sp	- the set name for the device the node belongs to
1254  *		np	- the name of the device the node belongs to
1255  *		head	- the head of the list, must be NULL for empty list
1256  *		*lp	- the requested length to allocate
1257  *		last_off	- the last offset already allocated.
1258  *		alignment	- the desired extent alignmeent
1259  * OUTPUT:	head	- the new head pointer
1260  *		*lp	- the length allocated
1261  * RETURNS:	int	- -1 if error, the number of new extents on success
1262  * PURPOSE:	allocates extents from free space to satisfy the requested
1263  *		length.  If requested length is zero, allocates all
1264  *		remaining free space.  This function provides the meat
1265  *		of the extent allocation algorithm.  Allocation is a
1266  *		three tier process:
1267  *
1268  *		1. If last_off is nonzero and there is free space following
1269  *		   that node, then it is extended to allocate as much of that
1270  *		   free space as possible.  This is useful for metattach.
1271  *		2. If a free extent can be found to satisfy the remaining
1272  *		   requested space, then satisfy the rest of the request
1273  *		   from that extent.
1274  *		3. Start allocating space from any remaining free extents until
1275  *		   the remainder of the request is satisified.
1276  *
1277  *              If alignment is non-zero, then every extent modified
1278  *              or newly allocated will be aligned modulo alignment,
1279  *              with a length that is an integer multiple of
1280  *              alignment.
1281  *
1282  *		The EXTFLG_UPDATE flag is set for all nodes (free and
1283  *		allocated) that require updated watermarks.
1284  *
1285  *		This algorithm may have a negative impact on fragmentation
1286  *		in pathological cases and may be improved if it turns out
1287  *		to be a problem.  This may be exacerbated by particularly
1288  *		large alignments.
1289  *
1290  * NOTE:	It's confusing, so it demands an explanation:
1291  *		- len is used to represent requested data space; it
1292  *		  does not include room for a watermark.  On each full
1293  *		  or partial allocation, len will be decremented by
1294  *		  alloc_len (see next paragraph) until it reaches
1295  *		  zero.
1296  *		- alloc_len is used to represent data space allocated
1297  *		  from a particular extent; it does not include space
1298  *		  for a watermark.  In the rare event that a_length
1299  *		  (see next paragraph) is equal to MD_SP_WMSIZE,
1300  *		  alloc_len will be zero and the resulting MD_SP_WMSIZE
1301  *		  fragment of space will be utterly unusable.
1302  *		- a_length is used to represent all space to be
1303  *		  allocated from a particular extent; it DOES include
1304  *		  space for a watermark.
1305  */
1306 static int
1307 meta_sp_alloc_by_len(
1308 	mdsetname_t	*sp,
1309 	mdname_t	*np,
1310 	sp_ext_node_t	**head,
1311 	sp_ext_length_t	*lp,
1312 	sp_ext_offset_t	last_off,
1313 	sp_ext_offset_t	alignment
1314 )
1315 {
1316 	sp_ext_node_t	*free_ext;
1317 	sp_ext_node_t	*alloc_ext;
1318 	uint_t		last_seq = 0;
1319 	uint_t		numexts = 0;
1320 	sp_ext_length_t	freespace;
1321 	sp_ext_length_t	alloc_len;
1322 	sp_ext_length_t	len;
1323 
1324 	/* We're DOA if we can't read *lp */
1325 	assert(lp != NULL);
1326 	len = *lp;
1327 
1328 	/*
1329 	 * Process the nominal case first: we've been given an actual
1330 	 * size argument, rather than the literal "all"
1331 	 */
1332 
1333 	if (len != 0) {
1334 
1335 		/*
1336 		 * Short circuit the check for free space.  This may
1337 		 * tell us we have enough space when we really don't
1338 		 * because each extent loses space to a watermark, but
1339 		 * it will always tell us there isn't enough space
1340 		 * correctly.  Worst case we do some extra work.
1341 		 */
1342 		freespace = meta_sp_list_size(*head, EXTTYP_FREE,
1343 		    INCLUDE_WM);
1344 
1345 		if (freespace < len)
1346 			return (-1);
1347 
1348 		/*
1349 		 * First see if we can extend the last extent for an
1350 		 * attach.
1351 		 */
1352 		if (last_off != 0LL) {
1353 			int align = 0;
1354 
1355 			alloc_ext =
1356 			    meta_sp_list_find(*head, last_off);
1357 			assert(alloc_ext != NULL);
1358 
1359 			/*
1360 			 * The offset test reflects the
1361 			 * inclusion of the watermark in the extent
1362 			 */
1363 			align = (alignment > 0) &&
1364 			    (((alloc_ext->ext_offset + MD_SP_WMSIZE) %
1365 				alignment) == 0);
1366 
1367 			/*
1368 			 * If we decided not to align here, we should
1369 			 * also reset "alignment" so we don't bother
1370 			 * later, either.
1371 			 */
1372 			if (!align) {
1373 				alignment = 0;
1374 			}
1375 
1376 			last_seq = alloc_ext->ext_seq;
1377 
1378 			free_ext = meta_sp_list_find(*head,
1379 			    alloc_ext->ext_offset +
1380 			    alloc_ext->ext_length);
1381 
1382 			/*
1383 			 * If a free extent follows our last allocated
1384 			 * extent, then remove the last allocated
1385 			 * extent and increase the size of the free
1386 			 * extent to overlap it, then allocate the
1387 			 * total space from the new free extent.
1388 			 */
1389 			if (free_ext != NULL &&
1390 			    free_ext->ext_type == EXTTYP_FREE) {
1391 				assert(free_ext->ext_offset ==
1392 				    alloc_ext->ext_offset +
1393 				    alloc_ext->ext_length);
1394 
1395 				alloc_len =
1396 				    MIN(len, free_ext->ext_length);
1397 
1398 				if (align && (alloc_len < len)) {
1399 					/* No watermark space needed */
1400 					alloc_len -= alloc_len % alignment;
1401 				}
1402 
1403 				if (alloc_len > 0) {
1404 					free_ext->ext_offset -=
1405 					    alloc_ext->ext_length;
1406 					free_ext->ext_length +=
1407 					    alloc_ext->ext_length;
1408 
1409 					meta_sp_alloc_by_ext(sp, np, head,
1410 					    free_ext, free_ext->ext_offset,
1411 					    alloc_ext->ext_length + alloc_len,
1412 					    last_seq);
1413 
1414 					/*
1415 					 * now remove the original allocated
1416 					 * node.  We may have overlapping
1417 					 * extents for a short time before
1418 					 * this node is removed.
1419 					 */
1420 					meta_sp_list_remove(head, alloc_ext);
1421 					len -= alloc_len;
1422 				}
1423 			}
1424 			last_seq++;
1425 		}
1426 
1427 		if (len == 0LL)
1428 			goto out;
1429 
1430 		/*
1431 		 * Next, see if we can find a single allocation for
1432 		 * the remainder.  This may make fragmentation worse
1433 		 * in some cases, but there's no good way to allocate
1434 		 * that doesn't have a highly fragmented corner case.
1435 		 */
1436 		for (free_ext = *head; free_ext != NULL;
1437 			free_ext = free_ext->ext_next) {
1438 			sp_ext_offset_t	a_offset;
1439 			sp_ext_offset_t	a_length;
1440 
1441 			if (free_ext->ext_type != EXTTYP_FREE)
1442 				continue;
1443 
1444 			/*
1445 			 * The length test should include space for
1446 			 * the watermark
1447 			 */
1448 
1449 			a_offset = free_ext->ext_offset;
1450 			a_length = free_ext->ext_length;
1451 
1452 			if (alignment > 0) {
1453 
1454 				/*
1455 				 * Shortcut for extents that have been
1456 				 * previously added to pad out the
1457 				 * data space
1458 				 */
1459 				if (a_length < alignment) {
1460 					continue;
1461 				}
1462 
1463 				/*
1464 				 * Round up so the data space begins
1465 				 * on a properly aligned boundary.
1466 				 */
1467 				a_offset += alignment -
1468 				    (a_offset % alignment) - MD_SP_WMSIZE;
1469 
1470 				/*
1471 				 * This is only necessary in case the
1472 				 * watermark size is ever greater than
1473 				 * one.  It'll never happen, of
1474 				 * course; we'll get rid of watermarks
1475 				 * before we make 'em bigger.
1476 				 */
1477 				if (a_offset < free_ext->ext_offset) {
1478 					a_offset += alignment;
1479 				}
1480 
1481 				/*
1482 				 * Adjust the length to account for
1483 				 * the space lost above (if any)
1484 				 */
1485 				a_length -=
1486 					(a_offset - free_ext->ext_offset);
1487 			}
1488 
1489 			if (a_length >= len + MD_SP_WMSIZE) {
1490 				meta_sp_alloc_by_ext(sp, np, head,
1491 					free_ext, a_offset,
1492 					len + MD_SP_WMSIZE, last_seq);
1493 
1494 				len = 0LL;
1495 				numexts++;
1496 				break;
1497 			}
1498 		}
1499 
1500 		if (len == 0LL)
1501 			goto out;
1502 
1503 
1504 		/*
1505 		 * If the request could not be satisfied by extending
1506 		 * the last extent or by a single extent, then put
1507 		 * multiple smaller extents together until the request
1508 		 * is satisfied.
1509 		 */
1510 		for (free_ext = *head; (free_ext != NULL) && (len > 0);
1511 			free_ext = free_ext->ext_next) {
1512 			sp_ext_offset_t a_offset;
1513 			sp_ext_length_t a_length;
1514 
1515 			if (free_ext->ext_type != EXTTYP_FREE)
1516 				continue;
1517 
1518 			a_offset = free_ext->ext_offset;
1519 			a_length = free_ext->ext_length;
1520 
1521 			if (alignment > 0) {
1522 
1523 				/*
1524 				 * Shortcut for extents that have been
1525 				 * previously added to pad out the
1526 				 * data space
1527 				 */
1528 				if (a_length < alignment) {
1529 					continue;
1530 				}
1531 
1532 				/*
1533 				 * Round up so the data space begins
1534 				 * on a properly aligned boundary.
1535 				 */
1536 				a_offset += alignment -
1537 					(a_offset % alignment) - MD_SP_WMSIZE;
1538 
1539 				/*
1540 				 * This is only necessary in case the
1541 				 * watermark size is ever greater than
1542 				 * one.  It'll never happen, of
1543 				 * course; we'll get rid of watermarks
1544 				 * before we make 'em bigger.
1545 				 */
1546 				if (a_offset < free_ext->ext_offset) {
1547 					a_offset += alignment;
1548 				}
1549 
1550 				/*
1551 				 * Adjust the length to account for
1552 				 * the space lost above (if any)
1553 				 */
1554 				a_length -=
1555 					(a_offset - free_ext->ext_offset);
1556 
1557 				/*
1558 				 * Adjust the length to be properly
1559 				 * aligned if it is NOT to be the
1560 				 * last extent in the soft partition.
1561 				 */
1562 				if ((a_length - MD_SP_WMSIZE) < len)
1563 					a_length -=
1564 						(a_length - MD_SP_WMSIZE)
1565 						% alignment;
1566 			}
1567 
1568 			alloc_len = MIN(len, a_length - MD_SP_WMSIZE);
1569 			if (alloc_len == 0)
1570 				continue;
1571 
1572 			/*
1573 			 * meta_sp_alloc_by_ext() expects the
1574 			 * allocation length to include the watermark
1575 			 * size, which is why we don't simply pass in
1576 			 * alloc_len here.
1577 			 */
1578 			meta_sp_alloc_by_ext(sp, np, head, free_ext,
1579 				a_offset, MIN(len + MD_SP_WMSIZE, a_length),
1580 				last_seq);
1581 
1582 			len -= alloc_len;
1583 			numexts++;
1584 			last_seq++;
1585 		}
1586 
1587 
1588 		/*
1589 		 * If there was not enough space we can throw it all
1590 		 * away since no real work has been done yet.
1591 		 */
1592 		if (len != 0) {
1593 			meta_sp_list_free(head);
1594 			return (-1);
1595 		}
1596 	}
1597 
1598 	/*
1599 	 * Otherwise, the literal "all" was specified: allocate all
1600 	 * available free space.  Don't bother with alignment.
1601 	 */
1602 	else {
1603 		/* First, extend the last extent if this is a grow */
1604 		if (last_off != 0LL) {
1605 			alloc_ext =
1606 				meta_sp_list_find(*head, last_off);
1607 			assert(alloc_ext != NULL);
1608 
1609 			last_seq = alloc_ext->ext_seq;
1610 
1611 			free_ext = meta_sp_list_find(*head,
1612 				alloc_ext->ext_offset +
1613 				alloc_ext->ext_length);
1614 
1615 			/*
1616 			 * If a free extent follows our last allocated
1617 			 * extent, then remove the last allocated
1618 			 * extent and increase the size of the free
1619 			 * extent to overlap it, then allocate the
1620 			 * total space from the new free extent.
1621 			 */
1622 			if (free_ext != NULL &&
1623 			    free_ext->ext_type == EXTTYP_FREE) {
1624 				assert(free_ext->ext_offset ==
1625 				    alloc_ext->ext_offset +
1626 				    alloc_ext->ext_length);
1627 
1628 				len = alloc_len =
1629 				    free_ext->ext_length;
1630 
1631 				free_ext->ext_offset -=
1632 				    alloc_ext->ext_length;
1633 				free_ext->ext_length +=
1634 				    alloc_ext->ext_length;
1635 
1636 				meta_sp_alloc_by_ext(sp, np, head,
1637 				    free_ext, free_ext->ext_offset,
1638 				    alloc_ext->ext_length + alloc_len,
1639 				    last_seq);
1640 
1641 				/*
1642 				 * now remove the original allocated
1643 				 * node.  We may have overlapping
1644 				 * extents for a short time before
1645 				 * this node is removed.
1646 				 */
1647 				meta_sp_list_remove(head, alloc_ext);
1648 			}
1649 
1650 			last_seq++;
1651 		}
1652 
1653 		/* Next, grab all remaining free space */
1654 		for (free_ext = *head; free_ext != NULL;
1655 			free_ext = free_ext->ext_next) {
1656 
1657 			if (free_ext->ext_type == EXTTYP_FREE) {
1658 				alloc_len =
1659 				    free_ext->ext_length - MD_SP_WMSIZE;
1660 				if (alloc_len == 0)
1661 					continue;
1662 
1663 				/*
1664 				 * meta_sp_alloc_by_ext() expects the
1665 				 * allocation length to include the
1666 				 * watermark size, which is why we
1667 				 * don't simply pass in alloc_len
1668 				 * here.
1669 				 */
1670 				meta_sp_alloc_by_ext(sp, np, head,
1671 				    free_ext, free_ext->ext_offset,
1672 				    free_ext->ext_length,
1673 				    last_seq);
1674 
1675 				len += alloc_len;
1676 				numexts++;
1677 				last_seq++;
1678 			}
1679 		}
1680 	}
1681 
1682 out:
1683 	if (getenv(META_SP_DEBUG)) {
1684 		meta_sp_debug("meta_sp_alloc_by_len: Extent list after "
1685 		    "allocation:\n");
1686 		meta_sp_list_dump(*head);
1687 	}
1688 
1689 	if (*lp == 0) {
1690 		*lp = len;
1691 
1692 		/*
1693 		 * Make sure the callers hit a no space error if we
1694 		 * didn't actually find anything.
1695 		 */
1696 		if (len == 0) {
1697 			return (-1);
1698 		}
1699 	}
1700 
1701 	return (numexts);
1702 }
1703 
1704 /*
1705  * FUNCTION:	meta_sp_alloc_by_list()
1706  * INPUT:	sp	- the set name for the device the node belongs to
1707  *		np	- the name of the device the node belongs to
1708  *		head	- the head of the list, must be NULL for empty list
1709  *		oblist	- an extent list containing requested nodes to allocate
1710  * OUTPUT:	head	- the new head pointer
1711  * RETURNS:	int	- -1 if error, the number of new extents on success
1712  * PURPOSE:	allocates extents from free space to satisfy the requested
1713  *		extent list.  This is primarily used for the -o/-b options
1714  *		where the user may specifically request extents to allocate.
1715  *		Each extent in the oblist must be a subset (inclusive) of a
1716  *		free extent and may not overlap each other.  This
1717  *		function sets the EXTFLG_UPDATE flag for each node that
1718  *		requires a watermark update after allocating.
1719  */
1720 static int
1721 meta_sp_alloc_by_list(
1722 	mdsetname_t	*sp,
1723 	mdname_t	*np,
1724 	sp_ext_node_t	**head,
1725 	sp_ext_node_t	*oblist
1726 )
1727 {
1728 	sp_ext_node_t	*ext;
1729 	sp_ext_node_t	*free_ext;
1730 	uint_t		numexts = 0;
1731 
1732 	for (ext = oblist; ext != NULL; ext = ext->ext_next) {
1733 
1734 		free_ext = meta_sp_list_find(*head,
1735 		    ext->ext_offset - MD_SP_WMSIZE);
1736 
1737 		/* Make sure the allocation is within the free extent */
1738 		if ((free_ext == NULL) ||
1739 		    (ext->ext_offset + ext->ext_length >
1740 		    free_ext->ext_offset + free_ext->ext_length) ||
1741 		    (free_ext->ext_type != EXTTYP_FREE))
1742 			return (-1);
1743 
1744 		meta_sp_alloc_by_ext(sp, np, head, free_ext,
1745 		    ext->ext_offset - MD_SP_WMSIZE,
1746 		    ext->ext_length + MD_SP_WMSIZE, ext->ext_seq);
1747 
1748 		numexts++;
1749 	}
1750 
1751 	assert(meta_sp_list_overlaps(*head) == 0);
1752 
1753 	if (getenv(META_SP_DEBUG)) {
1754 		meta_sp_debug("meta_sp_alloc_by_list: Extent list after "
1755 		    "allocation:\n");
1756 		meta_sp_list_dump(*head);
1757 	}
1758 
1759 	return (numexts);
1760 }
1761 
1762 /*
1763  * **************************************************************************
1764  *                     Extent List Population Functions                     *
1765  * **************************************************************************
1766  */
1767 
1768 /*
1769  * FUNCTION:	meta_sp_extlist_from_namelist()
1770  * INPUT:	sp	- the set name for the device the node belongs to
1771  *		spnplp	- the namelist of soft partitions to build a list from
1772  * OUTPUT:	extlist	- the extent list built from the SPs in the namelist
1773  *		ep	- return error pointer
1774  * RETURNS:	int	- -1 if error, 0 on success
1775  * PURPOSE:	builds an extent list representing the soft partitions
1776  *		specified in the namelist.  Each extent in each soft
1777  *		partition is added to the list with the type EXTTYP_ALLOC.
1778  *		The EXTFLG_UPDATE flag is not set on any nodes.  Each
1779  *		extent in the list includes the space occupied by the
1780  *		watermark, which is not included in the unit structures.
1781  */
1782 static int
1783 meta_sp_extlist_from_namelist(
1784 	mdsetname_t	*sp,
1785 	mdnamelist_t	*spnlp,
1786 	sp_ext_node_t	**extlist,
1787 	md_error_t	*ep
1788 )
1789 {
1790 	int		extn;
1791 	md_sp_t		*msp;		/* unit structure of the sp's */
1792 	mdnamelist_t	*namep;
1793 
1794 	assert(sp != NULL);
1795 
1796 	/*
1797 	 * Now go through the soft partitions and add a node to the used
1798 	 * list for each allocated extent.
1799 	 */
1800 	for (namep = spnlp; namep != NULL; namep = namep->next) {
1801 		mdname_t	*curnp = namep->namep;
1802 
1803 		/* get the unit structure */
1804 		if ((msp = meta_get_sp_common(sp, curnp, 0, ep)) == NULL)
1805 			return (-1);
1806 
1807 		for (extn = 0; (extn < msp->ext.ext_len); extn++) {
1808 			md_sp_ext_t	*extp = &msp->ext.ext_val[extn];
1809 
1810 			/*
1811 			 * subtract from offset and add to the length
1812 			 * to account for the watermark, which is not
1813 			 * contained in the extents in the unit structure.
1814 			 */
1815 			meta_sp_list_insert(sp, curnp, extlist,
1816 			    extp->poff - MD_SP_WMSIZE, extp->len + MD_SP_WMSIZE,
1817 			    EXTTYP_ALLOC, extn, 0, meta_sp_cmp_by_offset);
1818 		}
1819 	}
1820 	return (0);
1821 }
1822 
1823 /*
1824  * FUNCTION:	meta_sp_extlist_from_wm()
1825  * INPUT:	sp	- the set name for the device the node belongs to
1826  *		compnp	- the name of the device to scan watermarks on
1827  * OUTPUT:	extlist	- the extent list built from the SPs in the namelist
1828  *		ep	- return error pointer
1829  * RETURNS:	int	- -1 if error, 0 on success
1830  * PURPOSE:	builds an extent list representing the soft partitions
1831  *		specified in the namelist.  Each extent in each soft
1832  *		partition is added to the list with the type EXTTYP_ALLOC.
1833  *		The EXTFLG_UPDATE flag is not set on any nodes.  Each
1834  *		extent in the list includes the space occupied by the
1835  *		watermark, which is not included in the unit structures.
1836  */
1837 static int
1838 meta_sp_extlist_from_wm(
1839 	mdsetname_t	*sp,
1840 	mdname_t	*compnp,
1841 	sp_ext_node_t	**extlist,
1842 	ext_cmpfunc_t	compare,
1843 	md_error_t	*ep
1844 )
1845 {
1846 	mp_watermark_t	wm;
1847 	mdname_t	*np = NULL;
1848 	mdsetname_t	*spsetp = NULL;
1849 	sp_ext_offset_t	cur_off;
1850 	md_set_desc	*sd;
1851 	int		init = 0;
1852 	mdkey_t		key;
1853 	minor_t		mnum;
1854 
1855 	if (!metaislocalset(sp)) {
1856 		if ((sd = metaget_setdesc(sp, ep)) == NULL)
1857 			return (-1);
1858 	}
1859 
1860 	if ((cur_off = meta_sp_get_start(sp, compnp, ep)) == MD_DISKADDR_ERROR)
1861 		return (-1);
1862 
1863 	for (;;) {
1864 		if (meta_sp_read_wm(sp, compnp, &wm, cur_off, ep) != 0) {
1865 			return (-1);
1866 		}
1867 
1868 		/* get the set and name pointers */
1869 		if (strcmp(wm.wm_setname, MD_SP_LOCALSETNAME) != 0) {
1870 			if ((spsetp = metasetname(wm.wm_setname, ep)) == NULL) {
1871 				return (-1);
1872 			}
1873 		}
1874 
1875 		/*
1876 		 * For the MN set, meta_init_make_device needs to
1877 		 * be run on all the nodes so the entries for the
1878 		 * softpart device name and its comp can be created
1879 		 * in the same order in the replica namespace.  If
1880 		 * we have it run on mdmn_do_iocset then the mddbs
1881 		 * will be out of sync between master node and slave
1882 		 * nodes.
1883 		 */
1884 		if (strcmp(wm.wm_mdname, MD_SP_FREEWMNAME) != 0) {
1885 
1886 		    if (!metaislocalset(sp) && MD_MNSET_DESC(sd)) {
1887 			md_mn_msg_addmdname_t	*send_params;
1888 			int			result;
1889 			md_mn_result_t		*resp = NULL;
1890 			int			message_size;
1891 
1892 			message_size =  sizeof (*send_params) +
1893 			    strlen(wm.wm_mdname) + 1;
1894 			send_params = Zalloc(message_size);
1895 			send_params->addmdname_setno = sp->setno;
1896 			(void) strcpy(&send_params->addmdname_name[0],
1897 			    wm.wm_mdname);
1898 			result = mdmn_send_message(sp->setno,
1899 			    MD_MN_MSG_ADDMDNAME,
1900 			    MD_MSGF_PANIC_WHEN_INCONSISTENT,
1901 			    (char *)send_params, message_size, &resp,
1902 			    ep);
1903 			Free(send_params);
1904 			if (resp != NULL) {
1905 				if (resp->mmr_exitval != 0) {
1906 					free_result(resp);
1907 					return (-1);
1908 				}
1909 				free_result(resp);
1910 			}
1911 			if (result != 0)
1912 				return (-1);
1913 		    } else {
1914 
1915 			if (!is_existing_meta_hsp(sp, wm.wm_mdname)) {
1916 			    if ((key = meta_init_make_device(&sp,
1917 				wm.wm_mdname, ep)) <= 0) {
1918 					return (-1);
1919 				}
1920 				init = 1;
1921 			}
1922 		    }
1923 
1924 		    np = metaname(&spsetp, wm.wm_mdname, META_DEVICE, ep);
1925 		    if (np == NULL) {
1926 			if (init) {
1927 			    if (meta_getnmentbykey(sp->setno, MD_SIDEWILD,
1928 				key, NULL, &mnum, NULL, ep) != NULL) {
1929 				    (void) metaioctl(MD_IOCREM_DEV, &mnum,
1930 						ep, NULL);
1931 			    }
1932 			    (void) del_self_name(sp, key, ep);
1933 			}
1934 			return (-1);
1935 		    }
1936 		}
1937 
1938 		/* insert watermark into extent list */
1939 		meta_sp_list_insert(spsetp, np, extlist, cur_off,
1940 		    wm.wm_length + MD_SP_WMSIZE, wm.wm_type, wm.wm_seq,
1941 		    EXTFLG_UPDATE, compare);
1942 
1943 		/* if we see the end watermark, we're done */
1944 		if (wm.wm_type == EXTTYP_END)
1945 			break;
1946 
1947 		cur_off += wm.wm_length + 1;
1948 
1949 		/* clear out set and name pointers for next iteration */
1950 		np = NULL;
1951 		spsetp = NULL;
1952 	}
1953 
1954 	return (0);
1955 }
1956 
1957 /*
1958  * **************************************************************************
1959  *                        Print (metastat) Functions                        *
1960  * **************************************************************************
1961  */
1962 
1963 /*
1964  * FUNCTION:	meta_sp_short_print()
1965  * INPUT:	msp	- the unit structure to display
1966  *		fp	- the file pointer to send output to
1967  *		options	- print options from the command line processor
1968  * OUTPUT:	ep	- return error pointer
1969  * RETURNS:	int	- -1 if error, 0 on success
1970  * PURPOSE:	display a short report of the soft partition in md.tab
1971  *		form, primarily used for metastat -p.
1972  */
1973 static int
1974 meta_sp_short_print(
1975 	md_sp_t		*msp,
1976 	char		*fname,
1977 	FILE		*fp,
1978 	mdprtopts_t	options,
1979 	md_error_t	*ep
1980 )
1981 {
1982 	int	extn;
1983 
1984 	if (options & PRINT_LARGEDEVICES) {
1985 		if ((msp->common.revision & MD_64BIT_META_DEV) == 0)
1986 			return (0);
1987 	}
1988 
1989 	if (options & PRINT_FN) {
1990 		if ((msp->common.revision & MD_FN_META_DEV) == 0)
1991 			return (0);
1992 	}
1993 
1994 	/* print name and -p */
1995 	if (fprintf(fp, "%s -p", msp->common.namep->cname) == EOF)
1996 		return (mdsyserror(ep, errno, fname));
1997 
1998 	/* print the component */
1999 	/*
2000 	 * Always print the full path name
2001 	 */
2002 	if (fprintf(fp, " %s", msp->compnamep->rname) == EOF)
2003 		return (mdsyserror(ep, errno, fname));
2004 
2005 	/* print out each extent */
2006 	for (extn = 0; (extn < msp->ext.ext_len); extn++) {
2007 		md_sp_ext_t	*extp = &msp->ext.ext_val[extn];
2008 		if (fprintf(fp, " -o %llu -b %llu ", extp->poff,
2009 		    extp->len) == EOF)
2010 			return (mdsyserror(ep, errno, fname));
2011 	}
2012 
2013 	if (fprintf(fp, "\n") == EOF)
2014 		return (mdsyserror(ep, errno, fname));
2015 
2016 	/* success */
2017 	return (0);
2018 }
2019 
2020 /*
2021  * FUNCTION:	meta_sp_status_to_name()
2022  * INPUT:	xsp_status	- the status value to convert to a string
2023  *		tstate		- transient errored device state. If set the
2024  *				  device is Unavailable
2025  * OUTPUT:	none
2026  * RETURNS:	char *	- a pointer to the string representing the status value
2027  * PURPOSE:	return an internationalized string representing the
2028  *		status value for a soft partition.  The strings are
2029  *		strdup'd and must be freed by the caller.
2030  */
2031 static char *
2032 meta_sp_status_to_name(
2033 	xsp_status_t	xsp_status,
2034 	uint_t		tstate
2035 )
2036 {
2037 	char *rval = NULL;
2038 
2039 	/*
2040 	 * Check to see if we have MD_INACCESSIBLE set. This is the only valid
2041 	 * value for an 'Unavailable' return. tstate can be set because of
2042 	 * other multi-node reasons (e.g. ABR being set)
2043 	 */
2044 	if (tstate & MD_INACCESSIBLE) {
2045 		return (Strdup(dgettext(TEXT_DOMAIN, "Unavailable")));
2046 	}
2047 
2048 	switch (xsp_status) {
2049 	case MD_SP_CREATEPEND:
2050 		rval = Strdup(dgettext(TEXT_DOMAIN, "Creating"));
2051 		break;
2052 	case MD_SP_GROWPEND:
2053 		rval = Strdup(dgettext(TEXT_DOMAIN, "Growing"));
2054 		break;
2055 	case MD_SP_DELPEND:
2056 		rval = Strdup(dgettext(TEXT_DOMAIN, "Deleting"));
2057 		break;
2058 	case MD_SP_OK:
2059 		rval = Strdup(dgettext(TEXT_DOMAIN, "Okay"));
2060 		break;
2061 	case MD_SP_ERR:
2062 		rval = Strdup(dgettext(TEXT_DOMAIN, "Errored"));
2063 		break;
2064 	case MD_SP_RECOVER:
2065 		rval = Strdup(dgettext(TEXT_DOMAIN, "Recovering"));
2066 		break;
2067 	}
2068 
2069 	if (rval == NULL)
2070 		rval = Strdup(dgettext(TEXT_DOMAIN, "Invalid"));
2071 
2072 	return (rval);
2073 }
2074 
2075 /*
2076  * FUNCTION:	meta_sp_report()
2077  * INPUT:	sp	- the set name for the unit being displayed
2078  *		msp	- the unit structure to display
2079  *		nlpp	- pass back the large devs
2080  *		fp	- the file pointer to send output to
2081  *		options	- print options from the command line processor
2082  * OUTPUT:	ep	- return error pointer
2083  * RETURNS:	int	- -1 if error, 0 on success
2084  * PURPOSE:	print a full report of the device specified
2085  */
2086 static int
2087 meta_sp_report(
2088 	mdsetname_t	*sp,
2089 	md_sp_t		*msp,
2090 	mdnamelist_t	**nlpp,
2091 	char		*fname,
2092 	FILE		*fp,
2093 	mdprtopts_t	options,
2094 	md_error_t	*ep
2095 )
2096 {
2097 	uint_t		extn;
2098 	char		*status;
2099 	char		*devid = "";
2100 	mdname_t	*didnp = NULL;
2101 	ddi_devid_t	dtp;
2102 	int		len;
2103 	uint_t		tstate = 0;
2104 
2105 	if (options & PRINT_LARGEDEVICES) {
2106 		if ((msp->common.revision & MD_64BIT_META_DEV) == 0) {
2107 			return (0);
2108 		} else {
2109 			if (meta_getdevs(sp, msp->common.namep, nlpp, ep) != 0)
2110 				return (-1);
2111 		}
2112 	}
2113 
2114 	if (options & PRINT_FN) {
2115 		if ((msp->common.revision & MD_FN_META_DEV) == 0) {
2116 			return (0);
2117 		} else {
2118 			if (meta_getdevs(sp, msp->common.namep, nlpp, ep) != 0)
2119 				return (-1);
2120 		}
2121 	}
2122 
2123 	if (options & PRINT_HEADER) {
2124 		if (fprintf(fp, dgettext(TEXT_DOMAIN, "%s: Soft Partition\n"),
2125 		    msp->common.namep->cname) == EOF)
2126 			return (mdsyserror(ep, errno, fname));
2127 	}
2128 
2129 	if (fprintf(fp, dgettext(TEXT_DOMAIN, "    Device: %s\n"),
2130 	    msp->compnamep->cname) == EOF)
2131 		return (mdsyserror(ep, errno, fname));
2132 
2133 	/* Determine if device is available before displaying status */
2134 	if (metaismeta(msp->common.namep)) {
2135 		if (meta_get_tstate(msp->common.namep->dev, &tstate, ep) != 0)
2136 			return (-1);
2137 	}
2138 	status = meta_sp_status_to_name(msp->status, tstate & MD_DEV_ERRORED);
2139 
2140 	/* print out "State" to be consistent with other metadevices */
2141 	if (tstate & MD_ABR_CAP) {
2142 		if (fprintf(fp, dgettext(TEXT_DOMAIN,
2143 		    "    State: %s - Application Based Recovery (ABR)\n"),
2144 		    status) == EOF) {
2145 			Free(status);
2146 			return (mdsyserror(ep, errno, fname));
2147 		}
2148 	} else {
2149 		if (fprintf(fp, dgettext(TEXT_DOMAIN,
2150 		    "    State: %s\n"), status) == EOF) {
2151 			Free(status);
2152 			return (mdsyserror(ep, errno, fname));
2153 		}
2154 	}
2155 	free(status);
2156 
2157 	if (fprintf(fp, dgettext(TEXT_DOMAIN, "    Size: %llu blocks (%s)\n"),
2158 	    msp->common.size,
2159 	    meta_number_to_string(msp->common.size, DEV_BSIZE)) == EOF)
2160 		return (mdsyserror(ep, errno, fname));
2161 
2162 	/* print component details */
2163 	if (! metaismeta(msp->compnamep)) {
2164 		diskaddr_t	start_blk;
2165 		int		has_mddb;
2166 		char		*has_mddb_str;
2167 
2168 		/* print header */
2169 		/*
2170 		 * Building a format string on the fly that will
2171 		 * be used in (f)printf. This allows the length
2172 		 * of the ctd to vary from small to large without
2173 		 * looking horrible.
2174 		 */
2175 		len = strlen(msp->compnamep->cname);
2176 		len = max(len, strlen(dgettext(TEXT_DOMAIN, "Device")));
2177 		len += 2;
2178 		if (fprintf(fp,
2179 		    "\t%-*.*s %-12.12s %-5.5s %s\n",
2180 		    len, len,
2181 		    dgettext(TEXT_DOMAIN, "Device"),
2182 		    dgettext(TEXT_DOMAIN, "Start Block"),
2183 		    dgettext(TEXT_DOMAIN, "Dbase"),
2184 		    dgettext(TEXT_DOMAIN, "Reloc")) == EOF) {
2185 			return (mdsyserror(ep, errno, fname));
2186 		}
2187 
2188 
2189 		/* get info */
2190 		if ((start_blk = meta_sp_get_start(sp, msp->compnamep, ep)) ==
2191 		    MD_DISKADDR_ERROR)
2192 			return (-1);
2193 
2194 		if ((has_mddb = metahasmddb(sp, msp->compnamep, ep)) < 0)
2195 			return (-1);
2196 
2197 		if (has_mddb)
2198 			has_mddb_str = dgettext(TEXT_DOMAIN, "Yes");
2199 		else
2200 			has_mddb_str = dgettext(TEXT_DOMAIN, "No");
2201 
2202 		/* populate the key in the name_p structure */
2203 		didnp = metadevname(&sp, msp->compnamep->dev, ep);
2204 		if (didnp == NULL) {
2205 			return (-1);
2206 		}
2207 
2208 		/* determine if devid does NOT exist */
2209 		if (options & PRINT_DEVID) {
2210 		    if ((dtp = meta_getdidbykey(sp->setno, getmyside(sp, ep),
2211 					didnp->key, ep)) == NULL)
2212 				devid = dgettext(TEXT_DOMAIN, "No ");
2213 			else {
2214 				devid = dgettext(TEXT_DOMAIN, "Yes");
2215 				free(dtp);
2216 			}
2217 		}
2218 
2219 		/* print info */
2220 		/*
2221 		 * This allows the length
2222 		 * of the ctd to vary from small to large without
2223 		 * looking horrible.
2224 		 */
2225 		if (fprintf(fp, "\t%-*s %8lld     %-5.5s %s\n",
2226 		    len, msp->compnamep->cname,
2227 		    start_blk, has_mddb_str, devid) == EOF) {
2228 			return (mdsyserror(ep, errno, fname));
2229 		}
2230 		(void) fprintf(fp, "\n");
2231 	}
2232 
2233 
2234 	/* print the headers */
2235 	if (fprintf(fp, "\t%6.6s %24.24s %24.24s\n",
2236 	    dgettext(TEXT_DOMAIN, "Extent"),
2237 	    dgettext(TEXT_DOMAIN, "Start Block"),
2238 	    dgettext(TEXT_DOMAIN, "Block count")) == EOF)
2239 		return (mdsyserror(ep, errno, fname));
2240 
2241 	/* print out each extent */
2242 	for (extn = 0; (extn < msp->ext.ext_len); extn++) {
2243 		md_sp_ext_t	*extp = &msp->ext.ext_val[extn];
2244 
2245 		/* If PRINT_TIMES option is ever supported, add output here */
2246 		if (fprintf(fp, "\t%6u %24llu %24llu\n",
2247 		    extn, extp->poff, extp->len) == EOF)
2248 			return (mdsyserror(ep, errno, fname));
2249 	}
2250 
2251 	/* separate records with a newline */
2252 	(void) fprintf(fp, "\n");
2253 	return (0);
2254 }
2255 
2256 /*
2257  * FUNCTION:	meta_sp_print()
2258  * INPUT:	sp	- the set name for the unit being displayed
2259  *		np	- the name of the device to print
2260  *		fname	- ??? not used
2261  *		fp	- the file pointer to send output to
2262  *		options	- print options from the command line processor
2263  * OUTPUT:	ep	- return error pointer
2264  * RETURNS:	int	- -1 if error, 0 on success
2265  * PURPOSE:	print a full report of the device specified by metastat.
2266  *		This is the main entry point for printing.
2267  */
2268 int
2269 meta_sp_print(
2270 	mdsetname_t	*sp,
2271 	mdname_t	*np,
2272 	mdnamelist_t	**nlpp,
2273 	char		*fname,
2274 	FILE		*fp,
2275 	mdprtopts_t	options,
2276 	md_error_t	*ep
2277 )
2278 {
2279 	md_sp_t		*msp;
2280 	md_unit_t	*mdp;
2281 	int		rval = 0;
2282 
2283 	/* should always have the same set */
2284 	assert(sp != NULL);
2285 
2286 	/* print all the soft partitions */
2287 	if (np == NULL) {
2288 		mdnamelist_t	*nlp = NULL;
2289 		mdnamelist_t	*p;
2290 		int		cnt;
2291 
2292 		if ((cnt = meta_get_sp_names(sp, &nlp, options, ep)) < 0)
2293 			return (-1);
2294 		else if (cnt == 0)
2295 			return (0);
2296 
2297 		/* recusively print them out */
2298 		for (p = nlp; (p != NULL); p = p->next) {
2299 			mdname_t	*curnp = p->namep;
2300 
2301 			/*
2302 			 * one problem with the rval of -1 here is that
2303 			 * the error gets "lost" when the next device is
2304 			 * printed, but we want to print them all anyway.
2305 			 */
2306 			rval = meta_sp_print(sp, curnp, nlpp, fname, fp,
2307 			    options, ep);
2308 		}
2309 
2310 		/* clean up, return success */
2311 		metafreenamelist(nlp);
2312 		return (rval);
2313 	}
2314 
2315 	/* get the unit structure */
2316 	if ((msp = meta_get_sp_common(sp, np,
2317 	    ((options & PRINT_FAST) ? 1 : 0), ep)) == NULL)
2318 		return (-1);
2319 
2320 	/* check for parented */
2321 	if ((! (options & PRINT_SUBDEVS)) &&
2322 	    (MD_HAS_PARENT(msp->common.parent))) {
2323 		return (0);
2324 	}
2325 
2326 	/* print appropriate detail */
2327 	if (options & PRINT_SHORT) {
2328 		if (meta_sp_short_print(msp, fname, fp, options, ep) != 0)
2329 			return (-1);
2330 	} else {
2331 		if (meta_sp_report(sp, msp, nlpp, fname, fp, options, ep) != 0)
2332 			return (-1);
2333 	}
2334 
2335 	/*
2336 	 * Print underlying metadevices if they are parented to us and
2337 	 * if the info for the underlying metadevice has not been printed.
2338 	 */
2339 	if (metaismeta(msp->compnamep)) {
2340 		/* get the unit structure for the subdevice */
2341 		if ((mdp = meta_get_mdunit(sp, msp->compnamep, ep)) == NULL)
2342 			return (-1);
2343 
2344 		/* If info not already printed, recurse */
2345 		if (!BT_TEST(sp_parent_printed, MD_MIN2UNIT(MD_SID(mdp)))) {
2346 			if (meta_print_name(sp, msp->compnamep, nlpp, fname, fp,
2347 			    (options | PRINT_HEADER | PRINT_SUBDEVS),
2348 			    NULL, ep) != 0) {
2349 				return (-1);
2350 			}
2351 			BT_SET(sp_parent_printed, MD_MIN2UNIT(MD_SID(mdp)));
2352 		}
2353 	}
2354 	return (0);
2355 }
2356 
2357 /*
2358  * **************************************************************************
2359  *                     Watermark Manipulation Functions                     *
2360  * **************************************************************************
2361  */
2362 
2363 /*
2364  * FUNCTION:	meta_sp_get_start()
2365  * INPUT:	sp	- the operating set
2366  *		np 	- device upon which the sp is being built
2367  * OUTPUT:	ep	- return error pointer
2368  * RETURNS:	daddr_t	- -1 if error, otherwise the start block
2369  * PURPOSE:	Encapsulate the determination of the start block of the
2370  *		device upon which the sp is built or being built.
2371  */
2372 static diskaddr_t
2373 meta_sp_get_start(
2374 	mdsetname_t	*sp,
2375 	mdname_t	*np,
2376 	md_error_t	*ep
2377 )
2378 {
2379 	daddr_t		start_block;
2380 
2381 	if ((start_block = metagetstart(sp, np, ep)) != MD_DISKADDR_ERROR)
2382 		start_block += MD_SP_START;
2383 
2384 	return (start_block);
2385 }
2386 
2387 /*
2388  * FUNCTION:	meta_sp_update_wm()
2389  * INPUT:	sp	- the operating set
2390  *		msp	- a pointer to the XDR unit structure
2391  *		extlist	- the extent list specifying watermarks to update
2392  * OUTPUT:	ep	- return error pointer
2393  * RETURNS:	int	- -1 if error, 0 on success
2394  * PURPOSE:	steps backwards through the extent list updating
2395  *		watermarks for all extents with the EXTFLG_UPDATE flag
2396  *		set.  Writing the watermarks guarantees consistency when
2397  *		extents must be broken into pieces since the original
2398  *		watermark will be the last to be updated, and will be
2399  *		changed to point to a new watermark that is already
2400  *		known to be consistent.  If one of the writes fails, the
2401  *		original watermark stays intact and none of the changes
2402  *		are realized.
2403  */
2404 static int
2405 meta_sp_update_wm(
2406 	mdsetname_t	*sp,
2407 	md_sp_t		*msp,
2408 	sp_ext_node_t	*extlist,
2409 	md_error_t	*ep
2410 )
2411 {
2412 	sp_ext_node_t	*ext;
2413 	sp_ext_node_t	*tail;
2414 	mp_watermark_t	*wmp, *watermarks;
2415 	xsp_offset_t	*osp, *offsets;
2416 	int		update_count = 0;
2417 	int		rval = 0;
2418 	md_unit_t	*mdp;
2419 	md_sp_update_wm_t	update_params;
2420 
2421 	if (getenv(META_SP_DEBUG)) {
2422 		meta_sp_debug("meta_sp_update_wm: Updating watermarks:\n");
2423 		meta_sp_list_dump(extlist);
2424 	}
2425 
2426 	/*
2427 	 * find the last node so we can write the watermarks backwards
2428 	 * and count watermarks to update so we can allocate space
2429 	 */
2430 	for (ext = extlist; ext != NULL; ext = ext->ext_next) {
2431 		if ((ext->ext_flags & EXTFLG_UPDATE) != 0) {
2432 			update_count++;
2433 		}
2434 
2435 		if (ext->ext_next == NULL) {
2436 			tail = ext;
2437 		}
2438 	}
2439 	ext = tail;
2440 
2441 	wmp = watermarks =
2442 	    Zalloc(update_count * sizeof (mp_watermark_t));
2443 	osp = offsets =
2444 	    Zalloc(update_count * sizeof (sp_ext_offset_t));
2445 
2446 	while (ext != NULL) {
2447 		if ((ext->ext_flags & EXTFLG_UPDATE) != 0) {
2448 			/* update watermark */
2449 			wmp->wm_magic = MD_SP_MAGIC;
2450 			wmp->wm_version = MD_SP_VERSION;
2451 			wmp->wm_type = ext->ext_type;
2452 			wmp->wm_seq = ext->ext_seq;
2453 			wmp->wm_length = ext->ext_length - MD_SP_WMSIZE;
2454 
2455 			/* fill in the volume name and set name */
2456 			if (ext->ext_namep != NULL)
2457 				(void) strcpy(wmp->wm_mdname,
2458 				    ext->ext_namep->cname);
2459 			else
2460 				(void) strcpy(wmp->wm_mdname, MD_SP_FREEWMNAME);
2461 			if (ext->ext_setp != NULL &&
2462 			    ext->ext_setp->setno != MD_LOCAL_SET)
2463 				(void) strcpy(wmp->wm_setname,
2464 				    ext->ext_setp->setname);
2465 			else
2466 				(void) strcpy(wmp->wm_setname,
2467 				    MD_SP_LOCALSETNAME);
2468 
2469 			/* Generate the checksum */
2470 			wmp->wm_checksum = 0;
2471 			crcgen((uchar_t *)wmp, (uint_t *)&wmp->wm_checksum,
2472 			    sizeof (*wmp), NULL);
2473 
2474 			/* record the extent offset */
2475 			*osp = ext->ext_offset;
2476 
2477 			/* Advance the placeholders */
2478 			osp++; wmp++;
2479 		}
2480 		ext = ext->ext_prev;
2481 	}
2482 
2483 	mdp = meta_get_mdunit(sp, msp->common.namep, ep);
2484 	if (mdp == NULL) {
2485 		rval = -1;
2486 		goto out;
2487 	}
2488 
2489 	(void) memset(&update_params, 0, sizeof (update_params));
2490 	update_params.mnum = MD_SID(mdp);
2491 	update_params.count = update_count;
2492 	update_params.wmp = (uintptr_t)watermarks;
2493 	update_params.osp = (uintptr_t)offsets;
2494 	MD_SETDRIVERNAME(&update_params, MD_SP,
2495 	    MD_MIN2SET(update_params.mnum));
2496 
2497 	if (metaioctl(MD_IOC_SPUPDATEWM, &update_params,
2498 	    &update_params.mde, msp->common.namep->cname) != 0) {
2499 		(void) mdstealerror(ep, &update_params.mde);
2500 		rval = -1;
2501 		goto out;
2502 	}
2503 
2504 out:
2505 	Free(watermarks);
2506 	Free(offsets);
2507 
2508 	return (rval);
2509 }
2510 
2511 /*
2512  * FUNCTION:	meta_sp_clear_wm()
2513  * INPUT:	sp	- the operating set
2514  *		msp	- the unit structure for the soft partition to clear
2515  * OUTPUT:	ep	- return error pointer
2516  * RETURNS:	int	- -1 if error, 0 on success
2517  * PURPOSE:	steps through the extents for a soft partition unit and
2518  *		creates an extent list designed to mark all of the
2519  *		watermarks for those extents as free.  The extent list
2520  *		is then passed to meta_sp_update_wm() to actually write
2521  *		the watermarks out.
2522  */
2523 static int
2524 meta_sp_clear_wm(
2525 	mdsetname_t	*sp,
2526 	md_sp_t		*msp,
2527 	md_error_t	*ep
2528 )
2529 {
2530 	sp_ext_node_t	*extlist = NULL;
2531 	int		numexts = msp->ext.ext_len;
2532 	uint_t		i;
2533 	int		rval = 0;
2534 
2535 	/* for each watermark must set the flag to SP_FREE */
2536 	for (i = 0; i < numexts; i++) {
2537 		md_sp_ext_t	*extp = &msp->ext.ext_val[i];
2538 
2539 		meta_sp_list_insert(NULL, NULL, &extlist,
2540 		    extp->poff - MD_SP_WMSIZE, extp->len + MD_SP_WMSIZE,
2541 		    EXTTYP_FREE, 0, EXTFLG_UPDATE, meta_sp_cmp_by_offset);
2542 	}
2543 
2544 	/* update watermarks */
2545 	rval = meta_sp_update_wm(sp, msp, extlist, ep);
2546 
2547 	meta_sp_list_free(&extlist);
2548 	return (rval);
2549 }
2550 
2551 /*
2552  * FUNCTION:	meta_sp_read_wm()
2553  * INPUT:	sp	- setname for component
2554  *		compnp	- mdname_t for component
2555  *		offset	- the offset of the watermark to read (sectors)
2556  * OUTPUT:	wm	- the watermark structure to read into
2557  *		ep	- return error pointer
2558  * RETURNS:	int	- -1 if error, 0 on success
2559  * PURPOSE:	seeks out to the requested offset and reads a watermark.
2560  *		It then verifies that the magic number is correct and
2561  *		that the checksum is valid, returning an error if either
2562  *		is wrong.
2563  */
2564 static int
2565 meta_sp_read_wm(
2566 	mdsetname_t	*sp,
2567 	mdname_t	*compnp,
2568 	mp_watermark_t	*wm,
2569 	sp_ext_offset_t	offset,
2570 	md_error_t	*ep
2571 )
2572 {
2573 	md_sp_read_wm_t	read_params;
2574 
2575 	/*
2576 	 * make sure block offset does not overflow 2^64 bytes and it's a
2577 	 * multiple of the block size.
2578 	 */
2579 	assert(offset <= (1LL << (64 - DEV_BSHIFT)));
2580 	/* LINTED */
2581 	assert((sizeof (*wm) % DEV_BSIZE) == 0);
2582 
2583 	(void) memset(wm, 0, sizeof (*wm));
2584 
2585 	(void) memset(&read_params, 0, sizeof (read_params));
2586 	read_params.rdev = compnp->dev;
2587 	read_params.wmp = (uintptr_t)wm;
2588 	read_params.offset = offset;
2589 	MD_SETDRIVERNAME(&read_params, MD_SP, sp->setno);
2590 
2591 	if (metaioctl(MD_IOC_SPREADWM, &read_params,
2592 	    &read_params.mde, compnp->cname) != 0) {
2593 
2594 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
2595 		    "Extent header read failed, block %llu.\n"), offset);
2596 		return (mdstealerror(ep, &read_params.mde));
2597 	}
2598 
2599 	/* make sure magic number is correct */
2600 	if (wm->wm_magic != MD_SP_MAGIC) {
2601 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
2602 		    "found incorrect magic number %x, expected %x.\n"),
2603 		    wm->wm_magic, MD_SP_MAGIC);
2604 		/*
2605 		 * Pass NULL for the device name as we don't have
2606 		 * valid watermark contents.
2607 		 */
2608 		return (mdmderror(ep, MDE_SP_BADWMMAGIC, 0, NULL));
2609 	}
2610 
2611 	if (crcchk((uchar_t *)wm, (uint_t *)&wm->wm_checksum,
2612 	    sizeof (*wm), NULL)) {
2613 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
2614 		    "found incorrect checksum %x.\n"),
2615 		    wm->wm_checksum);
2616 		return (mdmderror(ep, MDE_SP_BADWMCRC, 0, wm->wm_mdname));
2617 	}
2618 
2619 	return (0);
2620 }
2621 
2622 /*
2623  * **************************************************************************
2624  *                  Query Functions
2625  * **************************************************************************
2626  */
2627 
2628 /*
2629  * IMPORTANT NOTE: This is a static function that assumes that
2630  *		   its input parameters have been checked and
2631  *		   have valid values that lie within acceptable
2632  *		   ranges.
2633  *
2634  * FUNCTION:	meta_sp_enough_space()
2635  * INPUT:	desired_number_of_sps - the number of soft partitions desired;
2636  *					must be > 0
2637  *		desired_sp_size - the desired soft partition size in blocks;
2638  *				  must be > 0
2639  *		extent_listpp - a reference to a reference to an extent
2640  *				list that lists the extents on a device;
2641  *				must be a reference to a reference to a
2642  *				valid extent list
2643  *		alignment - the desired data space alignment for the sp's
2644  * OUTPUT:	boolean_t return value
2645  * RETURNS:	boolean_t - B_TRUE if there's enough space in the extent
2646  *			    list to create the desired soft partitions,
2647  *			    B_FALSE if there's not enough space
2648  * PURPOSE:	determines whether there's enough free space in an extent
2649  *		list to allow creation of a set of soft partitions
2650  */
2651 static boolean_t
2652 meta_sp_enough_space(
2653 	int		desired_number_of_sps,
2654 	blkcnt_t	desired_sp_size,
2655 	sp_ext_node_t	**extent_listpp,
2656 	sp_ext_length_t	alignment
2657 )
2658 {
2659 	boolean_t		enough_space;
2660 	int			number_of_sps;
2661 	int			number_of_extents_used;
2662 	sp_ext_length_t		desired_ext_length = desired_sp_size;
2663 
2664 	enough_space = B_TRUE;
2665 	number_of_sps = 0;
2666 	while ((enough_space == B_TRUE) &&
2667 		(number_of_sps < desired_number_of_sps)) {
2668 		/*
2669 		 * Use the extent allocation algorithm implemented by
2670 		 * meta_sp_alloc_by_len() to test whether the free
2671 		 * extents in the extent list referenced by *extent_listpp
2672 		 * contain enough space to accomodate a soft partition
2673 		 * of size desired_ext_length.
2674 		 *
2675 		 * Repeat the test <desired_number_of_sps> times
2676 		 * or until it fails, whichever comes first,
2677 		 * each time allocating the extents required to
2678 		 * create the soft partition without actually
2679 		 * creating the soft partition.
2680 		 */
2681 		number_of_extents_used = meta_sp_alloc_by_len(
2682 						TEST_SETNAMEP,
2683 						TEST_SOFT_PARTITION_NAMEP,
2684 						extent_listpp,
2685 						&desired_ext_length,
2686 						NO_OFFSET,
2687 						alignment);
2688 		if (number_of_extents_used == -1) {
2689 			enough_space = B_FALSE;
2690 		} else {
2691 			number_of_sps++;
2692 		}
2693 	}
2694 	return (enough_space);
2695 }
2696 
2697 /*
2698  * IMPORTANT NOTE: This is a static function that calls other functions
2699  *		   that check its mdsetnamep and device_mdnamep
2700  *		   input parameters, but expects extent_listpp to
2701  *		   be a initialized to a valid address to which
2702  *		   it can write a reference to the extent list that
2703  *		   it creates.
2704  *
2705  * FUNCTION:	meta_sp_get_extent_list()
2706  * INPUT:	mdsetnamep - a reference to the mdsetname_t structure
2707  *			     for the set containing the device for
2708  *			     which the extents are to be listed
2709  *		device_mdnamep - a reference to the mdname_t structure
2710  *				 for the device for which the extents
2711  *				 are to be listed
2712  * OUTPUT:	*extent_listpp - a reference to the extent list for
2713  *				 the device; NULL if the function fails
2714  *		*ep - the libmeta error encountered, if any
2715  * RETURNS:	boolean_t - B_TRUE if the function call was successful,
2716  *			    B_FALSE if not
2717  * PURPOSE:	gets the extent list for a device
2718  */
2719 static boolean_t
2720 meta_sp_get_extent_list(
2721 	mdsetname_t	*mdsetnamep,
2722 	mdname_t	*device_mdnamep,
2723 	sp_ext_node_t	**extent_listpp,
2724 	md_error_t	*ep
2725 )
2726 {
2727 	diskaddr_t		device_size_in_blocks;
2728 	mdnamelist_t		*sp_name_listp;
2729 	diskaddr_t		start_block_address_in_blocks;
2730 
2731 	*extent_listpp = NULL;
2732 	sp_name_listp = NULL;
2733 
2734 	start_block_address_in_blocks = meta_sp_get_start(mdsetnamep,
2735 						device_mdnamep,
2736 						ep);
2737 	if (start_block_address_in_blocks == MD_DISKADDR_ERROR) {
2738 	    if (getenv(META_SP_DEBUG)) {
2739 		mde_perror(ep, "meta_sp_get_extent_list:meta_sp_get_start");
2740 	    }
2741 	    return (B_FALSE);
2742 	}
2743 
2744 	device_size_in_blocks = metagetsize(device_mdnamep, ep);
2745 	if (device_size_in_blocks == MD_DISKADDR_ERROR) {
2746 	    if (getenv(META_SP_DEBUG)) {
2747 		mde_perror(ep,
2748 		    "meta_sp_get_extent_list:metagetsize");
2749 	    }
2750 	    return (B_FALSE);
2751 	}
2752 
2753 	/*
2754 	 * Sanity check: the start block will have skipped an integer
2755 	 * number of cylinders, C.  C will usually be zero.  If (C > 0),
2756 	 * and the disk slice happens to only be C cylinders in total
2757 	 * size, we'll fail this check.
2758 	 */
2759 	if (device_size_in_blocks <=
2760 	    (start_block_address_in_blocks + MD_SP_WMSIZE)) {
2761 	    (void) mdmderror(ep, MDE_SP_NOSPACE, 0, device_mdnamep->cname);
2762 	    return (B_FALSE);
2763 	}
2764 
2765 	/*
2766 	 * After this point, we will have allocated resources, so any
2767 	 * failure returns must be through the supplied "fail" label
2768 	 * to properly deallocate things.
2769 	 */
2770 
2771 	/*
2772 	 * Create an empty extent list that starts one watermark past
2773 	 * the start block of the device and ends one watermark before
2774 	 * the end of the device.
2775 	 */
2776 	meta_sp_list_insert(TEST_SETNAMEP,
2777 			    TEST_SOFT_PARTITION_NAMEP,
2778 			    extent_listpp,
2779 			    NO_OFFSET,
2780 			    (sp_ext_length_t)start_block_address_in_blocks,
2781 			    EXTTYP_RESERVED,
2782 			    NO_SEQUENCE_NUMBER,
2783 			    NO_FLAGS,
2784 			    meta_sp_cmp_by_offset);
2785 	meta_sp_list_insert(TEST_SETNAMEP,
2786 			    TEST_SOFT_PARTITION_NAMEP,
2787 			    extent_listpp,
2788 			    (sp_ext_offset_t)(device_size_in_blocks -
2789 				MD_SP_WMSIZE),
2790 			    MD_SP_WMSIZE,
2791 			    EXTTYP_END,
2792 			    NO_SEQUENCE_NUMBER,
2793 			    NO_FLAGS,
2794 			    meta_sp_cmp_by_offset);
2795 
2796 	/*
2797 	 * Get the list of soft partitions that are already on the
2798 	 * device.
2799 	 */
2800 	if (meta_sp_get_by_component(mdsetnamep, device_mdnamep,
2801 	    &sp_name_listp, FORCE_RELOAD_CACHE, ep) < 1) {
2802 		if (getenv(META_SP_DEBUG)) {
2803 			mde_perror(ep,
2804 			    "meta_sp_get_extent_list:meta_sp_get_by_component");
2805 		}
2806 		goto fail;
2807 	}
2808 
2809 	if (sp_name_listp != NULL) {
2810 		/*
2811 		 * If there are soft partitions on the device, add the
2812 		 * extents used in them to the extent list.
2813 		 */
2814 		if (meta_sp_extlist_from_namelist(mdsetnamep, sp_name_listp,
2815 		    extent_listpp, ep) == -1) {
2816 			if (getenv(META_SP_DEBUG)) {
2817 				mde_perror(ep, "meta_sp_get_extent_list:"
2818 				    "meta_sp_extlist_from_namelist");
2819 			}
2820 			goto fail;
2821 		}
2822 		metafreenamelist(sp_name_listp);
2823 	}
2824 
2825 	/*
2826 	 * Add free extents to the extent list to represent
2827 	 * the remaining regions of free space on the
2828 	 * device.
2829 	 */
2830 	meta_sp_list_freefill(extent_listpp, device_size_in_blocks);
2831 	return (B_TRUE);
2832 
2833 fail:
2834 	if (sp_name_listp != NULL) {
2835 		metafreenamelist(sp_name_listp);
2836 	}
2837 
2838 	if (*extent_listpp != NULL) {
2839 		/*
2840 		 * meta_sp_list_free sets *extent_listpp to NULL.
2841 		 */
2842 		meta_sp_list_free(extent_listpp);
2843 	}
2844 	return (B_FALSE);
2845 }
2846 
2847 /*
2848  * IMPORTANT NOTE: This is a static function that calls other functions
2849  *		   that check its mdsetnamep and mddrivenamep
2850  *		   input parameters, but expects extent_listpp to
2851  *		   be a initialized to a valid address to which
2852  *		   it can write a reference to the extent list that
2853  *		   it creates.
2854  *
2855  * FUNCTION:	meta_sp_get_extent_list_for_drive()
2856  * INPUT:	mdsetnamep - a reference to the mdsetname_t structure
2857  *			     for the set containing the drive for
2858  *			     which the extents are to be listed
2859  *		mddrivenamep   - a reference to the mddrivename_t structure
2860  *				 for the drive for which the extents
2861  *				 are to be listed
2862  * OUTPUT:	*extent_listpp - a reference to the extent list for
2863  *				 the drive; NULL if the function fails
2864  * RETURNS:	boolean_t - B_TRUE if the function call was successful,
2865  *			    B_FALSE if not
2866  * PURPOSE:	gets the extent list for a drive when the entire drive
2867  *		is to be soft partitioned
2868  */
2869 static boolean_t
2870 meta_sp_get_extent_list_for_drive(
2871 	mdsetname_t	*mdsetnamep,
2872 	mddrivename_t	*mddrivenamep,
2873 	sp_ext_node_t	**extent_listpp
2874 )
2875 {
2876 	boolean_t		can_use;
2877 	diskaddr_t		free_space;
2878 	md_error_t		mderror;
2879 	mdvtoc_t		proposed_vtoc;
2880 	int			repartition_options;
2881 	int			return_value;
2882 	md_sp_t			test_sp_struct;
2883 
2884 	can_use = B_TRUE;
2885 	*extent_listpp = NULL;
2886 	mderror = mdnullerror;
2887 	test_sp_struct.compnamep = metaslicename(mddrivenamep, MD_SLICE0,
2888 					&mderror);
2889 	if (test_sp_struct.compnamep == NULL) {
2890 		can_use = B_FALSE;
2891 	}
2892 
2893 	if (can_use == B_TRUE) {
2894 		mderror = mdnullerror;
2895 		repartition_options = 0;
2896 		return_value = meta_check_sp(mdsetnamep, &test_sp_struct,
2897 				MDCMD_USE_WHOLE_DISK, &repartition_options,
2898 				&mderror);
2899 		if (return_value != 0) {
2900 			can_use = B_FALSE;
2901 		}
2902 	}
2903 
2904 	if (can_use == B_TRUE) {
2905 		mderror = mdnullerror;
2906 		repartition_options = repartition_options |
2907 			(MD_REPART_FORCE | MD_REPART_DONT_LABEL);
2908 		return_value = meta_repartition_drive(mdsetnamep, mddrivenamep,
2909 				repartition_options, &proposed_vtoc, &mderror);
2910 		if (return_value != 0) {
2911 			can_use = B_FALSE;
2912 		}
2913 	}
2914 
2915 	if (can_use == B_TRUE) {
2916 		free_space = proposed_vtoc.parts[MD_SLICE0].size;
2917 		if (free_space <= (MD_SP_START + MD_SP_WMSIZE)) {
2918 			can_use = B_FALSE;
2919 		}
2920 	}
2921 
2922 	if (can_use == B_TRUE) {
2923 		/*
2924 		 * Create an extent list that starts with
2925 		 * a reserved extent that ends at the start
2926 		 * of the usable space on slice zero of the
2927 		 * proposed VTOC, ends with an extent that
2928 		 * reserves space for a watermark at the end
2929 		 * of slice zero, and contains a single free
2930 		 * extent that occupies the rest of the space
2931 		 * on the slice.
2932 		 *
2933 		 * NOTE:
2934 		 *
2935 		 * Don't use metagetstart() or metagetsize() to
2936 		 * find the usable space.  They query the mdname_t
2937 		 * structure that represents an actual device to
2938 		 * determine the amount of space on the device that
2939 		 * contains metadata and the total amount of space
2940 		 * on the device.  Since this function creates a
2941 		 * proposed extent list that doesn't reflect the
2942 		 * state of an actual device, there's no mdname_t
2943 		 * structure to be queried.
2944 		 *
2945 		 * When a drive is reformatted to prepare for
2946 		 * soft partitioning, all of slice seven is
2947 		 * reserved for metadata, all of slice zero is
2948 		 * available for soft partitioning, and all other
2949 		 * slices on the drive are empty.  The proposed
2950 		 * extent list for the drive therefore contains
2951 		 * only three extents: a reserved extent that ends
2952 		 * at the start of the usable space on slice zero,
2953 		 * a single free extent that occupies all the usable
2954 		 * space on slice zero, and an ending extent that
2955 		 * reserves space for a watermark at the end of
2956 		 * slice zero.
2957 		 */
2958 		meta_sp_list_insert(TEST_SETNAMEP,
2959 			TEST_SOFT_PARTITION_NAMEP,
2960 			extent_listpp,
2961 			NO_OFFSET,
2962 			(sp_ext_length_t)(MD_SP_START),
2963 			EXTTYP_RESERVED,
2964 			NO_SEQUENCE_NUMBER,
2965 			NO_FLAGS,
2966 			meta_sp_cmp_by_offset);
2967 		meta_sp_list_insert(TEST_SETNAMEP,
2968 			TEST_SOFT_PARTITION_NAMEP,
2969 			extent_listpp,
2970 			(sp_ext_offset_t)(free_space - MD_SP_WMSIZE),
2971 			MD_SP_WMSIZE,
2972 			EXTTYP_END,
2973 			NO_SEQUENCE_NUMBER,
2974 			NO_FLAGS,
2975 			meta_sp_cmp_by_offset);
2976 		meta_sp_list_freefill(extent_listpp, free_space);
2977 	}
2978 	return (can_use);
2979 }
2980 
2981 /*
2982  * FUNCTION:	meta_sp_can_create_sps()
2983  * INPUT:	mdsetnamep - a reference to the mdsetname_t structure
2984  *			     for the set containing the device for
2985  *			     which the extents are to be listed
2986  *		mdnamep - a reference to the mdname_t of the device
2987  *			  on which the soft parititions are to be created
2988  *		number_of_sps - the desired number of soft partitions
2989  *		sp_size - the desired soft partition size
2990  * OUTPUT:	boolean_t return value
2991  * RETURNS:	boolean_t - B_TRUE if the soft partitionns can be created,
2992  *			    B_FALSE if not
2993  * PURPOSE:	determines whether a set of soft partitions can be created
2994  *		on a device
2995  */
2996 boolean_t
2997 meta_sp_can_create_sps(
2998 	mdsetname_t	*mdsetnamep,
2999 	mdname_t	*mdnamep,
3000 	int		number_of_sps,
3001 	blkcnt_t	sp_size
3002 )
3003 {
3004 	sp_ext_node_t	*extent_listp;
3005 	boolean_t	succeeded;
3006 	md_error_t	mde;
3007 
3008 	if ((number_of_sps > 0) && (sp_size > 0)) {
3009 		succeeded = meta_sp_get_extent_list(mdsetnamep, mdnamep,
3010 						    &extent_listp, &mde);
3011 	} else {
3012 		succeeded = B_FALSE;
3013 	}
3014 
3015 	/*
3016 	 * We don't really care about an error return from the
3017 	 * alignment call; that will just result in passing zero,
3018 	 * which will be interpreted as no alignment.
3019 	 */
3020 
3021 	if (succeeded == B_TRUE) {
3022 		succeeded = meta_sp_enough_space(number_of_sps,
3023 		    sp_size, &extent_listp,
3024 		    meta_sp_get_default_alignment(mdsetnamep, mdnamep, &mde));
3025 		meta_sp_list_free(&extent_listp);
3026 	}
3027 	return (succeeded);
3028 }
3029 
3030 /*
3031  * FUNCTION:	meta_sp_can_create_sps_on_drive()
3032  * INPUT:	mdsetnamep - a reference to the mdsetname_t structure
3033  *			     for the set containing the drive for
3034  *			     which the extents are to be listed
3035  *		mddrivenamep - a reference to the mddrivename_t of the drive
3036  *			       on which the soft parititions are to be created
3037  *		number_of_sps - the desired number of soft partitions
3038  *		sp_size - the desired soft partition size
3039  * OUTPUT:	boolean_t return value
3040  * RETURNS:	boolean_t - B_TRUE if the soft partitionns can be created,
3041  *			    B_FALSE if not
3042  * PURPOSE:	determines whether a set of soft partitions can be created
3043  *		on a drive if the entire drive is soft partitioned
3044  */
3045 boolean_t
3046 meta_sp_can_create_sps_on_drive(
3047 	mdsetname_t	*mdsetnamep,
3048 	mddrivename_t	*mddrivenamep,
3049 	int		number_of_sps,
3050 	blkcnt_t	sp_size
3051 )
3052 {
3053 	sp_ext_node_t	*extent_listp;
3054 	boolean_t	succeeded;
3055 
3056 	if ((number_of_sps > 0) && (sp_size > 0)) {
3057 		succeeded = meta_sp_get_extent_list_for_drive(mdsetnamep,
3058 							mddrivenamep,
3059 							&extent_listp);
3060 	} else {
3061 		succeeded = B_FALSE;
3062 	}
3063 
3064 	/*
3065 	 * We don't care about alignment on the space call because
3066 	 * we're specifically dealing with a drive, which will have no
3067 	 * inherent alignment.
3068 	 */
3069 
3070 	if (succeeded == B_TRUE) {
3071 		succeeded = meta_sp_enough_space(number_of_sps, sp_size,
3072 		    &extent_listp, SP_UNALIGNED);
3073 		meta_sp_list_free(&extent_listp);
3074 	}
3075 	return (succeeded);
3076 }
3077 
3078 /*
3079  * FUNCTION:	meta_sp_get_free_space()
3080  * INPUT:	mdsetnamep - a reference to the mdsetname_t structure
3081  *			     for the set containing the device for
3082  *			     which the free space is to be returned
3083  *		mdnamep - a reference to the mdname_t of the device
3084  *			  for which the free space is to be returned
3085  * OUTPUT:	blkcnt_t return value
3086  * RETURNS:	blkcnt_t - the number of blocks of free space on the device
3087  * PURPOSE:	returns the number of blocks of free space on a device
3088  */
3089 blkcnt_t
3090 meta_sp_get_free_space(
3091 	mdsetname_t	*mdsetnamep,
3092 	mdname_t	*mdnamep
3093 )
3094 {
3095 	sp_ext_node_t		*extent_listp;
3096 	sp_ext_length_t		free_blocks;
3097 	boolean_t		succeeded;
3098 	md_error_t		mde;
3099 
3100 	extent_listp = NULL;
3101 	free_blocks = 0;
3102 	succeeded = meta_sp_get_extent_list(mdsetnamep, mdnamep,
3103 					    &extent_listp, &mde);
3104 	if (succeeded == B_TRUE) {
3105 		free_blocks = meta_sp_list_size(extent_listp,
3106 		    EXTTYP_FREE, INCLUDE_WM);
3107 		meta_sp_list_free(&extent_listp);
3108 		if (free_blocks > (10 * MD_SP_WMSIZE)) {
3109 			/*
3110 			 * Subtract a safety margin for watermarks when
3111 			 * computing the number of blocks available for
3112 			 * use.  The actual number of watermarks can't
3113 			 * be calculated without knowing the exact numbers
3114 			 * and sizes of both the free extents and the soft
3115 			 * partitions to be created.  The calculation is
3116 			 * highly complex and error-prone even if those
3117 			 * quantities are known.  The approximate value
3118 			 * 10 * MD_SP_WMSIZE is within a few blocks of the
3119 			 * correct value in all practical cases.
3120 			 */
3121 			free_blocks = free_blocks - (10 * MD_SP_WMSIZE);
3122 		} else {
3123 			free_blocks = 0;
3124 		}
3125 	} else {
3126 	    mdclrerror(&mde);
3127 	}
3128 
3129 	return (free_blocks);
3130 }
3131 
3132 /*
3133  * FUNCTION:	meta_sp_get_free_space_on_drive()
3134  * INPUT:	mdsetnamep - a reference to the mdsetname_t structure
3135  *			     for the set containing the drive for
3136  *			     which the free space is to be returned
3137  *		mddrivenamep - a reference to the mddrivename_t of the drive
3138  *			       for which the free space is to be returned
3139  * OUTPUT:	blkcnt_t return value
3140  * RETURNS:	blkcnt_t - the number of blocks of free space on the drive
3141  * PURPOSE:	returns the number of blocks of space usable for soft
3142  *		partitions on an entire drive, if the entire drive is
3143  *		soft partitioned
3144  */
3145 blkcnt_t
3146 meta_sp_get_free_space_on_drive(
3147 	mdsetname_t	*mdsetnamep,
3148 	mddrivename_t	*mddrivenamep
3149 )
3150 {
3151 	sp_ext_node_t		*extent_listp;
3152 	sp_ext_length_t		free_blocks;
3153 	boolean_t		succeeded;
3154 
3155 	extent_listp = NULL;
3156 	free_blocks = 0;
3157 	succeeded = meta_sp_get_extent_list_for_drive(mdsetnamep,
3158 			mddrivenamep, &extent_listp);
3159 	if (succeeded == B_TRUE) {
3160 		free_blocks = meta_sp_list_size(extent_listp,
3161 		    EXTTYP_FREE, INCLUDE_WM);
3162 		meta_sp_list_free(&extent_listp);
3163 		if (free_blocks > (10 * MD_SP_WMSIZE)) {
3164 			/*
3165 			 * Subtract a safety margin for watermarks when
3166 			 * computing the number of blocks available for
3167 			 * use.  The actual number of watermarks can't
3168 			 * be calculated without knowing the exact numbers
3169 			 * and sizes of both the free extents and the soft
3170 			 * partitions to be created.  The calculation is
3171 			 * highly complex and error-prone even if those
3172 			 * quantities are known.  The approximate value
3173 			 * 10 * MD_SP_WMSIZE is within a few blocks of the
3174 			 * correct value in all practical cases.
3175 			 */
3176 			free_blocks = free_blocks - (10 * MD_SP_WMSIZE);
3177 		} else {
3178 			free_blocks = 0;
3179 		}
3180 	}
3181 	return (free_blocks);
3182 }
3183 
3184 /*
3185  * FUNCTION:	meta_sp_get_number_of_possible_sps()
3186  * INPUT:	mdsetnamep - a reference to the mdsetname_t structure
3187  *			     for the set containing the device for
3188  *			     which the number of possible soft partitions
3189  *			     is to be returned
3190  *		mdnamep - a reference to the mdname_t of the device
3191  *			  for which the number of possible soft partitions
3192  *			  is to be returned
3193  * OUTPUT:	int return value
3194  * RETURNS:	int - the number of soft partitions of the desired size
3195  *		      that can be created on the device
3196  * PURPOSE:	returns the number of soft partitions of a given size
3197  *		that can be created on a device
3198  */
3199 int
3200 meta_sp_get_number_of_possible_sps(
3201 	mdsetname_t	*mdsetnamep,
3202 	mdname_t	*mdnamep,
3203 	blkcnt_t	sp_size
3204 )
3205 {
3206 	sp_ext_node_t	*extent_listp;
3207 	int		number_of_possible_sps;
3208 	boolean_t	succeeded;
3209 	md_error_t	mde;
3210 	sp_ext_length_t	alignment;
3211 
3212 	extent_listp = NULL;
3213 	number_of_possible_sps = 0;
3214 	if (sp_size > 0) {
3215 	    if ((succeeded = meta_sp_get_extent_list(mdsetnamep,
3216 		mdnamep, &extent_listp, &mde)) == B_FALSE)
3217 		mdclrerror(&mde);
3218 	} else {
3219 		succeeded = B_FALSE;
3220 	}
3221 
3222 	if (succeeded == B_TRUE) {
3223 		alignment = meta_sp_get_default_alignment(mdsetnamep,
3224 		    mdnamep, &mde);
3225 	}
3226 
3227 	while (succeeded == B_TRUE) {
3228 		/*
3229 		 * Keep allocating space from the extent list
3230 		 * for soft partitions of the desired size until
3231 		 * there's not enough free space left in the list
3232 		 * for another soft partiition of that size.
3233 		 * Add one to the number of possible soft partitions
3234 		 * for each soft partition for which there is
3235 		 * enough free space left.
3236 		 */
3237 		succeeded = meta_sp_enough_space(ONE_SOFT_PARTITION,
3238 		    sp_size, &extent_listp, alignment);
3239 		if (succeeded == B_TRUE) {
3240 			number_of_possible_sps++;
3241 		}
3242 	}
3243 	if (extent_listp != NULL) {
3244 		meta_sp_list_free(&extent_listp);
3245 	}
3246 	return (number_of_possible_sps);
3247 }
3248 
3249 /*
3250  * FUNCTION:	meta_sp_get_number_of_possible_sps_on_drive()
3251  * INPUT:	mdsetnamep - a reference to the mdsetname_t structure
3252  *			     for the set containing the drive for
3253  *			     which the number of possible soft partitions
3254  *			     is to be returned
3255  *		mddrivenamep - a reference to the mddrivename_t of the drive
3256  *			       for which the number of possible soft partitions
3257  *			       is to be returned
3258  *		sp_size - the size in blocks of the proposed soft partitions
3259  * OUTPUT:	int return value
3260  * RETURNS:	int - the number of soft partitions of the desired size
3261  *		      that can be created on the drive
3262  * PURPOSE:	returns the number of soft partitions of a given size
3263  *		that can be created on a drive, if the entire drive is
3264  *		soft partitioned
3265  */
3266 int
3267 meta_sp_get_number_of_possible_sps_on_drive(
3268 	mdsetname_t	*mdsetnamep,
3269 	mddrivename_t	*mddrivenamep,
3270 	blkcnt_t	sp_size
3271 )
3272 {
3273 	sp_ext_node_t	*extent_listp;
3274 	int		number_of_possible_sps;
3275 	boolean_t	succeeded;
3276 
3277 	extent_listp = NULL;
3278 	number_of_possible_sps = 0;
3279 	if (sp_size > 0) {
3280 		succeeded = meta_sp_get_extent_list_for_drive(mdsetnamep,
3281 					mddrivenamep, &extent_listp);
3282 	} else {
3283 		succeeded = B_FALSE;
3284 	}
3285 	while (succeeded == B_TRUE) {
3286 		/*
3287 		 * Keep allocating space from the extent list
3288 		 * for soft partitions of the desired size until
3289 		 * there's not enough free space left in the list
3290 		 * for another soft partition of that size.
3291 		 * Add one to the number of possible soft partitions
3292 		 * for each soft partition for which there is
3293 		 * enough free space left.
3294 		 *
3295 		 * Since it's a drive, not a metadevice, make no
3296 		 * assumptions about alignment.
3297 		 */
3298 		succeeded = meta_sp_enough_space(ONE_SOFT_PARTITION,
3299 		    sp_size, &extent_listp, SP_UNALIGNED);
3300 		if (succeeded == B_TRUE) {
3301 			number_of_possible_sps++;
3302 		}
3303 	}
3304 	if (extent_listp != NULL) {
3305 		meta_sp_list_free(&extent_listp);
3306 	}
3307 	return (number_of_possible_sps);
3308 }
3309 
3310 /*
3311  * FUNCTION:	meta_sp_get_possible_sp_size()
3312  * INPUT:	mdsetnamep - a reference to the mdsetname_t structure
3313  *			     for the set containing the device for
3314  *			     which the possible soft partition size
3315  *			     is to be returned
3316  *		mdnamep - a reference to the mdname_t of the device
3317  *			  for which the possible soft partition size
3318  *			  is to be returned
3319  *		number_of_sps - the desired number of soft partitions
3320  * OUTPUT:	blkcnt_t return value
3321  * RETURNS:	blkcnt_t - the possible soft partition size in blocks
3322  * PURPOSE:	returns the maximum possible size of each of a given number of
3323  *		soft partitions of equal size that can be created on a device
3324  */
3325 blkcnt_t
3326 meta_sp_get_possible_sp_size(
3327 	mdsetname_t	*mdsetnamep,
3328 	mdname_t	*mdnamep,
3329 	int		number_of_sps
3330 )
3331 {
3332 	blkcnt_t	free_blocks;
3333 	blkcnt_t	sp_size;
3334 	boolean_t	succeeded;
3335 
3336 	sp_size = 0;
3337 	if (number_of_sps > 0) {
3338 		free_blocks = meta_sp_get_free_space(mdsetnamep, mdnamep);
3339 		sp_size = free_blocks / number_of_sps;
3340 		succeeded = meta_sp_can_create_sps(mdsetnamep, mdnamep,
3341 						number_of_sps, sp_size);
3342 		while ((succeeded == B_FALSE) && (sp_size > 0)) {
3343 			/*
3344 			 * To compensate for space that may have been
3345 			 * occupied by watermarks, reduce sp_size by a
3346 			 * number of blocks equal to the number of soft
3347 			 * partitions desired, and test again to see
3348 			 * whether the desired number of soft partitions
3349 			 * can be created.
3350 			 */
3351 			sp_size = sp_size - ((blkcnt_t)number_of_sps);
3352 			succeeded = meta_sp_can_create_sps(mdsetnamep, mdnamep,
3353 							number_of_sps, sp_size);
3354 		}
3355 		if (sp_size < 0) {
3356 			sp_size = 0;
3357 		}
3358 	}
3359 	return (sp_size);
3360 }
3361 
3362 /*
3363  * FUNCTION:	meta_sp_get_possible_sp_size_on_drive()
3364  * INPUT:	mdsetnamep - a reference to the mdsetname_t structure
3365  *			     for the set containing the drive for
3366  *			     which the possible soft partition size
3367  *			     is to be returned
3368  *		mddrivenamep - a reference to the mddrivename_t of the drive
3369  *			       for which the possible soft partition size
3370  *			       is to be returned
3371  *		number_of_sps - the desired number of soft partitions
3372  * OUTPUT:	blkcnt_t return value
3373  * RETURNS:	blkcnt_t - the possible soft partition size in blocks
3374  * PURPOSE:	returns the maximum possible size of each of a given number of
3375  *		soft partitions of equal size that can be created on a drive
3376  *              if the entire drive is soft partitioned
3377  */
3378 blkcnt_t
3379 meta_sp_get_possible_sp_size_on_drive(
3380 	mdsetname_t	*mdsetnamep,
3381 	mddrivename_t	*mddrivenamep,
3382 	int		number_of_sps
3383 )
3384 {
3385 	blkcnt_t	free_blocks;
3386 	blkcnt_t	sp_size;
3387 	boolean_t	succeeded;
3388 
3389 	sp_size = 0;
3390 	if (number_of_sps > 0) {
3391 		free_blocks = meta_sp_get_free_space_on_drive(mdsetnamep,
3392 								mddrivenamep);
3393 		sp_size = free_blocks / number_of_sps;
3394 		succeeded = meta_sp_can_create_sps_on_drive(mdsetnamep,
3395 						mddrivenamep,
3396 						number_of_sps, sp_size);
3397 		while ((succeeded == B_FALSE) && (sp_size > 0)) {
3398 			/*
3399 			 * To compensate for space that may have been
3400 			 * occupied by watermarks, reduce sp_size by a
3401 			 * number of blocks equal to the number of soft
3402 			 * partitions desired, and test again to see
3403 			 * whether the desired number of soft partitions
3404 			 * can be created.
3405 			 */
3406 			sp_size = sp_size - ((blkcnt_t)number_of_sps);
3407 			succeeded = meta_sp_can_create_sps_on_drive(mdsetnamep,
3408 							mddrivenamep,
3409 							number_of_sps, sp_size);
3410 		}
3411 		if (sp_size < 0) {
3412 			sp_size = 0;
3413 		}
3414 	}
3415 	return (sp_size);
3416 }
3417 
3418 /*
3419  * **************************************************************************
3420  *                  Unit Structure Manipulation Functions                   *
3421  * **************************************************************************
3422  */
3423 
3424 /*
3425  * FUNCTION:	meta_sp_fillextarray()
3426  * INPUT:	mp	- the unit structure to fill
3427  *		extlist	- the list of extents to fill with
3428  * OUTPUT:	none
3429  * RETURNS:	void
3430  * PURPOSE:	fills in the unit structure extent list with the extents
3431  *		specified by extlist.  Only extents in extlist with the
3432  *		EXTFLG_UPDATE flag are changed in the unit structure,
3433  *		and the index into the unit structure is the sequence
3434  *		number in the extent list.  After all of the nodes have
3435  *		been updated the virtual offsets in the unit structure
3436  *		are updated to reflect the new lengths.
3437  */
3438 static void
3439 meta_sp_fillextarray(
3440 	mp_unit_t	*mp,
3441 	sp_ext_node_t	*extlist
3442 )
3443 {
3444 	int	i;
3445 	sp_ext_node_t	*ext;
3446 	sp_ext_offset_t	curvoff = 0LL;
3447 
3448 	assert(mp != NULL);
3449 
3450 	/* go through the allocation list and fill in our unit structure */
3451 	for (ext = extlist; ext != NULL; ext = ext->ext_next) {
3452 		if ((ext->ext_type == EXTTYP_ALLOC) &&
3453 		    (ext->ext_flags & EXTFLG_UPDATE) != 0) {
3454 			mp->un_ext[ext->ext_seq].un_poff =
3455 			    ext->ext_offset + MD_SP_WMSIZE;
3456 			mp->un_ext[ext->ext_seq].un_len =
3457 			    ext->ext_length - MD_SP_WMSIZE;
3458 		}
3459 	}
3460 
3461 	for (i = 0; i < mp->un_numexts; i++) {
3462 		assert(mp->un_ext[i].un_poff != 0);
3463 		assert(mp->un_ext[i].un_len  != 0);
3464 		mp->un_ext[i].un_voff = curvoff;
3465 		curvoff += mp->un_ext[i].un_len;
3466 	}
3467 }
3468 
3469 /*
3470  * FUNCTION:	meta_sp_createunit()
3471  * INPUT:	np	- the name of the device to create a unit structure for
3472  *		compnp	- the name of the device the soft partition is on
3473  *		extlist	- the extent list to populate the new unit with
3474  *		numexts	- the number of extents in the extent list
3475  *		len	- the total size of the soft partition (sectors)
3476  *		status	- the initial status of the unit structure
3477  * OUTPUT:	ep	- return error pointer
3478  * RETURNS:	mp_unit_t * - the new unit structure.
3479  * PURPOSE:	allocates and fills in a new soft partition unit
3480  *		structure to be passed to the soft partitioning driver
3481  *		for creation.
3482  */
3483 static mp_unit_t *
3484 meta_sp_createunit(
3485 	mdname_t	*np,
3486 	mdname_t	*compnp,
3487 	sp_ext_node_t	*extlist,
3488 	int		numexts,
3489 	sp_ext_length_t	len,
3490 	sp_status_t	status,
3491 	md_error_t	*ep
3492 )
3493 {
3494 	mp_unit_t	*mp;
3495 	uint_t		ms_size;
3496 
3497 	ms_size = (sizeof (*mp) - sizeof (mp->un_ext[0])) +
3498 	    (numexts * sizeof (mp->un_ext[0]));
3499 
3500 	mp = Zalloc(ms_size);
3501 
3502 	/* fill in fields in common unit structure */
3503 	mp->c.un_type = MD_METASP;
3504 	mp->c.un_size = ms_size;
3505 	MD_SID(mp) = meta_getminor(np->dev);
3506 	mp->c.un_total_blocks = len;
3507 	mp->c.un_actual_tb = len;
3508 
3509 	/* set up geometry */
3510 	(void) meta_sp_setgeom(np, compnp, mp, ep);
3511 
3512 	/* if we're building on metadevice we can't parent */
3513 	if (metaismeta(compnp))
3514 		MD_CAPAB(mp) = MD_CANT_PARENT;
3515 	else
3516 		MD_CAPAB(mp) = MD_CAN_PARENT;
3517 
3518 	/* fill soft partition-specific fields */
3519 	mp->un_dev = compnp->dev;
3520 	mp->un_key = compnp->key;
3521 
3522 	/* mdname_t start_blk field is not 64-bit! */
3523 	mp->un_start_blk = (sp_ext_offset_t)compnp->start_blk;
3524 	mp->un_status = status;
3525 	mp->un_numexts = numexts;
3526 	mp->un_length = len;
3527 
3528 	/* fill in the extent array */
3529 	meta_sp_fillextarray(mp, extlist);
3530 
3531 	return (mp);
3532 }
3533 
3534 /*
3535  * FUNCTION:	meta_sp_updateunit()
3536  * INPUT:	np       - name structure for the metadevice being updated
3537  *		old_un	 - the original unit structure that is being updated
3538  *		extlist	 - the extent list to populate the new unit with
3539  *		grow_len - the amount by which the partition is being grown
3540  *		numexts	 - the number of extents in the extent list
3541  *		ep       - return error pointer
3542  * OUTPUT:	none
3543  * RETURNS:	mp_unit_t * - the updated unit structure
3544  * PURPOSE:	allocates and fills in a new soft partition unit structure to
3545  *		be passed to the soft partitioning driver for creation.  The
3546  *		old unit structure is first copied in, and then the updated
3547  *		extents are changed in the new unit structure.  This is
3548  *		typically used when the size of an existing unit is changed.
3549  */
3550 static mp_unit_t *
3551 meta_sp_updateunit(
3552 	mdname_t	*np,
3553 	mp_unit_t	*old_un,
3554 	sp_ext_node_t	*extlist,
3555 	sp_ext_length_t	grow_len,
3556 	int		numexts,
3557 	md_error_t	*ep
3558 )
3559 {
3560 	mp_unit_t	*new_un;
3561 	sp_ext_length_t	new_len;
3562 	uint_t		new_size;
3563 
3564 	assert(old_un != NULL);
3565 	assert(extlist != NULL);
3566 
3567 	/* allocate new unit structure and copy in old unit */
3568 	new_size = (sizeof (*old_un) - sizeof (old_un->un_ext[0])) +
3569 	    ((old_un->un_numexts + numexts) * sizeof (old_un->un_ext[0]));
3570 	new_len = old_un->un_length + grow_len;
3571 	new_un = Zalloc(new_size);
3572 	bcopy(old_un, new_un, old_un->c.un_size);
3573 
3574 	/* update size and geometry information */
3575 	new_un->c.un_size = new_size;
3576 	new_un->un_length = new_len;
3577 	new_un->c.un_total_blocks = new_len;
3578 	new_un->c.un_actual_tb = new_len;
3579 	if (meta_adjust_geom((md_unit_t *)new_un, np,
3580 	    old_un->c.un_wr_reinstruct, old_un->c.un_rd_reinstruct,
3581 	    0, ep) != 0) {
3582 		Free(new_un);
3583 		return (NULL);
3584 	}
3585 
3586 	/* update extent information */
3587 	new_un->un_numexts += numexts;
3588 
3589 	meta_sp_fillextarray(new_un, extlist);
3590 
3591 	return (new_un);
3592 }
3593 
3594 /*
3595  * FUNCTION:	meta_get_sp()
3596  * INPUT:	sp	- the set name for the device to get
3597  *		np	- the name of the device to get
3598  * OUTPUT:	ep	- return error pointer
3599  * RETURNS:	md_sp_t * - the XDR unit structure for the soft partition
3600  * PURPOSE:	interface to the rest of libmeta for fetching a unit structure
3601  *		for the named device.  Just a wrapper for meta_get_sp_common().
3602  */
3603 md_sp_t *
3604 meta_get_sp(
3605 	mdsetname_t	*sp,
3606 	mdname_t	*np,
3607 	md_error_t	*ep
3608 )
3609 {
3610 	return (meta_get_sp_common(sp, np, 0, ep));
3611 }
3612 
3613 /*
3614  * FUNCTION:	meta_get_sp_common()
3615  * INPUT:	sp	- the set name for the device to get
3616  *		np	- the name of the device to get
3617  *		fast	- whether to use the cache or not (NOT IMPLEMENTED!)
3618  * OUTPUT:	ep	- return error pointer
3619  * RETURNS:	md_sp_t * - the XDR unit structure for the soft partition,
3620  *			    NULL if np is not a soft partition
3621  * PURPOSE:	common routine for fetching a soft partition unit structure
3622  */
3623 md_sp_t *
3624 meta_get_sp_common(
3625 	mdsetname_t	*sp,
3626 	mdname_t	*np,
3627 	int		fast,
3628 	md_error_t	*ep
3629 )
3630 {
3631 	mddrivename_t	*dnp = np->drivenamep;
3632 	char		*miscname;
3633 	mp_unit_t	*mp;
3634 	md_sp_t		*msp;
3635 	int		i;
3636 
3637 	/* must have set */
3638 	assert(sp != NULL);
3639 
3640 	/* short circuit */
3641 	if (dnp->unitp != NULL) {
3642 		if (dnp->unitp->type != MD_METASP)
3643 			return (NULL);
3644 		return ((md_sp_t *)dnp->unitp);
3645 	}
3646 	/* get miscname and unit */
3647 	if ((miscname = metagetmiscname(np, ep)) == NULL)
3648 		return (NULL);
3649 
3650 	if (strcmp(miscname, MD_SP) != 0) {
3651 		(void) mdmderror(ep, MDE_NOT_SP, 0, np->cname);
3652 		return (NULL);
3653 	}
3654 
3655 	if ((mp = (mp_unit_t *)meta_get_mdunit(sp, np, ep)) == NULL)
3656 		return (NULL);
3657 
3658 	assert(mp->c.un_type == MD_METASP);
3659 
3660 	/* allocate soft partition */
3661 	msp = Zalloc(sizeof (*msp));
3662 
3663 	/* get the common information */
3664 	msp->common.namep = np;
3665 	msp->common.type = mp->c.un_type;
3666 	msp->common.state = mp->c.un_status;
3667 	msp->common.capabilities = mp->c.un_capabilities;
3668 	msp->common.parent = mp->c.un_parent;
3669 	msp->common.size = mp->c.un_total_blocks;
3670 	msp->common.user_flags = mp->c.un_user_flags;
3671 	msp->common.revision = mp->c.un_revision;
3672 
3673 	/* get soft partition information */
3674 	if ((msp->compnamep = metakeyname(&sp, mp->un_key, fast, ep)) == NULL)
3675 		goto out;
3676 
3677 	/*
3678 	 * Fill in the key and the start block.  Note that the start
3679 	 * block in the unit structure is 64 bits but the name pointer
3680 	 * only supports 32 bits.
3681 	 */
3682 	msp->compnamep->key = mp->un_key;
3683 	msp->compnamep->start_blk = mp->un_start_blk;
3684 
3685 	/* fill in status field */
3686 	msp->status = mp->un_status;
3687 
3688 	/* allocate the extents */
3689 	msp->ext.ext_val = Zalloc(mp->un_numexts * sizeof (*msp->ext.ext_val));
3690 	msp->ext.ext_len = mp->un_numexts;
3691 
3692 	/* do the extents for this soft partition */
3693 	for (i = 0; i < mp->un_numexts; i++) {
3694 		struct mp_ext	*mde = &mp->un_ext[i];
3695 		md_sp_ext_t	*extp = &msp->ext.ext_val[i];
3696 
3697 		extp->voff = mde->un_voff;
3698 		extp->poff = mde->un_poff;
3699 		extp->len = mde->un_len;
3700 	}
3701 
3702 	/* cleanup, return success */
3703 	Free(mp);
3704 	dnp->unitp = (md_common_t *)msp;
3705 	return (msp);
3706 
3707 out:
3708 	/* clean up and return error */
3709 	Free(mp);
3710 	Free(msp);
3711 	return (NULL);
3712 }
3713 
3714 
3715 /*
3716  * FUNCTION:	meta_init_sp()
3717  * INPUT:	spp	- the set name for the new device
3718  *		argc	- the remaining argument count for the metainit cmdline
3719  *		argv	- the remainder of the unparsed command line
3720  *		options	- global options parsed by metainit
3721  * OUTPUT:	ep	- return error pointer
3722  * RETURNS:	int	- -1 failure, 0 success
3723  * PURPOSE:	provides the command line parsing and name management overhead
3724  *		for creating a new soft partition.  Ultimately this calls
3725  *		meta_create_sp() which does the real work of allocating space
3726  *		for the new soft partition.
3727  */
3728 int
3729 meta_init_sp(
3730 	mdsetname_t	**spp,
3731 	int		argc,
3732 	char		*argv[],
3733 	mdcmdopts_t	options,
3734 	md_error_t	*ep
3735 )
3736 {
3737 	char		*compname = NULL;
3738 	mdname_t	*spcompnp = NULL;	/* name of component volume */
3739 	char		*devname = argv[0];	/* unit name */
3740 	mdname_t	*np = NULL;		/* name of soft partition */
3741 	md_sp_t		*msp = NULL;
3742 	int		c;
3743 	int		old_optind;
3744 	sp_ext_length_t	len = 0LL;
3745 	int		rval = -1;
3746 	uint_t		seq;
3747 	int		oflag;
3748 	int		failed;
3749 	mddrivename_t	*dnp = NULL;
3750 	sp_ext_length_t	alignment = 0LL;
3751 	sp_ext_node_t	*extlist = NULL;
3752 
3753 	assert(argc > 0);
3754 
3755 	/* expect sp name, -p, optional -e, compname, and size parameters */
3756 	/* grab soft partition name */
3757 	if ((np = metaname(spp, devname, META_DEVICE, ep)) == NULL)
3758 		goto out;
3759 
3760 	/* see if it exists already */
3761 	if (metagetmiscname(np, ep) != NULL) {
3762 		(void) mdmderror(ep, MDE_UNIT_ALREADY_SETUP,
3763 		    meta_getminor(np->dev), devname);
3764 		goto out;
3765 	} else if (! mdismderror(ep, MDE_UNIT_NOT_SETUP)) {
3766 		goto out;
3767 	} else {
3768 		mdclrerror(ep);
3769 	}
3770 	--argc, ++argv;
3771 
3772 	if (argc == 0)
3773 		goto syntax;
3774 
3775 	/* grab -p */
3776 	if (strcmp(argv[0], "-p") != 0)
3777 		goto syntax;
3778 	--argc, ++argv;
3779 
3780 	if (argc == 0)
3781 		goto syntax;
3782 
3783 	/* see if -e is there */
3784 	if (strcmp(argv[0], "-e") == 0) {
3785 		/* use the whole disk */
3786 		options |= MDCMD_USE_WHOLE_DISK;
3787 		--argc, ++argv;
3788 	}
3789 
3790 	if (argc == 0)
3791 		goto syntax;
3792 
3793 	/* get component name */
3794 	compname = Strdup(argv[0]);
3795 
3796 	if (options & MDCMD_USE_WHOLE_DISK) {
3797 		if ((dnp = metadrivename(spp, compname, ep)) == NULL) {
3798 			goto out;
3799 		}
3800 		if ((spcompnp = metaslicename(dnp, 0, ep)) == NULL) {
3801 			goto out;
3802 		}
3803 	} else if ((spcompnp = metaname(spp, compname, UNKNOWN, ep)) == NULL) {
3804 		goto out;
3805 	}
3806 	assert(*spp != NULL);
3807 
3808 	if (!(options & MDCMD_NOLOCK)) {
3809 		/* grab set lock */
3810 		if (meta_lock(*spp, TRUE, ep))
3811 			goto out;
3812 
3813 		if (meta_check_ownership(*spp, ep) != 0)
3814 			goto out;
3815 	}
3816 
3817 	/* allocate the soft partition */
3818 	msp = Zalloc(sizeof (*msp));
3819 
3820 	/* setup common */
3821 	msp->common.namep = np;
3822 	msp->common.type = MD_METASP;
3823 
3824 	compname = spcompnp->cname;
3825 
3826 	assert(spcompnp->rname != NULL);
3827 	--argc, ++argv;
3828 
3829 	if (argc == 0) {
3830 		goto syntax;
3831 	}
3832 
3833 	if (*argv[0] == '-') {
3834 		/*
3835 		 * parse any other command line options, this includes
3836 		 * the recovery options -o and -b. The special thing
3837 		 * with these options is that the len needs to be
3838 		 * kept track of otherwise when the geometry of the
3839 		 * "device" is built it will create an invalid geometry
3840 		 */
3841 		old_optind = optind = 0;
3842 		opterr = 0;
3843 		oflag = 0;
3844 		seq = 0;
3845 		failed = 0;
3846 		while ((c = getopt(argc, argv, "A:o:b:")) != -1) {
3847 			sp_ext_offset_t	offset;
3848 			sp_ext_length_t	length;
3849 			longlong_t	tmp_size;
3850 
3851 			switch (c) {
3852 			case 'A':	/* data alignment */
3853 				if (meta_sp_parsesizestring(optarg,
3854 					&alignment) == -1) {
3855 					failed = 1;
3856 				}
3857 				break;
3858 			case 'o':	/* offset in the partition */
3859 				if (oflag == 1) {
3860 					failed = 1;
3861 				} else {
3862 					tmp_size = atoll(optarg);
3863 					if (tmp_size <= 0) {
3864 						failed = 1;
3865 					} else {
3866 						oflag = 1;
3867 						options |= MDCMD_DIRECT;
3868 
3869 						offset = tmp_size;
3870 					}
3871 				}
3872 
3873 				break;
3874 			case 'b':	/* number of blocks */
3875 				if (oflag == 0) {
3876 					failed = 1;
3877 				} else {
3878 					tmp_size = atoll(optarg);
3879 					if (tmp_size <= 0) {
3880 						failed = 1;
3881 					} else {
3882 						oflag = 0;
3883 
3884 						length = tmp_size;
3885 
3886 						/* we have a pair of values */
3887 						meta_sp_list_insert(*spp, np,
3888 							&extlist, offset,
3889 							length, EXTTYP_ALLOC,
3890 							seq++, EXTFLG_UPDATE,
3891 							meta_sp_cmp_by_offset);
3892 						len += length;
3893 					}
3894 				}
3895 
3896 				break;
3897 			default:
3898 				argc -= old_optind;
3899 				argv += old_optind;
3900 				goto options;
3901 			}
3902 
3903 			if (failed) {
3904 				argc -= old_optind;
3905 				argv += old_optind;
3906 				goto syntax;
3907 			}
3908 
3909 			old_optind = optind;
3910 		}
3911 		argc -= optind;
3912 		argv += optind;
3913 
3914 		/*
3915 		 * Must have matching pairs of -o and -b flags
3916 		 */
3917 		if (oflag != 0)
3918 			goto syntax;
3919 
3920 		/*
3921 		 * Can't specify both layout (indicated indirectly by
3922 		 * len being set by the -o/-b cases above) AND
3923 		 * alignment
3924 		 */
3925 		if ((len > 0LL) && (alignment > 0LL))
3926 			goto syntax;
3927 
3928 		/*
3929 		 * sanity check the allocation list
3930 		 */
3931 		if ((extlist != NULL) && meta_sp_list_overlaps(extlist))
3932 			goto syntax;
3933 	}
3934 
3935 	if (len == 0LL) {
3936 		if (argc == 0)
3937 			goto syntax;
3938 		if (meta_sp_parsesize(argv[0], &len) == -1)
3939 			goto syntax;
3940 		--argc, ++argv;
3941 	}
3942 
3943 	msp->ext.ext_val = Zalloc(sizeof (*msp->ext.ext_val));
3944 	msp->ext.ext_val->len = len;
3945 	msp->compnamep = spcompnp;
3946 
3947 	/* we should be at the end */
3948 	if (argc != 0)
3949 		goto syntax;
3950 
3951 	/* create soft partition */
3952 	if (meta_create_sp(*spp, msp, extlist, options, alignment, ep) != 0)
3953 		goto out;
3954 	rval = 0;
3955 
3956 	/* let em know */
3957 	if (options & MDCMD_PRINT) {
3958 		(void) printf(dgettext(TEXT_DOMAIN,
3959 		    "%s: Soft Partition is setup\n"),
3960 		    devname);
3961 		(void) fflush(stdout);
3962 	}
3963 	goto out;
3964 
3965 syntax:
3966 	/* syntax error */
3967 	rval = meta_cook_syntax(ep, MDE_SYNTAX, compname, argc, argv);
3968 	goto out;
3969 
3970 options:
3971 	/* options error */
3972 	rval = meta_cook_syntax(ep, MDE_OPTION, compname, argc, argv);
3973 	goto out;
3974 
3975 out:
3976 	if (msp != NULL) {
3977 		if (msp->ext.ext_val != NULL) {
3978 			Free(msp->ext.ext_val);
3979 		}
3980 		Free(msp);
3981 	}
3982 
3983 	return (rval);
3984 }
3985 
3986 /*
3987  * FUNCTION:	meta_free_sp()
3988  * INPUT:	msp	- the soft partition unit to free
3989  * OUTPUT:	none
3990  * RETURNS:	void
3991  * PURPOSE:	provides an interface from the rest of libmeta for freeing a
3992  *		soft partition unit
3993  */
3994 void
3995 meta_free_sp(md_sp_t *msp)
3996 {
3997 	Free(msp);
3998 }
3999 
4000 /*
4001  * FUNCTION:	meta_sp_issp()
4002  * INPUT:	sp	- the set name to check
4003  *		np	- the name to check
4004  * OUTPUT:	ep	- return error pointer
4005  * RETURNS:	int	- 0 means sp,np is a soft partition
4006  *			  1 means sp,np is not a soft partition
4007  * PURPOSE:	determines whether the given device is a soft partition
4008  *		device.  This is called by other metadevice check routines.
4009  */
4010 int
4011 meta_sp_issp(
4012 	mdsetname_t	*sp,
4013 	mdname_t	*np,
4014 	md_error_t	*ep
4015 )
4016 {
4017 	if (meta_get_sp_common(sp, np, 0, ep) == NULL)
4018 		return (1);
4019 
4020 	return (0);
4021 }
4022 
4023 /*
4024  * FUNCTION:	meta_check_sp()
4025  * INPUT:	sp	- the set name to check
4026  *		msp	- the unit structure to check
4027  *		options	- creation options
4028  * OUTPUT:	repart_options - options to be passed to
4029  *				meta_repartition_drive()
4030  *		ep	- return error pointer
4031  * RETURNS:	int	-  0 ok to create on this component
4032  *			  -1 error or not ok to create on this component
4033  * PURPOSE:	Checks to determine whether the rules for creation of
4034  *		soft partitions allow creation of a soft partition on
4035  *		the device described by the mdname_t structure referred
4036  *		to by msp->compnamep.
4037  *
4038  *		NOTE: Does NOT check to determine whether the extents
4039  *		      described in the md_sp_t structure referred to by
4040  *		      msp will fit on the device described by the mdname_t
4041  *		      structure located at msp->compnamep.
4042  */
4043 static int
4044 meta_check_sp(
4045 	mdsetname_t	*sp,
4046 	md_sp_t		*msp,
4047 	mdcmdopts_t	options,
4048 	int		*repart_options,
4049 	md_error_t	*ep
4050 )
4051 {
4052 	md_common_t	*mdp;
4053 	mdname_t	*compnp = msp->compnamep;
4054 	uint_t		slice;
4055 	mddrivename_t	*dnp;
4056 	mdname_t	*slicenp;
4057 	mdvtoc_t	*vtocp;
4058 
4059 	/* make sure it is in the set */
4060 	if (meta_check_inset(sp, compnp, ep) != 0)
4061 		return (-1);
4062 
4063 	if ((options & MDCMD_USE_WHOLE_DISK) != 0) {
4064 		uint_t	rep_slice;
4065 
4066 		/*
4067 		 * check to make sure we can partition this drive.
4068 		 * we cannot continue if any of the following are
4069 		 * true:
4070 		 * The drive is a metadevice.
4071 		 * The drive contains a mounted slice.
4072 		 * The drive contains a slice being swapped to.
4073 		 * The drive contains slices which are part of other
4074 		 * metadevices.
4075 		 * The drive contains a metadb.
4076 		 */
4077 		if (metaismeta(compnp))
4078 			return (mddeverror(ep, MDE_IS_META, compnp->dev,
4079 			    compnp->cname));
4080 
4081 		assert(compnp->drivenamep != NULL);
4082 
4083 		/*
4084 		 * ensure that we have slice 0 since the disk will be
4085 		 * repartitioned in the USE_WHOLE_DISK case.  this check
4086 		 * is redundant unless the user incorrectly specifies a
4087 		 * a fully qualified drive AND slice name (i.e.,
4088 		 * /dev/dsk/cXtXdXsX), which will be incorrectly
4089 		 * recognized as a drive name by the metaname code.
4090 		 */
4091 
4092 		if ((vtocp = metagetvtoc(compnp, FALSE, &slice, ep)) == NULL)
4093 			return (-1);
4094 		if (slice != MD_SLICE0)
4095 			return (mderror(ep, MDE_NOT_DRIVENAME, compnp->cname));
4096 
4097 		dnp = compnp->drivenamep;
4098 		if (meta_replicaslice(dnp, &rep_slice, ep) != 0)
4099 			return (-1);
4100 
4101 		for (slice = 0; slice < vtocp->nparts; slice++) {
4102 
4103 			/* only check if the slice really exists */
4104 			if (vtocp->parts[slice].size == 0)
4105 				continue;
4106 
4107 			slicenp = metaslicename(dnp, slice, ep);
4108 			if (slicenp == NULL)
4109 				return (-1);
4110 
4111 			/* check to ensure that it is not already in use */
4112 			if (meta_check_inuse(sp,
4113 			    slicenp, MDCHK_INUSE, ep) != 0) {
4114 				return (-1);
4115 			}
4116 
4117 			/*
4118 			 * Up to this point, tests are applied to all
4119 			 * slices uniformly.
4120 			 */
4121 
4122 			if (slice == rep_slice) {
4123 				/*
4124 				 * Tests inside the body of this
4125 				 * conditional are applied only to
4126 				 * slice seven.
4127 				 */
4128 				if (meta_check_inmeta(sp, slicenp,
4129 				    options | MDCHK_ALLOW_MDDB |
4130 				    MDCHK_ALLOW_REPSLICE, 0, -1, ep) != 0)
4131 					return (-1);
4132 
4133 				/*
4134 				 * For slice seven, a metadb is NOT an
4135 				 * automatic failure. It merely means
4136 				 * that we're not allowed to muck
4137 				 * about with the partitioning of that
4138 				 * slice.  We indicate this by masking
4139 				 * in the MD_REPART_LEAVE_REP flag.
4140 				 */
4141 				if (metahasmddb(sp, slicenp, ep)) {
4142 					assert(repart_options !=
4143 					    NULL);
4144 					*repart_options |=
4145 					    MD_REPART_LEAVE_REP;
4146 				}
4147 
4148 				/*
4149 				 * Skip the remaining tests for slice
4150 				 * seven
4151 				 */
4152 				continue;
4153 			}
4154 
4155 			/*
4156 			 * Tests below this point will be applied to
4157 			 * all slices EXCEPT for the replica slice.
4158 			 */
4159 
4160 
4161 			/* check if component is in a metadevice */
4162 			if (meta_check_inmeta(sp, slicenp, options, 0,
4163 			    -1, ep) != 0)
4164 				return (-1);
4165 
4166 			/* check to see if component has a metadb */
4167 			if (metahasmddb(sp, slicenp, ep))
4168 				return (mddeverror(ep, MDE_HAS_MDDB,
4169 				    slicenp->dev, slicenp->cname));
4170 		}
4171 		/*
4172 		 * This should be all of the testing necessary when
4173 		 * the MDCMD_USE_WHOLE_DISK flag is set; the rest of
4174 		 * meta_check_sp() is oriented towards component
4175 		 * arguments instead of disks.
4176 		 */
4177 		goto meta_check_sp_ok;
4178 
4179 	}
4180 
4181 	/* check to ensure that it is not already in use */
4182 	if (meta_check_inuse(sp, compnp, MDCHK_INUSE, ep) != 0) {
4183 		return (-1);
4184 	}
4185 
4186 	if (!metaismeta(compnp)) {	/* handle non-metadevices */
4187 
4188 		/*
4189 		 * The component can have one or more soft partitions on it
4190 		 * already, but can't be part of any other type of metadevice,
4191 		 * so if it is used for a metadevice, but the metadevice
4192 		 * isn't a soft partition, return failure.
4193 		 */
4194 
4195 		if (meta_check_inmeta(sp, compnp, options, 0, -1, ep) != 0 &&
4196 		    meta_check_insp(sp, compnp, 0, -1, ep) == 0) {
4197 			return (-1);
4198 		}
4199 	} else {			/* handle metadevices */
4200 		/* get underlying unit & check capabilities */
4201 		if ((mdp = meta_get_unit(sp, compnp, ep)) == NULL)
4202 			return (-1);
4203 
4204 		if ((! (mdp->capabilities & MD_CAN_PARENT)) ||
4205 		    (! (mdp->capabilities & MD_CAN_SP)))
4206 			return (mdmderror(ep, MDE_INVAL_UNIT,
4207 			    meta_getminor(compnp->dev), compnp->cname));
4208 	}
4209 
4210 meta_check_sp_ok:
4211 	mdclrerror(ep);
4212 	return (0);
4213 }
4214 
4215 /*
4216  * FUNCTION:	meta_create_sp()
4217  * INPUT:	sp	- the set name to create in
4218  *		msp	- the unit structure to create
4219  *		oblist	- an optional list of requested extents (-o/-b options)
4220  *		options	- creation options
4221  *		alignment - data alignment
4222  * OUTPUT:	ep	- return error pointer
4223  * RETURNS:	int	-  0 success, -1 error
4224  * PURPOSE:	does most of the work for creating a soft partition.  If
4225  *		metainit -p -e was used, first partition the drive.  Then
4226  *		create an extent list based on the existing soft partitions
4227  *		and assume all space not used by them is free.  Storage for
4228  *		the new soft partition is allocated from the free extents
4229  *		based on the length specified on the command line or the
4230  *		oblist passed in.  The unit structure is then committed and
4231  *		the watermarks are updated.  Finally, the status is changed to
4232  *		Okay and the process is complete.
4233  */
4234 static int
4235 meta_create_sp(
4236 	mdsetname_t	*sp,
4237 	md_sp_t		*msp,
4238 	sp_ext_node_t	*oblist,
4239 	mdcmdopts_t	options,
4240 	sp_ext_length_t	alignment,
4241 	md_error_t	*ep
4242 )
4243 {
4244 	mdname_t	*np = msp->common.namep;
4245 	mdname_t	*compnp = msp->compnamep;
4246 	mp_unit_t	*mp = NULL;
4247 	mdnamelist_t	*keynlp = NULL, *spnlp = NULL;
4248 	md_set_params_t	set_params;
4249 	int		rval = -1;
4250 	diskaddr_t	comp_size;
4251 	diskaddr_t	sp_start;
4252 	sp_ext_node_t	*extlist = NULL;
4253 	int		numexts = 0;	/* number of extents */
4254 	int		count = 0;
4255 	int		committed = 0;
4256 	int		repart_options = MD_REPART_FORCE;
4257 	int		create_flag = MD_CRO_32BIT;
4258 
4259 	md_set_desc	*sd;
4260 	mm_unit_t	*mm;
4261 	md_set_mmown_params_t	*ownpar = NULL;
4262 	int		comp_is_mirror = 0;
4263 
4264 	/* validate soft partition */
4265 	if (meta_check_sp(sp, msp, options, &repart_options, ep) != 0)
4266 		return (-1);
4267 
4268 	if ((options & MDCMD_USE_WHOLE_DISK) != 0) {
4269 		if ((options & MDCMD_DOIT) != 0) {
4270 			if (meta_repartition_drive(sp,
4271 			    compnp->drivenamep,
4272 			    repart_options,
4273 			    NULL, /* Don't return the VTOC */
4274 			    ep) != 0)
4275 
4276 				return (-1);
4277 		} else {
4278 			/*
4279 			 * If -n and -e are both specified, it doesn't make
4280 			 * sense to continue without actually partitioning
4281 			 * the drive.
4282 			 */
4283 			return (0);
4284 		}
4285 	}
4286 
4287 	/* populate the start_blk field of the component name */
4288 	if ((sp_start = meta_sp_get_start(sp, compnp, ep)) ==
4289 	    MD_DISKADDR_ERROR) {
4290 		rval = -1;
4291 		goto out;
4292 	}
4293 
4294 	if (options & MDCMD_DOIT) {
4295 		/* store name in namespace */
4296 		if (add_key_name(sp, compnp, &keynlp, ep) != 0) {
4297 			rval = -1;
4298 			goto out;
4299 		}
4300 	}
4301 
4302 	/*
4303 	 * Get a list of the soft partitions that currently reside on
4304 	 * the component.  We should ALWAYS force reload the cache,
4305 	 * because if this is a single creation, there will not BE a
4306 	 * cached list, and if we're using the md.tab, we must rebuild
4307 	 * the list because it won't contain the previous (if any)
4308 	 * soft partition.
4309 	 */
4310 	count = meta_sp_get_by_component(sp, compnp, &spnlp, 1, ep);
4311 	if (count < 0) {
4312 		/* error occured */
4313 		rval = -1;
4314 		goto out;
4315 	}
4316 
4317 	/*
4318 	 * get the size of the underlying device.  if the size is smaller
4319 	 * than or equal to the watermark size, we know there isn't
4320 	 * enough space.
4321 	 */
4322 	if ((comp_size = metagetsize(compnp, ep)) == MD_DISKADDR_ERROR) {
4323 		rval = -1;
4324 		goto out;
4325 	} else if (comp_size <= MD_SP_WMSIZE) {
4326 		(void) mdmderror(ep, MDE_SP_NOSPACE, 0, compnp->cname);
4327 		rval = -1;
4328 		goto out;
4329 	}
4330 	/*
4331 	 * seed extlist with reserved space at the beginning of the volume and
4332 	 * enough space for the end watermark.  The end watermark always gets
4333 	 * updated, but if the underlying device changes size it may not be
4334 	 * pointed to until the extent before it is updated.  Since the
4335 	 * end of the reserved space is where the first watermark starts,
4336 	 * the reserved extent should never be marked for updating.
4337 	 */
4338 
4339 	meta_sp_list_insert(NULL, NULL, &extlist,
4340 	    0ULL, sp_start, EXTTYP_RESERVED, 0, 0, meta_sp_cmp_by_offset);
4341 	meta_sp_list_insert(NULL, NULL, &extlist,
4342 	    (sp_ext_offset_t)(comp_size - MD_SP_WMSIZE), MD_SP_WMSIZE,
4343 	    EXTTYP_END, 0, EXTFLG_UPDATE, meta_sp_cmp_by_offset);
4344 
4345 	if (meta_sp_extlist_from_namelist(sp, spnlp, &extlist, ep) == -1) {
4346 		rval = -1;
4347 		goto out;
4348 	}
4349 
4350 	metafreenamelist(spnlp);
4351 
4352 	if (getenv(META_SP_DEBUG)) {
4353 		meta_sp_debug("meta_create_sp: list of used extents:\n");
4354 		meta_sp_list_dump(extlist);
4355 	}
4356 
4357 	meta_sp_list_freefill(&extlist, metagetsize(compnp, ep));
4358 
4359 	/* get extent list from -o/-b options or from free space */
4360 	if (options & MDCMD_DIRECT) {
4361 		if (getenv(META_SP_DEBUG)) {
4362 			meta_sp_debug("meta_create_sp: Dumping -o/-b list:\n");
4363 			meta_sp_list_dump(oblist);
4364 		}
4365 
4366 		numexts = meta_sp_alloc_by_list(sp, np, &extlist, oblist);
4367 		if (numexts == -1) {
4368 			(void) mdmderror(ep, MDE_SP_OVERLAP, 0, np->cname);
4369 			rval = -1;
4370 			goto out;
4371 		}
4372 	} else {
4373 		numexts = meta_sp_alloc_by_len(sp, np, &extlist,
4374 		    &msp->ext.ext_val->len, 0LL, (alignment > 0) ? alignment :
4375 		    meta_sp_get_default_alignment(sp, compnp, ep));
4376 		if (numexts == -1) {
4377 			(void) mdmderror(ep, MDE_SP_NOSPACE, 0, np->cname);
4378 			rval = -1;
4379 			goto out;
4380 		}
4381 	}
4382 
4383 	assert(extlist != NULL);
4384 
4385 	/* create soft partition */
4386 	mp = meta_sp_createunit(msp->common.namep, msp->compnamep,
4387 	    extlist, numexts, msp->ext.ext_val->len, MD_SP_CREATEPEND, ep);
4388 
4389 	create_flag = meta_check_devicesize(mp->c.un_total_blocks);
4390 
4391 	/* if we're not doing anything (metainit -n), return success */
4392 	if (! (options & MDCMD_DOIT)) {
4393 		rval = 0;	/* success */
4394 		goto out;
4395 	}
4396 
4397 	(void) memset(&set_params, 0, sizeof (set_params));
4398 
4399 	if (create_flag == MD_CRO_64BIT) {
4400 		mp->c.un_revision |= MD_64BIT_META_DEV;
4401 		set_params.options = MD_CRO_64BIT;
4402 	} else {
4403 		mp->c.un_revision &= ~MD_64BIT_META_DEV;
4404 		set_params.options = MD_CRO_32BIT;
4405 	}
4406 
4407 	if (getenv(META_SP_DEBUG)) {
4408 		meta_sp_debug("meta_create_sp: printing unit structure\n");
4409 		meta_sp_printunit(mp);
4410 	}
4411 
4412 	/*
4413 	 * Check to see if we're trying to create a partition on a mirror. If so
4414 	 * we may have to enforce an ownership change before writing the
4415 	 * watermark out.
4416 	 */
4417 	if (metaismeta(compnp)) {
4418 		char *miscname;
4419 
4420 		miscname = metagetmiscname(compnp, ep);
4421 		if (miscname != NULL)
4422 			comp_is_mirror = (strcmp(miscname, MD_MIRROR) == 0);
4423 		else
4424 			comp_is_mirror = 0;
4425 	} else {
4426 		comp_is_mirror = 0;
4427 	}
4428 
4429 	/*
4430 	 * For a multi-node environment we have to ensure that the master
4431 	 * node owns an underlying mirror before we issue the MD_IOCSET ioctl.
4432 	 * If the master does not own the device we will deadlock as the
4433 	 * implicit write of the watermarks (in sp_ioctl.c) will cause an
4434 	 * ownership change that will block as the MD_IOCSET is still in
4435 	 * progress. To close this window we force an owner change to occur
4436 	 * before issuing the MD_IOCSET. We cannot simply open the device and
4437 	 * write to it as this will only work for the first soft-partition
4438 	 * creation.
4439 	 */
4440 
4441 	if (comp_is_mirror && !metaislocalset(sp)) {
4442 
4443 		if ((sd = metaget_setdesc(sp, ep)) == NULL) {
4444 			rval = -1;
4445 			goto out;
4446 		}
4447 		if (MD_MNSET_DESC(sd) && sd->sd_mn_am_i_master) {
4448 			mm = (mm_unit_t *)meta_get_unit(sp, compnp, ep);
4449 			if (mm == NULL) {
4450 				rval = -1;
4451 				goto out;
4452 			} else {
4453 				rval = meta_mn_change_owner(&ownpar, sp->setno,
4454 					meta_getminor(compnp->dev),
4455 					sd->sd_mn_mynode->nd_nodeid,
4456 					MD_MN_MM_PREVENT_CHANGE |
4457 					    MD_MN_MM_SPAWN_THREAD);
4458 				if (rval == -1)
4459 					goto out;
4460 			}
4461 		}
4462 	}
4463 
4464 	set_params.mnum = MD_SID(mp);
4465 	set_params.size = mp->c.un_size;
4466 	set_params.mdp = (uintptr_t)mp;
4467 	MD_SETDRIVERNAME(&set_params, MD_SP, MD_MIN2SET(set_params.mnum));
4468 
4469 	/* first phase of commit. */
4470 	if (metaioctl(MD_IOCSET, &set_params, &set_params.mde,
4471 	    np->cname) != 0) {
4472 		(void) mdstealerror(ep, &set_params.mde);
4473 		rval = -1;
4474 		goto out;
4475 	}
4476 
4477 	/* we've successfully committed the record */
4478 	committed = 1;
4479 
4480 	/* write watermarks */
4481 	if (meta_sp_update_wm(sp, msp, extlist, ep) < 0) {
4482 		rval = -1;
4483 		goto out;
4484 	}
4485 
4486 	/*
4487 	 * Allow mirror ownership to change. If we don't succeed in this
4488 	 * ioctl it isn't fatal, but the cluster will probably hang fairly
4489 	 * soon as the mirror owner won't change. However, we have
4490 	 * successfully written the watermarks out to the device so the
4491 	 * softpart creation has succeeded
4492 	 */
4493 	if (ownpar) {
4494 		(void) meta_mn_change_owner(&ownpar, sp->setno, ownpar->d.mnum,
4495 		    ownpar->d.owner,
4496 		    MD_MN_MM_ALLOW_CHANGE | MD_MN_MM_SPAWN_THREAD);
4497 	}
4498 
4499 	/* second phase of commit, set status to MD_SP_OK */
4500 	if (meta_sp_setstatus(sp, &(MD_SID(mp)), 1, MD_SP_OK, ep) < 0) {
4501 		rval = -1;
4502 		goto out;
4503 	}
4504 	rval = 0;
4505 out:
4506 	Free(mp);
4507 	if (ownpar)
4508 		Free(ownpar);
4509 
4510 	if (extlist != NULL)
4511 		meta_sp_list_free(&extlist);
4512 
4513 	if (rval != 0 && keynlp != NULL && committed != 1)
4514 		(void) del_key_names(sp, keynlp, NULL);
4515 
4516 	metafreenamelist(keynlp);
4517 
4518 	return (rval);
4519 }
4520 
4521 /*
4522  * **************************************************************************
4523  *                      Reset (metaclear) Functions                         *
4524  * **************************************************************************
4525  */
4526 
4527 /*
4528  * FUNCTION:	meta_sp_reset_common()
4529  * INPUT:	sp	- the set name of the device to reset
4530  *		np	- the name of the device to reset
4531  *		msp	- the unit structure to reset
4532  *		options	- metaclear options
4533  * OUTPUT:	ep	- return error pointer
4534  * RETURNS:	int	-  0 success, -1 error
4535  * PURPOSE:	"resets", or more accurately deletes, the soft partition
4536  *		specified.  First the state is set to "deleting" and then the
4537  *		watermarks are all cleared out.  Once the watermarks have been
4538  *		updated, the unit structure is deleted from the metadb.
4539  */
4540 static int
4541 meta_sp_reset_common(
4542 	mdsetname_t	*sp,
4543 	mdname_t	*np,
4544 	md_sp_t		*msp,
4545 	md_sp_reset_t	reset_params,
4546 	mdcmdopts_t	options,
4547 	md_error_t	*ep
4548 )
4549 {
4550 	char	*miscname;
4551 	int	rval = -1;
4552 	int	is_open = 0;
4553 
4554 	/* make sure that nobody owns us */
4555 	if (MD_HAS_PARENT(msp->common.parent))
4556 		return (mdmderror(ep, MDE_IN_USE, meta_getminor(np->dev),
4557 					np->cname));
4558 
4559 	/* make sure that the soft partition isn't open */
4560 	if ((is_open = meta_isopen(sp, np, ep, options)) < 0)
4561 		return (-1);
4562 	else if (is_open)
4563 		return (mdmderror(ep, MDE_IS_OPEN, meta_getminor(np->dev),
4564 					np->cname));
4565 
4566 	/* get miscname */
4567 	if ((miscname = metagetmiscname(np, ep)) == NULL)
4568 		return (-1);
4569 
4570 	/* fill in reset params */
4571 	MD_SETDRIVERNAME(&reset_params, miscname, sp->setno);
4572 	reset_params.mnum = meta_getminor(np->dev);
4573 	reset_params.force = (options & MDCMD_FORCE) ? 1 : 0;
4574 
4575 	/*
4576 	 * clear soft partition - phase one.
4577 	 * place the soft partition into the "delete pending" state.
4578 	 */
4579 	if (meta_sp_setstatus(sp, &reset_params.mnum, 1, MD_SP_DELPEND, ep) < 0)
4580 		return (-1);
4581 
4582 	/*
4583 	 * Now clear the watermarks.  If the force flag is specified,
4584 	 * ignore any errors writing the watermarks and delete the unit
4585 	 * structure anyway.  An error may leave the on-disk format in a
4586 	 * corrupt state.  If force is not specified and we fail here,
4587 	 * the soft partition will remain in the "delete pending" state.
4588 	 */
4589 	if ((meta_sp_clear_wm(sp, msp, ep) < 0) &&
4590 	    ((options & MDCMD_FORCE) == 0))
4591 		goto out;
4592 
4593 	/*
4594 	 * clear soft partition - phase two.
4595 	 * the driver removes the soft partition from the metadb and
4596 	 * zeros out incore version.
4597 	 */
4598 	if (metaioctl(MD_IOCRESET, &reset_params,
4599 	    &reset_params.mde, np->cname) != 0) {
4600 		(void) mdstealerror(ep, &reset_params.mde);
4601 		goto out;
4602 	}
4603 	rval = 0;	/* success */
4604 
4605 	if (options & MDCMD_PRINT) {
4606 		(void) printf(dgettext(TEXT_DOMAIN,
4607 		    "%s: Soft Partition is cleared\n"),
4608 		    np->cname);
4609 		(void) fflush(stdout);
4610 	}
4611 
4612 	/*
4613 	 * if told to recurse and on a metadevice, then attempt to
4614 	 * clear the subdevices.  Indicate failure if the clear fails.
4615 	 */
4616 	if ((options & MDCMD_RECURSE) &&
4617 	    (metaismeta(msp->compnamep)) &&
4618 	    (meta_reset_by_name(sp, msp->compnamep, options, ep) != 0))
4619 		rval = -1;
4620 
4621 out:
4622 	meta_invalidate_name(np);
4623 	return (rval);
4624 }
4625 
4626 /*
4627  * FUNCTION:	meta_sp_reset()
4628  * INPUT:	sp	- the set name of the device to reset
4629  *		np	- the name of the device to reset
4630  *		options	- metaclear options
4631  * OUTPUT:	ep	- return error pointer
4632  * RETURNS:	int	-  0 success, -1 error
4633  * PURPOSE:	provides the entry point to the rest of libmeta for deleting a
4634  *		soft partition.  If np is NULL, then soft partitions are
4635  *		all deleted at the current level and then recursively deleted.
4636  *		Otherwise, if a name is specified either directly or as a
4637  *		result of a recursive operation, it deletes only that name.
4638  *		Since something sitting under a soft partition may be parented
4639  *		to it, we have to reparent that other device to another soft
4640  *		partition on the same component if we're deleting the one it's
4641  *		parented to.
4642  */
4643 int
4644 meta_sp_reset(
4645 	mdsetname_t	*sp,
4646 	mdname_t	*np,
4647 	mdcmdopts_t	options,
4648 	md_error_t	*ep
4649 )
4650 {
4651 	md_sp_t		*msp;
4652 	int		rval = -1;
4653 	mdnamelist_t	*spnlp = NULL, *nlp = NULL;
4654 	md_sp_reset_t	reset_params;
4655 	int		num_sp;
4656 
4657 	assert(sp != NULL);
4658 
4659 	/* reset/delete all soft paritions */
4660 	if (np == NULL) {
4661 		/*
4662 		 * meta_reset_all sets MDCMD_RECURSE, but this behavior
4663 		 * is incorrect for soft partitions.  We want to clear
4664 		 * all soft partitions at a particular level in the
4665 		 * metadevice stack before moving to the next level.
4666 		 * Thus, we clear MDCMD_RECURSE from the options.
4667 		 */
4668 		options &= ~MDCMD_RECURSE;
4669 
4670 		/* for each soft partition */
4671 		rval = 0;
4672 		if (meta_get_sp_names(sp, &spnlp, 0, ep) < 0)
4673 			rval = -1;
4674 
4675 		for (nlp = spnlp; (nlp != NULL); nlp = nlp->next) {
4676 			np = nlp->namep;
4677 			if ((msp = meta_get_sp(sp, np, ep)) == NULL) {
4678 				rval = -1;
4679 				break;
4680 			}
4681 			/*
4682 			 * meta_reset_all calls us twice to get soft
4683 			 * partitions at the top and bottom of the stack.
4684 			 * thus, if we have a parent, we'll get deleted
4685 			 * on the next call.
4686 			 */
4687 			if (MD_HAS_PARENT(msp->common.parent))
4688 				continue;
4689 			/*
4690 			 * If this is a multi-node set, we send a series
4691 			 * of individual metaclear commands.
4692 			 */
4693 			if (meta_is_mn_set(sp, ep)) {
4694 				if (meta_mn_send_metaclear_command(sp,
4695 				    np->cname, options, 0, ep) != 0) {
4696 					rval = -1;
4697 					break;
4698 				}
4699 			} else {
4700 				if (meta_sp_reset(sp, np, options, ep) != 0) {
4701 					rval = -1;
4702 					break;
4703 				}
4704 			}
4705 		}
4706 		/* cleanup return status */
4707 		metafreenamelist(spnlp);
4708 		return (rval);
4709 	}
4710 
4711 	/* check the name */
4712 	if (metachkmeta(np, ep) != 0)
4713 		return (-1);
4714 
4715 	/* get the unit structure */
4716 	if ((msp = meta_get_sp(sp, np, ep)) == NULL)
4717 		return (-1);
4718 
4719 	/* clear out reset parameters */
4720 	(void) memset(&reset_params, 0, sizeof (reset_params));
4721 
4722 	/* if our child is a metadevice, we need to deparent/reparent it */
4723 	if (metaismeta(msp->compnamep)) {
4724 		/* get sp's on this component */
4725 		if ((num_sp = meta_sp_get_by_component(sp, msp->compnamep,
4726 		    &spnlp, 1, ep)) <= 0)
4727 			/* no sp's on this device.  error! */
4728 			return (-1);
4729 		else if (num_sp == 1)
4730 			/* last sp on this device, so we deparent */
4731 			reset_params.new_parent = MD_NO_PARENT;
4732 		else {
4733 			/* have to reparent this metadevice */
4734 			for (nlp = spnlp; nlp != NULL; nlp = nlp->next) {
4735 				if (meta_getminor(nlp->namep->dev) ==
4736 					meta_getminor(np->dev))
4737 					continue;
4738 				/*
4739 				 * this isn't the softpart we are deleting,
4740 				 * so use this device as the new parent.
4741 				 */
4742 				reset_params.new_parent =
4743 				    meta_getminor(nlp->namep->dev);
4744 				break;
4745 			}
4746 		}
4747 		metafreenamelist(spnlp);
4748 	}
4749 
4750 	if (meta_sp_reset_common(sp, np, msp, reset_params, options, ep) != 0)
4751 		return (-1);
4752 
4753 	return (0);
4754 }
4755 
4756 /*
4757  * FUNCTION:	meta_sp_reset_component()
4758  * INPUT:	sp	- the set name of the device to reset
4759  *		name	- the string name of the device to reset
4760  *		options	- metaclear options
4761  * OUTPUT:	ep	- return error pointer
4762  * RETURNS:	int	-  0 success, -1 error
4763  * PURPOSE:	provides the ability to delete all soft partitions on a
4764  *		specified device (metaclear -p).  It first gets all of the
4765  *		soft partitions on the component and then deletes each one
4766  *		individually.
4767  */
4768 int
4769 meta_sp_reset_component(
4770 	mdsetname_t	*sp,
4771 	char		*name,
4772 	mdcmdopts_t	options,
4773 	md_error_t	*ep
4774 )
4775 {
4776 	mdname_t	*compnp, *np;
4777 	mdnamelist_t	*spnlp = NULL;
4778 	mdnamelist_t	*nlp = NULL;
4779 	md_sp_t		*msp;
4780 	int		count;
4781 	md_sp_reset_t	reset_params;
4782 
4783 	if ((compnp = metaname(&sp, name, UNKNOWN, ep)) == NULL)
4784 		return (-1);
4785 
4786 	/* If we're starting out with no soft partitions, it's an error */
4787 	count = meta_sp_get_by_component(sp, compnp, &spnlp, 1, ep);
4788 	if (count == 0)
4789 		return (mdmderror(ep, MDE_SP_NOSP, 0, compnp->cname));
4790 	else if (count < 0)
4791 		return (-1);
4792 
4793 	/*
4794 	 * clear all soft partitions on this component.
4795 	 * NOTE: we reparent underlying metadevices as we go so that
4796 	 * things stay sane.  Also, if we encounter an error, we stop
4797 	 * and go no further in case recovery might be needed.
4798 	 */
4799 	for (nlp = spnlp; nlp != NULL; nlp = nlp->next) {
4800 		/* clear out reset parameters */
4801 		(void) memset(&reset_params, 0, sizeof (reset_params));
4802 
4803 		/* check the name */
4804 		np = nlp->namep;
4805 
4806 		if (metachkmeta(np, ep) != 0) {
4807 			metafreenamelist(spnlp);
4808 			return (-1);
4809 		}
4810 
4811 		/* get the unit structure */
4812 		if ((msp = meta_get_sp(sp, np, ep)) == NULL) {
4813 			metafreenamelist(spnlp);
4814 			return (-1);
4815 		}
4816 
4817 		/* have to deparent/reparent metadevices */
4818 		if (metaismeta(compnp)) {
4819 			if (nlp->next == NULL)
4820 				reset_params.new_parent = MD_NO_PARENT;
4821 			else
4822 				reset_params.new_parent =
4823 				    meta_getminor(spnlp->next->namep->dev);
4824 		}
4825 
4826 		/* clear soft partition */
4827 		if (meta_sp_reset_common(sp, np, msp, reset_params,
4828 		    options, ep) < 0) {
4829 			metafreenamelist(spnlp);
4830 			return (-1);
4831 		}
4832 	}
4833 	metafreenamelist(spnlp);
4834 	return (0);
4835 }
4836 
4837 /*
4838  * **************************************************************************
4839  *                      Grow (metattach) Functions                          *
4840  * **************************************************************************
4841  */
4842 
4843 /*
4844  * FUNCTION:	meta_sp_attach()
4845  * INPUT:	sp	- the set name of the device to attach to
4846  *		np	- the name of the device to attach to
4847  *		addsize	- the unparsed string holding the amount of space to add
4848  *		options	- metattach options
4849  *		alignment - data alignment
4850  * OUTPUT:	ep	- return error pointer
4851  * RETURNS:	int	-  0 success, -1 error
4852  * PURPOSE:	grows a soft partition by reading in the existing unit
4853  *		structure and setting its state to Growing, allocating more
4854  *		space (similar to meta_create_sp()), updating the watermarks,
4855  *		and then writing out the new unit structure in the Okay state.
4856  */
4857 int
4858 meta_sp_attach(
4859 	mdsetname_t	*sp,
4860 	mdname_t	*np,
4861 	char		*addsize,
4862 	mdcmdopts_t	options,
4863 	sp_ext_length_t	alignment,
4864 	md_error_t	*ep
4865 )
4866 {
4867 	md_grow_params_t	grow_params;
4868 	sp_ext_length_t		grow_len;	/* amount to grow */
4869 	mp_unit_t		*mp, *new_un;
4870 	mdname_t		*compnp = NULL;
4871 
4872 	sp_ext_node_t		*extlist = NULL;
4873 	int			numexts;
4874 	mdnamelist_t		*spnlp = NULL;
4875 	int			count;
4876 	md_sp_t			*msp;
4877 	daddr_t			start_block;
4878 
4879 	/* should have the same set */
4880 	assert(sp != NULL);
4881 	assert(sp->setno == MD_MIN2SET(meta_getminor(np->dev)));
4882 
4883 	/* check name */
4884 	if (metachkmeta(np, ep) != 0)
4885 		return (-1);
4886 
4887 	if (meta_sp_parsesize(addsize, &grow_len) == -1) {
4888 		return (mdmderror(ep, MDE_SP_BAD_LENGTH, 0, np->cname));
4889 	}
4890 
4891 	if ((mp = (mp_unit_t *)meta_get_mdunit(sp, np, ep)) == NULL)
4892 		return (-1);
4893 
4894 	/* make sure we don't have a parent */
4895 	if (MD_HAS_PARENT(mp->c.un_parent)) {
4896 		Free(mp);
4897 		return (mdmderror(ep, MDE_INVAL_UNIT, 0, np->cname));
4898 	}
4899 
4900 	if (getenv(META_SP_DEBUG)) {
4901 		meta_sp_debug("meta_sp_attach: Unit structure before new "
4902 		    "space:\n");
4903 		meta_sp_printunit(mp);
4904 	}
4905 
4906 	/*
4907 	 * NOTE: the fast option to metakeyname is 0 as opposed to 1
4908 	 * If this was not the case we would suffer the following
4909 	 * assertion failure:
4910 	 * Assertion failed: type1 != MDT_FAST_META && type1 != MDT_FAST_COMP
4911 	 * file meta_check.x, line 315
4912 	 * I guess this is because we have not "seen" this drive before
4913 	 * and hence hit the failure - this is of course the attach routine
4914 	 */
4915 	if ((compnp = metakeyname(&sp, mp->un_key, 0, ep)) == NULL) {
4916 		Free(mp);
4917 		return (-1);
4918 	}
4919 
4920 	/* metakeyname does not fill in the key. */
4921 	compnp->key = mp->un_key;
4922 
4923 	/* work out the space on the component that we are dealing with */
4924 	count = meta_sp_get_by_component(sp, compnp, &spnlp, 0, ep);
4925 
4926 	/*
4927 	 * see if the component has been soft partitioned yet, or if an
4928 	 * error occurred.
4929 	 */
4930 	if (count == 0) {
4931 		Free(mp);
4932 		return (mdmderror(ep, MDE_NOT_SP, 0, np->cname));
4933 	} else if (count < 0) {
4934 		Free(mp);
4935 		return (-1);
4936 	}
4937 
4938 	/*
4939 	 * seed extlist with reserved space at the beginning of the volume and
4940 	 * enough space for the end watermark.  The end watermark always gets
4941 	 * updated, but if the underlying device changes size it may not be
4942 	 * pointed to until the extent before it is updated.  Since the
4943 	 * end of the reserved space is where the first watermark starts,
4944 	 * the reserved extent should never be marked for updating.
4945 	 */
4946 	if ((start_block = meta_sp_get_start(sp, compnp, ep)) ==
4947 	    MD_DISKADDR_ERROR) {
4948 		Free(mp);
4949 		return (-1);
4950 	}
4951 
4952 	meta_sp_list_insert(NULL, NULL, &extlist, 0ULL, start_block,
4953 	    EXTTYP_RESERVED, 0, 0, meta_sp_cmp_by_offset);
4954 	meta_sp_list_insert(NULL, NULL, &extlist,
4955 	    metagetsize(compnp, ep) - MD_SP_WMSIZE, MD_SP_WMSIZE,
4956 	    EXTTYP_END, 0, EXTFLG_UPDATE, meta_sp_cmp_by_offset);
4957 
4958 	if (meta_sp_extlist_from_namelist(sp, spnlp, &extlist, ep) == -1) {
4959 		Free(mp);
4960 		return (-1);
4961 	}
4962 
4963 	metafreenamelist(spnlp);
4964 
4965 	if (getenv(META_SP_DEBUG)) {
4966 		meta_sp_debug("meta_sp_attach: list of used extents:\n");
4967 		meta_sp_list_dump(extlist);
4968 	}
4969 
4970 	meta_sp_list_freefill(&extlist, metagetsize(compnp, ep));
4971 
4972 	assert(mp->un_numexts >= 1);
4973 	numexts = meta_sp_alloc_by_len(sp, np, &extlist, &grow_len,
4974 	    mp->un_ext[mp->un_numexts - 1].un_poff,
4975 	    (alignment > 0) ? alignment :
4976 	    meta_sp_get_default_alignment(sp, compnp, ep));
4977 
4978 	if (numexts == -1) {
4979 		Free(mp);
4980 		return (mdmderror(ep, MDE_SP_NOSPACE, 0, np->cname));
4981 	}
4982 
4983 	/* allocate new unit structure and copy in old unit */
4984 	if ((new_un = meta_sp_updateunit(np, mp, extlist,
4985 	    grow_len, numexts, ep)) == NULL) {
4986 		Free(mp);
4987 		return (-1);
4988 	}
4989 	Free(mp);
4990 
4991 	/* If running in dryrun mode (-n option), we're done here */
4992 	if ((options & MDCMD_DOIT) == 0) {
4993 		if (options & MDCMD_PRINT) {
4994 			(void) printf(dgettext(TEXT_DOMAIN,
4995 			    "%s: Soft Partition would grow\n"),
4996 			    np->cname);
4997 			(void) fflush(stdout);
4998 		}
4999 		return (0);
5000 	}
5001 
5002 	if (getenv(META_SP_DEBUG)) {
5003 		meta_sp_debug("meta_sp_attach: updated unit structure:\n");
5004 		meta_sp_printunit(new_un);
5005 	}
5006 
5007 	assert(new_un != NULL);
5008 
5009 	(void) memset(&grow_params, 0, sizeof (grow_params));
5010 	if (new_un->c.un_total_blocks > MD_MAX_BLKS_FOR_SMALL_DEVS) {
5011 		grow_params.options = MD_CRO_64BIT;
5012 		new_un->c.un_revision |= MD_64BIT_META_DEV;
5013 	} else {
5014 		grow_params.options = MD_CRO_32BIT;
5015 		new_un->c.un_revision &= ~MD_64BIT_META_DEV;
5016 	}
5017 	grow_params.mnum = MD_SID(new_un);
5018 	grow_params.size = new_un->c.un_size;
5019 	grow_params.mdp = (uintptr_t)new_un;
5020 	MD_SETDRIVERNAME(&grow_params, MD_SP, MD_MIN2SET(grow_params.mnum));
5021 
5022 	if (metaioctl(MD_IOCGROW, &grow_params, &grow_params.mde,
5023 	    np->cname) != 0) {
5024 		(void) mdstealerror(ep, &grow_params.mde);
5025 		return (-1);
5026 	}
5027 
5028 	/* update all watermarks */
5029 
5030 	if ((msp = meta_get_sp(sp, np, ep)) == NULL)
5031 		return (-1);
5032 	if (meta_sp_update_wm(sp, msp, extlist, ep) < 0)
5033 		return (-1);
5034 
5035 
5036 	/* second phase of commit, set status to MD_SP_OK */
5037 	if (meta_sp_setstatus(sp, &(MD_SID(new_un)), 1, MD_SP_OK, ep) < 0)
5038 		return (-1);
5039 
5040 	meta_invalidate_name(np);
5041 
5042 	if (options & MDCMD_PRINT) {
5043 		(void) printf(dgettext(TEXT_DOMAIN,
5044 		    "%s: Soft Partition has been grown\n"),
5045 		    np->cname);
5046 		(void) fflush(stdout);
5047 	}
5048 
5049 	return (0);
5050 }
5051 
5052 /*
5053  * **************************************************************************
5054  *                    Recovery (metarecover) Functions                      *
5055  * **************************************************************************
5056  */
5057 
5058 /*
5059  * FUNCTION:	meta_recover_sp()
5060  * INPUT:	sp	- the name of the set we are recovering on
5061  *		compnp	- name pointer for device we are recovering on
5062  *		argc	- argument count
5063  *		argv	- left over arguments not parsed by metarecover command
5064  *		options	- metarecover options
5065  * OUTPUT:	ep	- return error pointer
5066  * RETURNS:	int	- 0 - success, -1 - error
5067  * PURPOSE:	parse soft partitioning-specific metarecover options and
5068  *		dispatch to the appropriate function to handle recovery.
5069  */
5070 int
5071 meta_recover_sp(
5072 	mdsetname_t	*sp,
5073 	mdname_t	*compnp,
5074 	int		argc,
5075 	char		*argv[],
5076 	mdcmdopts_t	options,
5077 	md_error_t	*ep
5078 )
5079 {
5080 	md_set_desc	*sd;
5081 
5082 	if (argc > 1) {
5083 		(void) meta_cook_syntax(ep, MDE_SYNTAX, compnp->cname,
5084 		    argc, argv);
5085 		return (-1);
5086 	}
5087 
5088 	/*
5089 	 * For a MN set, this operation must be performed on the master
5090 	 * as it is responsible for maintaining the watermarks
5091 	 */
5092 	if (!metaislocalset(sp)) {
5093 		if ((sd = metaget_setdesc(sp, ep)) == NULL)
5094 			return (-1);
5095 		if (MD_MNSET_DESC(sd) && !sd->sd_mn_am_i_master) {
5096 			(void) mddserror(ep, MDE_DS_MASTER_ONLY, sp->setno,
5097 			    sd->sd_mn_master_nodenm, NULL, NULL);
5098 			return (-1);
5099 		}
5100 	}
5101 	if (argc == 0) {
5102 		/*
5103 		 * if no additional arguments are passed, metarecover should
5104 		 * validate both on-disk and metadb structures as well as
5105 		 * checking that both are consistent with each other
5106 		 */
5107 		if (meta_sp_validate_wm(sp, compnp, options, ep) < 0)
5108 			return (-1);
5109 		if (meta_sp_validate_unit(sp, compnp, options, ep) < 0)
5110 			return (-1);
5111 		if (meta_sp_validate_wm_and_unit(sp, compnp, options, ep) < 0)
5112 			return (-1);
5113 	} else if (strcmp(argv[0], "-d") == 0) {
5114 		/*
5115 		 * Ensure that there is no existing valid record for this
5116 		 * soft-partition. If there is we have nothing to do.
5117 		 */
5118 		if (meta_sp_validate_unit(sp, compnp, options, ep) == 0)
5119 			return (-1);
5120 		/* validate and recover from on-disk structures */
5121 		if (meta_sp_validate_wm(sp, compnp, options, ep) < 0)
5122 			return (-1);
5123 		if (meta_sp_recover_from_wm(sp, compnp, options, ep) < 0)
5124 			return (-1);
5125 	} else if (strcmp(argv[0], "-m") == 0) {
5126 		/* validate and recover from metadb structures */
5127 		if (meta_sp_validate_unit(sp, compnp, options, ep) < 0)
5128 			return (-1);
5129 		if (meta_sp_recover_from_unit(sp, compnp, options, ep) < 0)
5130 			return (-1);
5131 	} else {
5132 		/* syntax error */
5133 		(void) meta_cook_syntax(ep, MDE_SYNTAX, compnp->cname,
5134 		    argc, argv);
5135 		return (-1);
5136 	}
5137 
5138 	return (0);
5139 }
5140 
5141 /*
5142  * FUNCTION:	meta_sp_display_exthdr()
5143  * INPUT:	none
5144  * OUTPUT:	none
5145  * RETURNS:	void
5146  * PURPOSE:	print header line for sp_ext_node_t information.  to be used
5147  *		in conjunction with meta_sp_display_ext().
5148  */
5149 static void
5150 meta_sp_display_exthdr(void)
5151 {
5152 	(void) printf("%20s %5s %7s %20s %20s\n",
5153 	    dgettext(TEXT_DOMAIN, "Name"),
5154 	    dgettext(TEXT_DOMAIN, "Seq#"),
5155 	    dgettext(TEXT_DOMAIN, "Type"),
5156 	    dgettext(TEXT_DOMAIN, "Offset"),
5157 	    dgettext(TEXT_DOMAIN, "Length"));
5158 }
5159 
5160 
5161 /*
5162  * FUNCTION:	meta_sp_display_ext()
5163  * INPUT:	ext	- extent to display
5164  * OUTPUT:	none
5165  * RETURNS:	void
5166  * PURPOSE:	print selected fields from sp_ext_node_t.
5167  */
5168 static void
5169 meta_sp_display_ext(sp_ext_node_t *ext)
5170 {
5171 	/* print extent information */
5172 	if (ext->ext_namep != NULL)
5173 		(void) printf("%20s ", ext->ext_namep->cname);
5174 	else
5175 		(void) printf("%20s ", "NONE");
5176 
5177 	(void) printf("%5u ", ext->ext_seq);
5178 
5179 	switch (ext->ext_type) {
5180 	case EXTTYP_ALLOC:
5181 		(void) printf("%7s ", "ALLOC");
5182 		break;
5183 	case EXTTYP_FREE:
5184 		(void) printf("%7s ", "FREE");
5185 		break;
5186 	case EXTTYP_RESERVED:
5187 		(void) printf("%7s ", "RESV");
5188 		break;
5189 	case EXTTYP_END:
5190 		(void) printf("%7s ", "END");
5191 		break;
5192 	default:
5193 		(void) printf("%7s ", "INVLD");
5194 		break;
5195 	}
5196 
5197 	(void) printf("%20llu %20llu\n", ext->ext_offset, ext->ext_length);
5198 }
5199 
5200 
5201 /*
5202  * FUNCTION:	meta_sp_checkseq()
5203  * INPUT:	extlist	- list of extents to be checked
5204  * OUTPUT:	none
5205  * RETURNS:	int	- 0 - success, -1 - error
5206  * PURPOSE:	check soft partition sequence numbers.  this function assumes
5207  *		that a list of extents representing 1 or more soft partitions
5208  *		is passed in sorted in sequence number order.  within a
5209  *		single soft partition, there may not be any missing or
5210  *		duplicate sequence numbers.
5211  */
5212 static int
5213 meta_sp_checkseq(sp_ext_node_t *extlist)
5214 {
5215 	sp_ext_node_t *ext;
5216 
5217 	assert(extlist != NULL);
5218 
5219 	for (ext = extlist;
5220 	    ext->ext_next != NULL && ext->ext_next->ext_type == EXTTYP_ALLOC;
5221 	    ext = ext->ext_next) {
5222 		if (ext->ext_next->ext_namep != NULL &&
5223 		    strcmp(ext->ext_next->ext_namep->cname,
5224 			ext->ext_namep->cname) != 0)
5225 				continue;
5226 
5227 		if (ext->ext_next->ext_seq != ext->ext_seq + 1) {
5228 			(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5229 			    "%s: sequence numbers are "
5230 			    "incorrect: %d should be %d\n"),
5231 			    ext->ext_next->ext_namep->cname,
5232 			    ext->ext_next->ext_seq, ext->ext_seq + 1);
5233 			return (-1);
5234 		}
5235 	}
5236 	return (0);
5237 }
5238 
5239 
5240 /*
5241  * FUNCTION:	meta_sp_resolve_name_conflict()
5242  * INPUT:	sp	- name of set we're are recovering in.
5243  *		old_np	- name pointer of soft partition we found on disk.
5244  * OUTPUT:	new_np	- name pointer for new soft partition name.
5245  *		ep	- error pointer returned.
5246  * RETURNS:	int	- 0 - name not replace, 1 - name replaced, -1 - error
5247  * PURPOSE:	Check to see if the name of one of the soft partitions we found
5248  *		on disk already exists in the metadb.  If so, prompt for a new
5249  *		name.  In addition, we keep a static array of names that
5250  *		will be recovered from this device since these names don't
5251  *		exist in the configuration at this point but cannot be
5252  *		recovered more than once.
5253  */
5254 static int
5255 meta_sp_resolve_name_conflict(
5256 	mdsetname_t	*sp,
5257 	mdname_t	*old_np,
5258 	mdname_t	**new_np,
5259 	md_error_t	*ep
5260 )
5261 {
5262 	char		yesno[255];
5263 	char		*yes;
5264 	char		newname[MD_SP_MAX_DEVNAME_PLUS_1];
5265 	int		nunits;
5266 	static int	*used_names = NULL;
5267 
5268 	assert(old_np != NULL);
5269 
5270 	if (used_names == NULL) {
5271 		if ((nunits = meta_get_nunits(ep)) < 0)
5272 			return (-1);
5273 		used_names = Zalloc(nunits * sizeof (int));
5274 	}
5275 
5276 	/* see if it exists already */
5277 	if (used_names[MD_MIN2UNIT(meta_getminor(old_np->dev))] == 0 &&
5278 	    metagetmiscname(old_np, ep) == NULL) {
5279 		if (! mdismderror(ep, MDE_UNIT_NOT_SETUP))
5280 			return (-1);
5281 		else {
5282 			used_names[MD_MIN2UNIT(meta_getminor(old_np->dev))] = 1;
5283 			mdclrerror(ep);
5284 			return (0);
5285 		}
5286 	}
5287 
5288 	/* name exists, ask the user for a new one */
5289 	(void) printf(dgettext(TEXT_DOMAIN,
5290 	    "WARNING: A soft partition named %s was found in the extent\n"
5291 	    "headers, but this name already exists in the metadb "
5292 	    "configuration.\n"
5293 	    "In order to continue recovery you must supply\n"
5294 	    "a new name for this soft partition.\n"), old_np->cname);
5295 	(void) printf(dgettext(TEXT_DOMAIN,
5296 	    "Would you like to continue and supply a new name? (yes/no) "));
5297 
5298 	(void) fflush(stdout);
5299 	if ((fgets(yesno, sizeof (yesno), stdin) == NULL) ||
5300 	    (strlen(yesno) == 1))
5301 		(void) snprintf(yesno, sizeof (yesno), "%s\n",
5302 		    dgettext(TEXT_DOMAIN, "no"));
5303 	yes = dgettext(TEXT_DOMAIN, "yes");
5304 	if (strncasecmp(yesno, yes, strlen(yesno) - 1) != 0) {
5305 		return (-1);
5306 	}
5307 
5308 	(void) fflush(stdin);
5309 
5310 	/* get the new name */
5311 	for (;;) {
5312 		(void) printf(dgettext(TEXT_DOMAIN, "Please enter a new name "
5313 		    "for this soft partition (dXXXX) "));
5314 		(void) fflush(stdout);
5315 		if (fgets(newname, MD_SP_MAX_DEVNAME_PLUS_1, stdin) == NULL)
5316 			(void) strcpy(newname, "");
5317 
5318 		/* remove newline character */
5319 		if (newname[strlen(newname) - 1] == '\n')
5320 			newname[strlen(newname) - 1] = '\0';
5321 
5322 		if (!(is_metaname(newname)) ||
5323 		    (meta_init_make_device(&sp, newname, ep) <= 0)) {
5324 			(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5325 			    "Invalid metadevice name\n"));
5326 			(void) fflush(stderr);
5327 			continue;
5328 		}
5329 
5330 		if ((*new_np = metaname(&sp, newname,
5331 		    META_DEVICE, ep)) == NULL) {
5332 			(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5333 			    "Invalid metadevice name\n"));
5334 			(void) fflush(stderr);
5335 			continue;
5336 		}
5337 
5338 		assert(MD_MIN2UNIT(meta_getminor((*new_np)->dev)) < nunits);
5339 		/* make sure the name isn't already being used */
5340 		if (used_names[MD_MIN2UNIT(meta_getminor((*new_np)->dev))] ||
5341 		    metagetmiscname(*new_np, ep) != NULL) {
5342 			(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5343 			    "That name already exists\n"));
5344 			continue;
5345 		} else if (! mdismderror(ep, MDE_UNIT_NOT_SETUP))
5346 			return (-1);
5347 
5348 		break;
5349 	}
5350 
5351 	/* got a new name, place in used array and return */
5352 	used_names[MD_MIN2UNIT(meta_getminor((*new_np)->dev))] = 1;
5353 	mdclrerror(ep);
5354 	return (1);
5355 }
5356 
5357 /*
5358  * FUNCTION:	meta_sp_validate_wm()
5359  * INPUT:	sp	- set name we are recovering in
5360  *		compnp	- name pointer for device we are recovering from
5361  *		options	- metarecover options
5362  * OUTPUT:	ep	- error pointer returned
5363  * RETURNS:	int	- 0 - success, -1 - error
5364  * PURPOSE:	validate and display watermark configuration.  walk the
5365  *		on-disk watermark structures and validate the information
5366  *		found within.  since a watermark configuration is
5367  *		"self-defining", the act of traversing the watermarks
5368  *		is part of the validation process.
5369  */
5370 static int
5371 meta_sp_validate_wm(
5372 	mdsetname_t	*sp,
5373 	mdname_t	*compnp,
5374 	mdcmdopts_t	options,
5375 	md_error_t	*ep
5376 )
5377 {
5378 	sp_ext_node_t	*extlist = NULL;
5379 	sp_ext_node_t	*ext;
5380 	int		num_sps = 0;
5381 	int		rval;
5382 
5383 	if ((options & MDCMD_VERBOSE) != 0)
5384 		(void) printf(dgettext(TEXT_DOMAIN,
5385 		    "Verifying on-disk structures on %s.\n"),
5386 		    compnp->cname);
5387 
5388 	/*
5389 	 * for each watermark, build an ext_node, place on list.
5390 	 */
5391 	rval = meta_sp_extlist_from_wm(sp, compnp, &extlist,
5392 	    meta_sp_cmp_by_nameseq, ep);
5393 
5394 	if ((options & MDCMD_VERBOSE) != 0) {
5395 		/* print out what we found */
5396 		if (extlist == NULL)
5397 			(void) printf(dgettext(TEXT_DOMAIN,
5398 			    "No extent headers found on %s.\n"),
5399 			    compnp->cname);
5400 		else {
5401 			(void) printf(dgettext(TEXT_DOMAIN,
5402 			    "The following extent headers were found on %s.\n"),
5403 			    compnp->cname);
5404 			meta_sp_display_exthdr();
5405 		}
5406 		for (ext = extlist; ext != NULL; ext = ext->ext_next)
5407 			meta_sp_display_ext(ext);
5408 	}
5409 
5410 	if (rval < 0) {
5411 		(void) printf(dgettext(TEXT_DOMAIN,
5412 		    "%s: On-disk structures invalid or "
5413 		    "no soft partitions found.\n"),
5414 		    compnp->cname);
5415 		return (-1);
5416 	}
5417 
5418 	assert(extlist != NULL);
5419 
5420 	/* count number of soft partitions */
5421 	for (ext = extlist;
5422 	    ext != NULL && ext->ext_type == EXTTYP_ALLOC;
5423 	    ext = ext->ext_next) {
5424 		if (ext->ext_next != NULL &&
5425 		    ext->ext_next->ext_namep != NULL &&
5426 		    strcmp(ext->ext_next->ext_namep->cname,
5427 			ext->ext_namep->cname) == 0)
5428 				continue;
5429 		num_sps++;
5430 	}
5431 
5432 	if ((options & MDCMD_VERBOSE) != 0)
5433 		(void) printf(dgettext(TEXT_DOMAIN,
5434 		    "Found %d soft partition(s) on %s.\n"), num_sps,
5435 		    compnp->cname);
5436 
5437 	if (num_sps == 0) {
5438 		(void) printf(dgettext(TEXT_DOMAIN,
5439 		    "%s: No soft partitions.\n"), compnp->cname);
5440 		return (mdmderror(ep, MDE_RECOVER_FAILED, 0, compnp->cname));
5441 	}
5442 
5443 	/* check sequence numbers */
5444 	if ((options & MDCMD_VERBOSE) != 0)
5445 		(void) printf(dgettext(TEXT_DOMAIN,
5446 		    "Checking sequence numbers.\n"));
5447 
5448 	if (meta_sp_checkseq(extlist) != 0)
5449 		return (mdmderror(ep, MDE_RECOVER_FAILED, 0, compnp->cname));
5450 
5451 	return (0);
5452 }
5453 
5454 /*
5455  * FUNCTION:	meta_sp_validate_unit()
5456  * INPUT:	sp	- name of set we are recovering in
5457  *		compnp	- name of component we are recovering from
5458  *		options	- metarecover options
5459  * OUTPUT:	ep	- error pointer returned
5460  * RETURNS:	int	- 0 - success, -1 - error
5461  * PURPOSE:	validate and display metadb configuration.  begin by getting
5462  *		all soft partitions built on the specified component.  get
5463  *		the unit structure for each one and validate the fields within.
5464  */
5465 static int
5466 meta_sp_validate_unit(
5467 	mdsetname_t	*sp,
5468 	mdname_t	*compnp,
5469 	mdcmdopts_t	options,
5470 	md_error_t	*ep
5471 )
5472 {
5473 	md_sp_t		*msp;
5474 	mdnamelist_t	*spnlp = NULL;
5475 	mdnamelist_t	*namep = NULL;
5476 	int		count;
5477 	uint_t		extn;
5478 	sp_ext_length_t	size;
5479 
5480 	if ((options & MDCMD_VERBOSE) != 0)
5481 		(void) printf(dgettext(TEXT_DOMAIN,
5482 		    "%s: Validating soft partition metadb entries.\n"),
5483 		    compnp->cname);
5484 
5485 	if ((size = metagetsize(compnp, ep)) == MD_DISKADDR_ERROR)
5486 		return (-1);
5487 
5488 	/* get all soft partitions on component */
5489 	count = meta_sp_get_by_component(sp, compnp, &spnlp, 0, ep);
5490 
5491 	if (count == 0) {
5492 		(void) printf(dgettext(TEXT_DOMAIN,
5493 		    "%s: No soft partitions.\n"), compnp->cname);
5494 		return (mdmderror(ep, MDE_RECOVER_FAILED, 0, compnp->cname));
5495 	} else if (count < 0) {
5496 		return (-1);
5497 	}
5498 
5499 	/* Now go through the soft partitions and check each one */
5500 	for (namep = spnlp; namep != NULL; namep = namep->next) {
5501 		mdname_t	*curnp = namep->namep;
5502 		sp_ext_offset_t	curvoff;
5503 
5504 		/* get the unit structure */
5505 		if ((msp = meta_get_sp_common(sp, curnp, 0, ep)) == NULL)
5506 			return (-1);
5507 
5508 		/* verify generic unit structure parameters */
5509 		if ((options & MDCMD_VERBOSE) != 0)
5510 			(void) printf(dgettext(TEXT_DOMAIN,
5511 			    "\nVerifying device %s.\n"),
5512 			    curnp->cname);
5513 
5514 		/*
5515 		 * MD_SP_LAST is an invalid state and is always the
5516 		 * highest numbered.
5517 		 */
5518 		if (msp->status >= MD_SP_LAST) {
5519 			(void) printf(dgettext(TEXT_DOMAIN,
5520 			    "%s: status value %u is out of range.\n"),
5521 			    curnp->cname, msp->status);
5522 			return (mdmderror(ep, MDE_RECOVER_FAILED,
5523 			    0, curnp->cname));
5524 		} else if ((options & MDCMD_VERBOSE) != 0) {
5525 			uint_t	tstate = 0;
5526 
5527 			if (metaismeta(msp->compnamep)) {
5528 				if (meta_get_tstate(msp->common.namep->dev,
5529 				    &tstate, ep) != 0)
5530 					return (-1);
5531 			}
5532 			(void) printf(dgettext(TEXT_DOMAIN,
5533 			    "%s: Status \"%s\" is valid.\n"),
5534 			    curnp->cname, meta_sp_status_to_name(msp->status,
5535 			    tstate & MD_DEV_ERRORED));
5536 		}
5537 
5538 		/* Now verify each extent */
5539 		if ((options & MDCMD_VERBOSE) != 0)
5540 			(void) printf("%14s %21s %21s %21s\n",
5541 			    dgettext(TEXT_DOMAIN, "Extent Number"),
5542 			    dgettext(TEXT_DOMAIN, "Virtual Offset"),
5543 			    dgettext(TEXT_DOMAIN, "Physical Offset"),
5544 			    dgettext(TEXT_DOMAIN, "Length"));
5545 
5546 		curvoff = 0ULL;
5547 		for (extn = 0; extn < msp->ext.ext_len; extn++) {
5548 			md_sp_ext_t	*extp = &msp->ext.ext_val[extn];
5549 
5550 			if ((options & MDCMD_VERBOSE) != 0)
5551 				(void) printf("%14u %21llu %21llu %21llu\n",
5552 				    extn, extp->voff, extp->poff, extp->len);
5553 
5554 			if (extp->voff != curvoff) {
5555 				(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5556 				    "%s: virtual offset for extent %u "
5557 				    "is inconsistent, expected %llu, "
5558 				    "got %llu.\n"), curnp->cname, extn,
5559 				    curvoff, extp->voff);
5560 				return (mdmderror(ep, MDE_RECOVER_FAILED,
5561 				    0, compnp->cname));
5562 			}
5563 
5564 			/* make sure extent does not drop off the end */
5565 			if ((extp->poff + extp->len) == size) {
5566 				(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5567 				    "%s: extent %u at offset %llu, "
5568 				    "length %llu exceeds the size of the "
5569 				    "device, %llu.\n"), curnp->cname,
5570 				    extn, extp->poff, extp->len, size);
5571 				return (mdmderror(ep, MDE_RECOVER_FAILED,
5572 				    0, compnp->cname));
5573 			}
5574 
5575 			curvoff += extp->len;
5576 		}
5577 	}
5578 	if (options & MDCMD_PRINT) {
5579 		(void) printf(dgettext(TEXT_DOMAIN,
5580 		    "%s: Soft Partition metadb configuration is valid\n"),
5581 		    compnp->cname);
5582 	}
5583 	return (0);
5584 }
5585 
5586 /*
5587  * FUNCTION:	meta_sp_validate_wm_and_unit()
5588  * INPUT:	sp	- name of set we are recovering in
5589  *		compnp	- name of device we are recovering from
5590  *		options	- metarecover options
5591  * OUTPUT:	ep	- error pointer returned
5592  * RETURNS:	int	- 0 - success, -1 error
5593  * PURPOSE:	cross-validate and display watermarks and metadb records.
5594  *		get both the unit structures for the soft partitions built
5595  *		on the specified component and the watermarks found on that
5596  *		component and check to make sure they are consistent with
5597  *		each other.
5598  */
5599 static int
5600 meta_sp_validate_wm_and_unit(
5601 	mdsetname_t	*sp,
5602 	mdname_t	*np,
5603 	mdcmdopts_t	options,
5604 	md_error_t	*ep
5605 )
5606 {
5607 	sp_ext_node_t	*wmlist = NULL;
5608 	sp_ext_node_t	*unitlist = NULL;
5609 	sp_ext_node_t	*unitext;
5610 	sp_ext_node_t	*wmext;
5611 	sp_ext_offset_t	tmpunitoff;
5612 	mdnamelist_t	*spnlp = NULL;
5613 	int		count;
5614 	int		rval = 0;
5615 	int		verbose = (options & MDCMD_VERBOSE);
5616 
5617 	/* get unit structure list */
5618 	count = meta_sp_get_by_component(sp, np, &spnlp, 0, ep);
5619 	if (count <= 0)
5620 		return (-1);
5621 
5622 	meta_sp_list_insert(NULL, NULL, &unitlist,
5623 	    metagetsize(np, ep) - MD_SP_WMSIZE, MD_SP_WMSIZE,
5624 	    EXTTYP_END, 0, EXTFLG_UPDATE, meta_sp_cmp_by_offset);
5625 
5626 	if (meta_sp_extlist_from_namelist(sp, spnlp, &unitlist, ep) == -1) {
5627 		metafreenamelist(spnlp);
5628 		return (-1);
5629 	}
5630 
5631 	metafreenamelist(spnlp);
5632 
5633 	meta_sp_list_freefill(&unitlist, metagetsize(np, ep));
5634 
5635 	if (meta_sp_extlist_from_wm(sp, np, &wmlist,
5636 	    meta_sp_cmp_by_offset, ep) < 0) {
5637 		meta_sp_list_free(&unitlist);
5638 		return (-1);
5639 	}
5640 
5641 	if (getenv(META_SP_DEBUG)) {
5642 		meta_sp_debug("meta_sp_validate_wm_and_unit: unit list:\n");
5643 		meta_sp_list_dump(unitlist);
5644 		meta_sp_debug("meta_sp_validate_wm_and_unit: wm list:\n");
5645 		meta_sp_list_dump(wmlist);
5646 	}
5647 
5648 	/*
5649 	 * step through both lists and compare allocated nodes.  Free
5650 	 * nodes and end watermarks may differ between the two but
5651 	 * that's generally ok, and if they're wrong will typically
5652 	 * cause misplaced allocated extents.
5653 	 */
5654 	if (verbose)
5655 		(void) printf(dgettext(TEXT_DOMAIN, "\n%s: Verifying metadb "
5656 		    "allocations match extent headers.\n"), np->cname);
5657 
5658 	unitext = unitlist;
5659 	wmext = wmlist;
5660 	while ((wmext != NULL) && (unitext != NULL)) {
5661 		/* find next allocated extents in each list */
5662 		while (wmext != NULL && wmext->ext_type != EXTTYP_ALLOC)
5663 			wmext = wmext->ext_next;
5664 
5665 		while (unitext != NULL && unitext->ext_type != EXTTYP_ALLOC)
5666 			unitext = unitext->ext_next;
5667 
5668 		if (wmext == NULL || unitext == NULL)
5669 			break;
5670 
5671 		if (verbose) {
5672 			(void) printf(dgettext(TEXT_DOMAIN,
5673 			    "Metadb extent:\n"));
5674 			meta_sp_display_exthdr();
5675 			meta_sp_display_ext(unitext);
5676 			(void) printf(dgettext(TEXT_DOMAIN,
5677 			    "Extent header extent:\n"));
5678 			meta_sp_display_exthdr();
5679 			meta_sp_display_ext(wmext);
5680 			(void) printf("\n");
5681 		}
5682 
5683 		if (meta_sp_validate_exts(np, wmext, unitext, ep) < 0)
5684 			rval = -1;
5685 
5686 		/*
5687 		 * if the offsets aren't equal, only increment the
5688 		 * lowest one in hopes of getting the lists back in sync.
5689 		 */
5690 		tmpunitoff = unitext->ext_offset;
5691 		if (unitext->ext_offset <= wmext->ext_offset)
5692 			unitext = unitext->ext_next;
5693 		if (wmext->ext_offset <= tmpunitoff)
5694 			wmext = wmext->ext_next;
5695 	}
5696 
5697 	/*
5698 	 * if both lists aren't at the end then there are extra
5699 	 * allocated nodes in one of them.
5700 	 */
5701 	if (wmext != NULL) {
5702 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5703 		    "%s: extent headers contain allocations not in "
5704 		    "the metadb\n\n"), np->cname);
5705 		rval = -1;
5706 	}
5707 
5708 	if (unitext != NULL) {
5709 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5710 		    "%s: metadb contains allocations not in the extent "
5711 		    "headers\n\n"), np->cname);
5712 		rval = -1;
5713 	}
5714 
5715 	if (options & MDCMD_PRINT) {
5716 		if (rval == 0) {
5717 			(void) printf(dgettext(TEXT_DOMAIN,
5718 			    "%s: Soft Partition metadb matches extent "
5719 			    "header configuration\n"), np->cname);
5720 		} else {
5721 			(void) printf(dgettext(TEXT_DOMAIN,
5722 			    "%s: Soft Partition metadb does not match extent "
5723 			    "header configuration\n"), np->cname);
5724 		}
5725 	}
5726 
5727 	return (rval);
5728 }
5729 
5730 /*
5731  * FUNCTION:	meta_sp_validate_exts()
5732  * INPUT:	compnp	- name pointer for device we are recovering from
5733  *		wmext	- extent node representing watermark
5734  *		unitext	- extent node from unit structure
5735  * OUTPUT:	ep	- return error pointer
5736  * RETURNS:	int	- 0 - succes, mdmderror return code - error
5737  * PURPOSE:	Takes two extent nodes and checks them against each other.
5738  *		offset, length, sequence number, set, and name are compared.
5739  */
5740 static int
5741 meta_sp_validate_exts(
5742 	mdname_t	*compnp,
5743 	sp_ext_node_t	*wmext,
5744 	sp_ext_node_t	*unitext,
5745 	md_error_t	*ep
5746 )
5747 {
5748 	if (wmext->ext_offset != unitext->ext_offset) {
5749 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5750 		    "%s: unit structure and extent header offsets differ.\n"),
5751 		    compnp->cname);
5752 		return (mdmderror(ep, MDE_RECOVER_FAILED, 0, compnp->cname));
5753 	}
5754 
5755 	if (wmext->ext_length != unitext->ext_length) {
5756 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5757 		    "%s: unit structure and extent header lengths differ.\n"),
5758 		    compnp->cname);
5759 		return (mdmderror(ep, MDE_RECOVER_FAILED, 0, compnp->cname));
5760 	}
5761 
5762 	if (wmext->ext_seq != unitext->ext_seq) {
5763 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5764 		    "%s: unit structure and extent header sequence numbers "
5765 		    "differ.\n"), compnp->cname);
5766 		return (mdmderror(ep, MDE_RECOVER_FAILED, 0, compnp->cname));
5767 	}
5768 
5769 	if (wmext->ext_type != unitext->ext_type) {
5770 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5771 		    "%s: unit structure and extent header types differ.\n"),
5772 		    compnp->cname);
5773 		return (mdmderror(ep, MDE_RECOVER_FAILED, 0, compnp->cname));
5774 	}
5775 
5776 	/*
5777 	 * If one has a set pointer and the other doesn't, error.
5778 	 * If both extents have setnames, then make sure they match
5779 	 * If both are NULL, it's ok, they match.
5780 	 */
5781 	if ((unitext->ext_setp == NULL) ^ (wmext->ext_setp == NULL)) {
5782 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5783 		    "%s: unit structure and extent header set values "
5784 		    "differ.\n"), compnp->cname);
5785 		return (mdmderror(ep, MDE_RECOVER_FAILED, 0, compnp->cname));
5786 	}
5787 
5788 	if (unitext->ext_setp != NULL) {
5789 		if (strcmp(unitext->ext_setp->setname,
5790 		    wmext->ext_setp->setname) != 0) {
5791 			(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5792 			    "%s: unit structure and extent header set names "
5793 			    "differ.\n"), compnp->cname);
5794 			return (mdmderror(ep, MDE_RECOVER_FAILED,
5795 			    0, compnp->cname));
5796 		}
5797 	}
5798 
5799 	/*
5800 	 * If one has a name pointer and the other doesn't, error.
5801 	 * If both extents have names, then make sure they match
5802 	 * If both are NULL, it's ok, they match.
5803 	 */
5804 	if ((unitext->ext_namep == NULL) ^ (wmext->ext_namep == NULL)) {
5805 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5806 		    "%s: unit structure and extent header name values "
5807 		    "differ.\n"), compnp->cname);
5808 		return (mdmderror(ep, MDE_RECOVER_FAILED, 0, compnp->cname));
5809 	}
5810 
5811 	if (unitext->ext_namep != NULL) {
5812 		if (strcmp(wmext->ext_namep->cname,
5813 		    unitext->ext_namep->cname) != 0) {
5814 			(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5815 			    "%s: unit structure and extent header names "
5816 			    "differ.\n"), compnp->cname);
5817 			return (mdmderror(ep, MDE_RECOVER_FAILED,
5818 			    0, compnp->cname));
5819 		}
5820 	}
5821 
5822 	return (0);
5823 }
5824 
5825 /*
5826  * FUNCTION:	update_sp_status()
5827  * INPUT:	sp	- name of set we are recovering in
5828  *		minors	- pointer to an array of soft partition minor numbers
5829  *		num_sps	- number of minor numbers in array
5830  *		status	- new status to be applied to all soft parts in array
5831  *		mn_set	- set if current set is a multi-node set
5832  * OUTPUT:	ep	- return error pointer
5833  * RETURNS:	int	- 0 - success, -1 - error
5834  * PURPOSE:	update  status of soft partitions to new status. minors is an
5835  *		array of minor numbers to apply the new status to.
5836  *		If mn_set is set, a message is sent to all nodes in the
5837  *		cluster to update the status locally.
5838  */
5839 static int
5840 update_sp_status(
5841 	mdsetname_t	*sp,
5842 	minor_t		*minors,
5843 	int		num_sps,
5844 	sp_status_t	status,
5845 	bool_t		mn_set,
5846 	md_error_t	*ep
5847 )
5848 {
5849 	int	i;
5850 	int	err = 0;
5851 
5852 	if (mn_set) {
5853 		md_mn_msg_sp_setstat_t	sp_setstat_params;
5854 		int			result;
5855 		md_mn_result_t		*resp = NULL;
5856 
5857 		for (i = 0; i < num_sps; i++) {
5858 			sp_setstat_params.sp_setstat_mnum = minors[i];
5859 			sp_setstat_params.sp_setstat_status = status;
5860 
5861 			result = mdmn_send_message(sp->setno,
5862 			    MD_MN_MSG_SP_SETSTAT, MD_MSGF_DEFAULT_FLAGS,
5863 			    (char *)&sp_setstat_params,
5864 			    sizeof (sp_setstat_params),
5865 			    &resp, ep);
5866 			if (resp != NULL) {
5867 				if (resp->mmr_exitval != 0)
5868 					err = -1;
5869 				free_result(resp);
5870 			}
5871 			if (result != 0) {
5872 				err = -1;
5873 			}
5874 		}
5875 	} else {
5876 		if (meta_sp_setstatus(sp, minors, num_sps, status, ep) < 0)
5877 			err = -1;
5878 	}
5879 	if (err < 0) {
5880 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5881 		    "Error updating status on recovered soft "
5882 		    "partitions.\n"));
5883 	}
5884 	return (err);
5885 }
5886 
5887 /*
5888  * FUNCTION:	meta_sp_recover_from_wm()
5889  * INPUT:	sp	- name of set we are recovering in
5890  *		compnp	- name pointer for component we are recovering from
5891  *		options	- metarecover options
5892  * OUTPUT:	ep	- return error pointer
5893  * RETURNS:	int	- 0 - success, -1 - error
5894  * PURPOSE:	update metadb records to match watermarks.  begin by getting
5895  *		an extlist representing all soft partitions on the component.
5896  *		then build a unit structure for each soft partition.
5897  *		notify user of changes, then commit each soft partition to
5898  *		the metadb one at a time in the "recovering" state.  update
5899  *		any watermarks that may need it	(to reflect possible name
5900  *		changes), and, finally, set the status of all recovered
5901  *		partitions to the "OK" state at once.
5902  */
5903 static int
5904 meta_sp_recover_from_wm(
5905 	mdsetname_t	*sp,
5906 	mdname_t	*compnp,
5907 	mdcmdopts_t	options,
5908 	md_error_t	*ep
5909 )
5910 {
5911 	sp_ext_node_t		*extlist = NULL;
5912 	sp_ext_node_t		*sp_list = NULL;
5913 	sp_ext_node_t		*update_list = NULL;
5914 	sp_ext_node_t		*ext;
5915 	sp_ext_node_t		*sp_ext;
5916 	mp_unit_t		*mp;
5917 	mp_unit_t		**un_array;
5918 	int			numexts = 0, num_sps = 0, i = 0;
5919 	int			err = 0;
5920 	int			not_recovered = 0;
5921 	int			committed = 0;
5922 	sp_ext_length_t		sp_length = 0LL;
5923 	mdnamelist_t		*keynlp = NULL;
5924 	mdname_t		*np;
5925 	mdname_t		*new_np;
5926 	int			new_name;
5927 	md_set_params_t		set_params;
5928 	minor_t			*minors = NULL;
5929 	char			yesno[255];
5930 	char			*yes;
5931 	bool_t			mn_set = 0;
5932 	md_set_desc		*sd;
5933 	mm_unit_t		*mm;
5934 	md_set_mmown_params_t	*ownpar = NULL;
5935 	int			comp_is_mirror = 0;
5936 
5937 	/*
5938 	 * if this component appears in another metadevice already, do
5939 	 * NOT recover from it.
5940 	 */
5941 	if (meta_check_inmeta(sp, compnp, options, 0, -1, ep) != 0)
5942 		return (-1);
5943 
5944 	/* set flag if dealing with a MN set */
5945 	if (!metaislocalset(sp)) {
5946 		if ((sd = metaget_setdesc(sp, ep)) == NULL) {
5947 			return (-1);
5948 		}
5949 		if (MD_MNSET_DESC(sd))
5950 			mn_set = 1;
5951 	}
5952 	/*
5953 	 * for each watermark, build an ext_node, place on list.
5954 	 */
5955 	if (meta_sp_extlist_from_wm(sp, compnp, &extlist,
5956 	    meta_sp_cmp_by_nameseq, ep) < 0)
5957 		return (mdmderror(ep, MDE_RECOVER_FAILED, 0, compnp->cname));
5958 
5959 	assert(extlist != NULL);
5960 
5961 	/* count number of soft partitions */
5962 	for (ext = extlist;
5963 	    ext != NULL && ext->ext_type == EXTTYP_ALLOC;
5964 	    ext = ext->ext_next) {
5965 		if (ext->ext_next != NULL &&
5966 		    ext->ext_next->ext_namep != NULL &&
5967 		    strcmp(ext->ext_next->ext_namep->cname,
5968 			ext->ext_namep->cname) == 0)
5969 				continue;
5970 		num_sps++;
5971 	}
5972 
5973 	/* allocate array of unit structure pointers */
5974 	un_array = Zalloc(num_sps * sizeof (mp_unit_t *));
5975 
5976 	/*
5977 	 * build unit structures from list of ext_nodes.
5978 	 */
5979 	for (ext = extlist;
5980 	    ext != NULL && ext->ext_type == EXTTYP_ALLOC;
5981 	    ext = ext->ext_next) {
5982 		meta_sp_list_insert(ext->ext_setp, ext->ext_namep,
5983 		    &sp_list, ext->ext_offset, ext->ext_length,
5984 		    ext->ext_type, ext->ext_seq, ext->ext_flags,
5985 		    meta_sp_cmp_by_nameseq);
5986 
5987 		numexts++;
5988 		sp_length += ext->ext_length - MD_SP_WMSIZE;
5989 
5990 		if (ext->ext_next != NULL &&
5991 		    ext->ext_next->ext_namep != NULL &&
5992 		    strcmp(ext->ext_next->ext_namep->cname,
5993 			ext->ext_namep->cname) == 0)
5994 				continue;
5995 
5996 		/*
5997 		 * if we made it here, we are at a soft partition
5998 		 * boundary in the list.
5999 		 */
6000 		if (getenv(META_SP_DEBUG)) {
6001 			meta_sp_debug("meta_recover_from_wm: dumping wm "
6002 			    "list:\n");
6003 			meta_sp_list_dump(sp_list);
6004 		}
6005 
6006 		assert(sp_list != NULL);
6007 		assert(sp_list->ext_namep != NULL);
6008 
6009 		if ((new_name = meta_sp_resolve_name_conflict(sp,
6010 		    sp_list->ext_namep, &new_np, ep)) < 0) {
6011 			err = 1;
6012 			goto out;
6013 		} else if (new_name) {
6014 			for (sp_ext = sp_list;
6015 			    sp_ext != NULL;
6016 			    sp_ext = sp_ext->ext_next) {
6017 				/*
6018 				 * insert into the update list for
6019 				 * watermark update.
6020 				 */
6021 				meta_sp_list_insert(sp_ext->ext_setp,
6022 				    new_np, &update_list, sp_ext->ext_offset,
6023 				    sp_ext->ext_length, sp_ext->ext_type,
6024 				    sp_ext->ext_seq, EXTFLG_UPDATE,
6025 				    meta_sp_cmp_by_offset);
6026 			}
6027 
6028 		}
6029 		if (options & MDCMD_DOIT) {
6030 			/* store name in namespace */
6031 			if (mn_set) {
6032 				/* send message to all nodes to return key */
6033 				md_mn_msg_addkeyname_t	*send_params;
6034 				int			result;
6035 				md_mn_result_t		*resp = NULL;
6036 				int			message_size;
6037 
6038 				message_size =  sizeof (*send_params) +
6039 				    strlen(compnp->cname) + 1;
6040 				send_params = Zalloc(message_size);
6041 				send_params->addkeyname_setno = sp->setno;
6042 				(void) strcpy(&send_params->addkeyname_name[0],
6043 				    compnp->cname);
6044 				result = mdmn_send_message(sp->setno,
6045 				    MD_MN_MSG_ADDKEYNAME, MD_MSGF_DEFAULT_FLAGS,
6046 				    (char *)send_params, message_size, &resp,
6047 				    ep);
6048 				Free(send_params);
6049 				if (resp != NULL) {
6050 					if (resp->mmr_exitval >= 0) {
6051 						compnp->key =
6052 						    (mdkey_t)resp->mmr_exitval;
6053 					} else {
6054 						err = 1;
6055 						free_result(resp);
6056 						goto out;
6057 					}
6058 					free_result(resp);
6059 				}
6060 				if (result != 0) {
6061 					err = 1;
6062 					goto out;
6063 				}
6064 				(void) metanamelist_append(&keynlp, compnp);
6065 			} else {
6066 				if (add_key_name(sp, compnp, &keynlp,
6067 				    ep) != 0) {
6068 					err = 1;
6069 					goto out;
6070 				}
6071 			}
6072 		}
6073 
6074 		/* create the unit structure */
6075 		if ((mp = meta_sp_createunit(
6076 		    (new_name) ? new_np : sp_list->ext_namep, compnp,
6077 		    sp_list, numexts, sp_length, MD_SP_RECOVER, ep)) == NULL) {
6078 			err = 1;
6079 			goto out;
6080 		}
6081 
6082 		if (getenv(META_SP_DEBUG)) {
6083 			meta_sp_debug("meta_sp_recover_from_wm: "
6084 			    "printing newly created unit structure");
6085 			meta_sp_printunit(mp);
6086 		}
6087 
6088 		/* place in unit structure array */
6089 		un_array[i++] = mp;
6090 
6091 		/* free sp_list */
6092 		meta_sp_list_free(&sp_list);
6093 		sp_list = NULL;
6094 		numexts = 0;
6095 		sp_length = 0LL;
6096 	}
6097 
6098 	/* display configuration updates */
6099 	(void) printf(dgettext(TEXT_DOMAIN,
6100 	    "The following soft partitions were found and will be added to\n"
6101 	    "your metadevice configuration.\n"));
6102 	(void) printf("%5s %15s %18s\n",
6103 	    dgettext(TEXT_DOMAIN, "Name"),
6104 	    dgettext(TEXT_DOMAIN, "Size"),
6105 	    dgettext(TEXT_DOMAIN, "No. of Extents"));
6106 	for (i = 0; i < num_sps; i++) {
6107 		(void) printf("%5s%lu %15llu %9d\n", "d",
6108 		    MD_MIN2UNIT(MD_SID(un_array[i])),
6109 		    un_array[i]->un_length, un_array[i]->un_numexts);
6110 	}
6111 
6112 	if (!(options & MDCMD_DOIT)) {
6113 		not_recovered = 1;
6114 		goto out;
6115 	}
6116 
6117 	/* ask user for confirmation */
6118 	(void) printf(dgettext(TEXT_DOMAIN,
6119 	    "WARNING: You are about to add one or more soft partition\n"
6120 	    "metadevices to your metadevice configuration.  If there\n"
6121 	    "appears to be an error in the soft partition(s) displayed\n"
6122 	    "above, do NOT proceed with this recovery operation.\n"));
6123 	(void) printf(dgettext(TEXT_DOMAIN,
6124 	    "Are you sure you want to do this (yes/no)? "));
6125 
6126 	(void) fflush(stdout);
6127 	if ((fgets(yesno, sizeof (yesno), stdin) == NULL) ||
6128 	    (strlen(yesno) == 1))
6129 		(void) snprintf(yesno, sizeof (yesno), "%s\n",
6130 		    dgettext(TEXT_DOMAIN, "no"));
6131 	yes = dgettext(TEXT_DOMAIN, "yes");
6132 	if (strncasecmp(yesno, yes, strlen(yesno) - 1) != 0) {
6133 		not_recovered = 1;
6134 		goto out;
6135 	}
6136 
6137 	/* commit records one at a time */
6138 	for (i = 0; i < num_sps; i++) {
6139 		(void) memset(&set_params, 0, sizeof (set_params));
6140 		set_params.mnum = MD_SID(un_array[i]);
6141 		set_params.size = (un_array[i])->c.un_size;
6142 		set_params.mdp = (uintptr_t)(un_array[i]);
6143 		set_params.options =
6144 				meta_check_devicesize(un_array[i]->un_length);
6145 		if (set_params.options == MD_CRO_64BIT) {
6146 			un_array[i]->c.un_revision |= MD_64BIT_META_DEV;
6147 		} else {
6148 			un_array[i]->c.un_revision &= ~MD_64BIT_META_DEV;
6149 		}
6150 		MD_SETDRIVERNAME(&set_params, MD_SP,
6151 		    MD_MIN2SET(set_params.mnum));
6152 
6153 		np = metamnumname(&sp, MD_SID(un_array[i]), 0, ep);
6154 
6155 		/*
6156 		 * If this is an MN set, send the MD_IOCSET ioctl to all nodes
6157 		 */
6158 		if (mn_set) {
6159 			md_mn_msg_iocset_t	send_params;
6160 			int			result;
6161 			md_mn_result_t		*resp = NULL;
6162 			int			mess_size;
6163 
6164 			/*
6165 			 * Calculate message size. md_mn_msg_iocset_t only
6166 			 * contains one extent, so increment the size to
6167 			 * include all extents
6168 			 */
6169 			mess_size = sizeof (send_params) -
6170 			    sizeof (mp_ext_t) +
6171 			    (un_array[i]->un_numexts * sizeof (mp_ext_t));
6172 
6173 			send_params.iocset_params = set_params;
6174 			(void) memcpy(&send_params.unit, un_array[i],
6175 			    sizeof (*un_array[i]) - sizeof (mp_ext_t) +
6176 			    (un_array[i]->un_numexts * sizeof (mp_ext_t)));
6177 			result = mdmn_send_message(sp->setno,
6178 			    MD_MN_MSG_IOCSET, MD_MSGF_DEFAULT_FLAGS,
6179 			    (char *)&send_params, mess_size, &resp,
6180 			    ep);
6181 			if (resp != NULL) {
6182 				if (resp->mmr_exitval != 0)
6183 					err = 1;
6184 				free_result(resp);
6185 			}
6186 			if (result != 0) {
6187 				err = 1;
6188 			}
6189 		} else {
6190 			if (metaioctl(MD_IOCSET, &set_params, &set_params.mde,
6191 			    np->cname) != 0) {
6192 				err = 1;
6193 			}
6194 		}
6195 
6196 		if (err == 1) {
6197 			(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
6198 			    "%s: Error committing record to metadb.\n"),
6199 			    np->cname);
6200 			goto out;
6201 		}
6202 
6203 		/* note that we've committed a record */
6204 		if (!committed)
6205 			committed = 1;
6206 
6207 		/* update any watermarks that need it */
6208 		if (update_list != NULL) {
6209 			md_sp_t *msp;
6210 
6211 			/*
6212 			 * Check to see if we're trying to create a partition
6213 			 * on a mirror. If so we may have to enforce an
6214 			 * ownership change before writing the watermark out.
6215 			 */
6216 			if (metaismeta(compnp)) {
6217 				char *miscname;
6218 
6219 				miscname = metagetmiscname(compnp, ep);
6220 				if (miscname != NULL)
6221 					comp_is_mirror = (strcmp(miscname,
6222 					    MD_MIRROR) == 0);
6223 				else
6224 					comp_is_mirror = 0;
6225 			}
6226 			/*
6227 			 * If this is a MN set and the component is a mirror,
6228 			 * change ownership to this node in order to write the
6229 			 * watermarks
6230 			 */
6231 			if (mn_set && comp_is_mirror) {
6232 				mm = (mm_unit_t *)meta_get_unit(sp, compnp, ep);
6233 				if (mm == NULL) {
6234 					err = 1;
6235 					goto out;
6236 				} else {
6237 					err = meta_mn_change_owner(&ownpar,
6238 						sp->setno,
6239 						meta_getminor(compnp->dev),
6240 						sd->sd_mn_mynode->nd_nodeid,
6241 						MD_MN_MM_PREVENT_CHANGE |
6242 						    MD_MN_MM_SPAWN_THREAD);
6243 					if (err != 0)
6244 						goto out;
6245 				}
6246 			}
6247 
6248 			if ((msp = meta_get_sp(sp, np, ep)) == NULL) {
6249 				err = 1;
6250 				(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
6251 				    "%s: Error updating extent headers.\n"),
6252 				    np->cname);
6253 				goto out;
6254 			}
6255 			if (meta_sp_update_wm(sp, msp, update_list, ep) < 0) {
6256 				err = 1;
6257 				(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
6258 				    "%s: Error updating extent headers "
6259 				    "on disk.\n"), np->cname);
6260 				goto out;
6261 			}
6262 		}
6263 		/*
6264 		 * If we have changed ownership earlier and prevented any
6265 		 * ownership changes, we can now allow ownership changes
6266 		 * again.
6267 		 */
6268 		if (ownpar) {
6269 			(void) meta_mn_change_owner(&ownpar, sp->setno,
6270 			    ownpar->d.mnum,
6271 			    ownpar->d.owner,
6272 			    MD_MN_MM_ALLOW_CHANGE | MD_MN_MM_SPAWN_THREAD);
6273 		}
6274 	}
6275 
6276 	/* update status of all soft partitions to OK */
6277 	minors = Zalloc(num_sps * sizeof (minor_t));
6278 	for (i = 0; i < num_sps; i++)
6279 		minors[i] = MD_SID(un_array[i]);
6280 
6281 	err = update_sp_status(sp, minors, num_sps, MD_SP_OK, mn_set, ep);
6282 	if (err != 0)
6283 		goto out;
6284 
6285 	if (options & MDCMD_PRINT)
6286 		(void) printf(dgettext(TEXT_DOMAIN, "%s: "
6287 		    "Soft Partitions recovered from device.\n"),
6288 		    compnp->cname);
6289 out:
6290 	/* free memory */
6291 	if (extlist != NULL)
6292 		meta_sp_list_free(&extlist);
6293 	if (sp_list != NULL)
6294 		meta_sp_list_free(&sp_list);
6295 	if (update_list != NULL)
6296 		meta_sp_list_free(&update_list);
6297 	if (un_array != NULL)	{
6298 		for (i = 0; i < num_sps; i++)
6299 			Free(un_array[i]);
6300 		Free(un_array);
6301 	}
6302 	if (minors != NULL)
6303 		Free(minors);
6304 	if (ownpar != NULL)
6305 		Free(ownpar);
6306 	(void) fflush(stdout);
6307 
6308 	if ((keynlp != NULL) && (committed != 1)) {
6309 		/*
6310 		 * if we haven't committed any softparts, either because of an
6311 		 * error or because the user decided not to proceed, delete
6312 		 * namelist key for the component
6313 		 */
6314 		if (mn_set) {
6315 			mdnamelist_t	*p;
6316 
6317 			for (p = keynlp; (p != NULL); p = p->next) {
6318 				mdname_t		*np = p->namep;
6319 				md_mn_msg_delkeyname_t	send_params;
6320 				md_mn_result_t		*resp = NULL;
6321 
6322 				send_params.delkeyname_dev = np->dev;
6323 				send_params.delkeyname_setno = sp->setno;
6324 				send_params.delkeyname_key = np->key;
6325 				(void) mdmn_send_message(sp->setno,
6326 				    MD_MN_MSG_DELKEYNAME, MD_MSGF_DEFAULT_FLAGS,
6327 				    (char *)&send_params, sizeof (send_params),
6328 				    &resp, ep);
6329 				if (resp != NULL) {
6330 					free_result(resp);
6331 				}
6332 			}
6333 		} else {
6334 			(void) del_key_names(sp, keynlp, NULL);
6335 		}
6336 	}
6337 
6338 	metafreenamelist(keynlp);
6339 
6340 	if (err)
6341 		return (mdmderror(ep, MDE_RECOVER_FAILED, 0, compnp->cname));
6342 
6343 	if (not_recovered)
6344 		if (options & MDCMD_PRINT)
6345 			(void) printf(dgettext(TEXT_DOMAIN, "%s: "
6346 			    "Soft Partitions NOT recovered from device.\n"),
6347 			    compnp->cname);
6348 	return (0);
6349 }
6350 
6351 /*
6352  * FUNCTION:	meta_sp_recover_from_unit()
6353  * INPUT:	sp	- name of set we are recovering in
6354  *		compnp	- name of component we are recovering from
6355  *		options	- metarecover options
6356  * OUTPUT:	ep	- return error pointer
6357  * RETURNS:	int	- 0 - success, -1 - error
6358  * PURPOSE:	update watermarks to match metadb records.  begin by getting
6359  *		a namelist representing all soft partitions on the specified
6360  *		component.  then, build an extlist representing the soft
6361  *		partitions, filling in the freespace extents.  notify user
6362  *		of changes, place all soft partitions into the "recovering"
6363  *		state and update the watermarks.  finally, return all soft
6364  *		partitions to the "OK" state.
6365  */
6366 static int
6367 meta_sp_recover_from_unit(
6368 	mdsetname_t	*sp,
6369 	mdname_t	*compnp,
6370 	mdcmdopts_t	options,
6371 	md_error_t	*ep
6372 )
6373 {
6374 	mdnamelist_t	*spnlp = NULL;
6375 	mdnamelist_t	*nlp = NULL;
6376 	sp_ext_node_t	*ext = NULL;
6377 	sp_ext_node_t	*extlist = NULL;
6378 	int		count;
6379 	char		yesno[255];
6380 	char		*yes;
6381 	int		rval = 0;
6382 	minor_t		*minors = NULL;
6383 	int		i;
6384 	md_sp_t		*msp;
6385 	md_set_desc	*sd;
6386 	bool_t		mn_set = 0;
6387 	daddr_t		start_block;
6388 
6389 	count = meta_sp_get_by_component(sp, compnp, &spnlp, 0, ep);
6390 	if (count <= 0)
6391 		return (-1);
6392 
6393 	/* set flag if dealing with a MN set */
6394 	if (!metaislocalset(sp)) {
6395 		if ((sd = metaget_setdesc(sp, ep)) == NULL) {
6396 			return (-1);
6397 		}
6398 		if (MD_MNSET_DESC(sd))
6399 			mn_set = 1;
6400 	}
6401 	/*
6402 	 * Save the XDR unit structure for one of the soft partitions;
6403 	 * we'll use this later to provide metadevice context to
6404 	 * update the watermarks so the device can be resolved by
6405 	 * devid instead of dev_t.
6406 	 */
6407 	if ((msp = meta_get_sp(sp, spnlp->namep, ep)) == NULL) {
6408 		metafreenamelist(spnlp);
6409 		return (-1);
6410 	}
6411 
6412 	if ((start_block = meta_sp_get_start(sp, compnp, ep)) ==
6413 	    MD_DISKADDR_ERROR) {
6414 		return (-1);
6415 	}
6416 
6417 	meta_sp_list_insert(NULL, NULL, &extlist, 0ULL, start_block,
6418 	    EXTTYP_RESERVED, 0, 0, meta_sp_cmp_by_offset);
6419 	meta_sp_list_insert(NULL, NULL, &extlist,
6420 	    metagetsize(compnp, ep) - MD_SP_WMSIZE, MD_SP_WMSIZE,
6421 	    EXTTYP_END, 0, EXTFLG_UPDATE, meta_sp_cmp_by_offset);
6422 
6423 	if (meta_sp_extlist_from_namelist(sp, spnlp, &extlist, ep) == -1) {
6424 		metafreenamelist(spnlp);
6425 		return (-1);
6426 	}
6427 
6428 	assert(extlist != NULL);
6429 	if ((options & MDCMD_VERBOSE) != 0) {
6430 		(void) printf(dgettext(TEXT_DOMAIN,
6431 		    "Updating extent headers on device %s from metadb.\n\n"),
6432 		    compnp->cname);
6433 		(void) printf(dgettext(TEXT_DOMAIN,
6434 		    "The following extent headers will be written:\n"));
6435 		meta_sp_display_exthdr();
6436 	}
6437 
6438 	meta_sp_list_freefill(&extlist, metagetsize(compnp, ep));
6439 
6440 	for (ext = extlist; ext != NULL; ext = ext->ext_next) {
6441 
6442 		/* mark every node for updating except the reserved space */
6443 		if (ext->ext_type != EXTTYP_RESERVED) {
6444 			ext->ext_flags |= EXTFLG_UPDATE;
6445 
6446 			/* print extent information */
6447 			if ((options & MDCMD_VERBOSE) != 0)
6448 				meta_sp_display_ext(ext);
6449 		}
6450 	}
6451 
6452 	/* request verification and then update all watermarks */
6453 	if ((options & MDCMD_DOIT) != 0) {
6454 
6455 		(void) printf(dgettext(TEXT_DOMAIN,
6456 		    "\nWARNING: You are about to overwrite portions of %s\n"
6457 		    "with soft partition metadata. The extent headers will be\n"
6458 		    "written to match the existing metadb configuration.  If\n"
6459 		    "the device was not previously setup with this\n"
6460 		    "configuration, data loss may result.\n\n"),
6461 		    compnp->cname);
6462 		(void) printf(dgettext(TEXT_DOMAIN,
6463 		    "Are you sure you want to do this (yes/no)? "));
6464 
6465 		(void) fflush(stdout);
6466 		if ((fgets(yesno, sizeof (yesno), stdin) == NULL) ||
6467 		    (strlen(yesno) == 1))
6468 			(void) snprintf(yesno, sizeof (yesno),
6469 			    "%s\n", dgettext(TEXT_DOMAIN, "no"));
6470 		yes = dgettext(TEXT_DOMAIN, "yes");
6471 		if (strncasecmp(yesno, yes, strlen(yesno) - 1) == 0) {
6472 			/* place soft partitions into recovering state */
6473 			minors = Zalloc(count * sizeof (minor_t));
6474 			for (nlp = spnlp, i = 0;
6475 			    nlp != NULL && i < count;
6476 			    nlp = nlp->next, i++) {
6477 				assert(nlp->namep != NULL);
6478 				minors[i] = meta_getminor(nlp->namep->dev);
6479 			}
6480 			if (update_sp_status(sp, minors, count,
6481 			    MD_SP_RECOVER, mn_set, ep) != 0) {
6482 				rval = -1;
6483 				goto out;
6484 			}
6485 
6486 			/* update the watermarks */
6487 			if (meta_sp_update_wm(sp, msp, extlist, ep) < 0) {
6488 				rval = -1;
6489 				goto out;
6490 			}
6491 
6492 			if (options & MDCMD_PRINT) {
6493 				(void) printf(dgettext(TEXT_DOMAIN, "%s: "
6494 				    "Soft Partitions recovered from metadb\n"),
6495 				    compnp->cname);
6496 			}
6497 
6498 			/* return soft partitions to the OK state */
6499 			if (update_sp_status(sp, minors, count,
6500 			    MD_SP_OK, mn_set, ep) != 0) {
6501 				rval = -1;
6502 				goto out;
6503 			}
6504 
6505 			rval = 0;
6506 			goto out;
6507 		}
6508 	}
6509 
6510 	if (options & MDCMD_PRINT) {
6511 		(void) printf(dgettext(TEXT_DOMAIN,
6512 		    "%s: Soft Partitions NOT recovered from metadb\n"),
6513 		    compnp->cname);
6514 	}
6515 
6516 out:
6517 	if (minors != NULL)
6518 		Free(minors);
6519 	metafreenamelist(spnlp);
6520 	meta_sp_list_free(&extlist);
6521 	(void) fflush(stdout);
6522 	return (rval);
6523 }
6524 
6525 
6526 /*
6527  * FUNCTION:	meta_sp_update_abr()
6528  * INPUT:	sp	- name of set we are recovering in
6529  * OUTPUT:	ep	- return error pointer
6530  * RETURNS:	int	- 0 - success, -1 - error
6531  * PURPOSE:	update the ABR state for all soft partitions in the set. This
6532  *		is called when joining a set. It sends a message to the master
6533  *		node for each soft partition to get the value of tstate and
6534  *		then sets ABR ,if required, by opening the sp, setting ABR
6535  *		and then closing the sp. This approach is taken rather that
6536  *		just issuing the MD_MN_SET_CAP ioctl, in order to deal with
6537  *		the case when we have another node simultaneously unsetting ABR.
6538  */
6539 int
6540 meta_sp_update_abr(
6541 	mdsetname_t	*sp,
6542 	md_error_t	*ep
6543 )
6544 {
6545 	mdnamelist_t	*devnlp = NULL;
6546 	mdnamelist_t	*p;
6547 	mdname_t	*devnp = NULL;
6548 	md_unit_t	*un;
6549 	char		fname[MAXPATHLEN];
6550 	int		mnum, fd;
6551 	volcap_t	vc;
6552 	uint_t		tstate;
6553 
6554 
6555 	if (meta_get_sp_names(sp, &devnlp, 0, ep) < 0) {
6556 		return (-1);
6557 	}
6558 
6559 	/* Exit if no soft partitions in this set */
6560 	if (devnlp == NULL)
6561 		return (0);
6562 
6563 	/* For each soft partition */
6564 	for (p = devnlp; (p != NULL); p = p->next) {
6565 		devnp = p->namep;
6566 
6567 		/* check if this is a top level metadevice */
6568 		if ((un = meta_get_mdunit(sp, devnp, ep)) == NULL)
6569 			goto out;
6570 		if (MD_HAS_PARENT(MD_PARENT(un))) {
6571 			Free(un);
6572 			continue;
6573 		}
6574 		Free(un);
6575 
6576 		/* Get tstate from Master */
6577 		if (meta_mn_send_get_tstate(devnp->dev, &tstate, ep) != 0) {
6578 			mdname_t	*np;
6579 			np = metamnumname(&sp, meta_getminor(devnp->dev), 0,
6580 			    ep);
6581 			if (np) {
6582 				md_perror(dgettext(TEXT_DOMAIN,
6583 				    "Unable to get tstate for %s"), np->cname);
6584 			}
6585 			continue;
6586 		}
6587 		/* If not set on the master, nothing to do */
6588 		if (!(tstate & MD_ABR_CAP))
6589 			continue;
6590 
6591 		mnum = meta_getminor(devnp->dev);
6592 		(void) snprintf(fname, MAXPATHLEN, "/dev/md/%s/rdsk/d%u",
6593 		    sp->setname, (unsigned)MD_MIN2UNIT(mnum));
6594 		if ((fd = open(fname, O_RDWR, 0)) < 0) {
6595 			md_perror(dgettext(TEXT_DOMAIN,
6596 			    "Could not open device %s"), fname);
6597 			continue;
6598 		}
6599 
6600 		/* Set ABR state */
6601 		vc.vc_info = 0;
6602 		vc.vc_set = 0;
6603 		if (ioctl(fd, DKIOCGETVOLCAP, &vc) < 0) {
6604 			(void) close(fd);
6605 			continue;
6606 		}
6607 
6608 		vc.vc_set = DKV_ABR_CAP;
6609 		if (ioctl(fd, DKIOCSETVOLCAP, &vc) < 0) {
6610 			(void) close(fd);
6611 			goto out;
6612 		}
6613 
6614 		(void) close(fd);
6615 	}
6616 	metafreenamelist(devnlp);
6617 	return (0);
6618 out:
6619 	metafreenamelist(devnlp);
6620 	return (-1);
6621 }
6622 
6623 /*
6624  * FUNCTION:	meta_mn_sp_update_abr()
6625  * INPUT:	arg	- Given set.
6626  * PURPOSE:	update the ABR state for all soft partitions in the set by
6627  *		forking a process to call meta_sp_update_abr()
6628  *		This function is only called via rpc.metad when adding a node
6629  *		to a set, ie this node is beong joined to the set by another
6630  *		node.
6631  */
6632 void *
6633 meta_mn_sp_update_abr(void *arg)
6634 {
6635 	set_t		setno = *((set_t *)arg);
6636 	mdsetname_t	*sp;
6637 	md_error_t	mde = mdnullerror;
6638 	int		fval;
6639 
6640 	/* should have a set */
6641 	assert(setno != NULL);
6642 
6643 	if ((sp = metasetnosetname(setno, &mde)) == NULL) {
6644 		mde_perror(&mde, "");
6645 		return (NULL);
6646 	}
6647 
6648 	if (!(meta_is_mn_set(sp, &mde))) {
6649 		mde_perror(&mde, "");
6650 		return (NULL);
6651 	}
6652 
6653 	/* fork a process */
6654 	if ((fval = md_daemonize(sp, &mde)) != 0) {
6655 		/*
6656 		 * md_daemonize will fork off a process.  The is the
6657 		 * parent or error.
6658 		 */
6659 		if (fval > 0) {
6660 			return (NULL);
6661 		}
6662 		mde_perror(&mde, "");
6663 		return (NULL);
6664 	}
6665 	/*
6666 	 * Child process should never return back to rpc.metad, but
6667 	 * should exit.
6668 	 * Flush all internally cached data inherited from parent process
6669 	 * since cached data will be cleared when parent process RPC request
6670 	 * has completed (which is possibly before this child process
6671 	 * can complete).
6672 	 * Child process can retrieve and cache its own copy of data from
6673 	 * rpc.metad that won't be changed by the parent process.
6674 	 *
6675 	 * Reset md_in_daemon since this child will be a client of rpc.metad
6676 	 * not part of the rpc.metad daemon itself.
6677 	 * md_in_daemon is used by rpc.metad so that libmeta can tell if
6678 	 * this thread is rpc.metad or any other thread.  (If this thread
6679 	 * was rpc.metad it could use some short circuit code to get data
6680 	 * directly from rpc.metad instead of doing an RPC call to rpc.metad).
6681 	 */
6682 	md_in_daemon = 0;
6683 	metaflushsetname(sp);
6684 	sr_cache_flush_setno(setno);
6685 	if ((sp = metasetnosetname(setno, &mde)) == NULL) {
6686 		mde_perror(&mde, "");
6687 		md_exit(sp, 1);
6688 	}
6689 
6690 
6691 	/*
6692 	 * Closing stdin/out/err here.
6693 	 */
6694 	(void) close(0);
6695 	(void) close(1);
6696 	(void) close(2);
6697 	assert(fval == 0);
6698 
6699 	(void) meta_sp_update_abr(sp, &mde);
6700 
6701 	md_exit(sp, 0);
6702 	/*NOTREACHED*/
6703 	return (NULL);
6704 }
6705