xref: /titanic_41/usr/src/lib/lvm/libmeta/common/meta_sp.c (revision cde2885fdf538266ee2a3b08dee2d5075ce8fa2b)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * Just in case we're not in a build environment, make sure that
29  * TEXT_DOMAIN gets set to something.
30  */
31 #if !defined(TEXT_DOMAIN)
32 #define	TEXT_DOMAIN "SYS_TEST"
33 #endif
34 
35 /*
36  * soft partition operations
37  *
38  * Soft Partitions provide a virtual disk mechanism which is used to
39  * divide a large volume into many small pieces, each appearing as a
40  * separate device.  A soft partition consists of a series of extents,
41  * each having an offset and a length.  The extents are logically
42  * contiguous, so where the first extent leaves off the second extent
43  * picks up.  Which extent a given "virtual offset" belongs to is
44  * dependent on the size of all the previous extents in the soft
45  * partition.
46  *
47  * Soft partitions are represented in memory by an extent node
48  * (sp_ext_node_t) which contains all of the information necessary to
49  * create a unit structure and update the on-disk format, called
50  * "watermarks".  These extent nodes are typically kept in a doubly
51  * linked list and are manipulated by list manipulation routines.  A
52  * list of extents may represent all of the soft partitions on a volume,
53  * a single soft partition, or perhaps just a set of extents that need
54  * to be updated.  Extent lists may be sorted by offset or by name/seq#,
55  * depending on which compare function is used.  Most of the routines
56  * require the list be sorted by offset to work, and that's the typical
57  * configuration.
58  *
59  * In order to do an allocation, knowledge of all soft partitions on the
60  * volume is required.  Then free space is determined from the space
61  * that is not allocated, and new allocations can be made from the free
62  * space.  Once the new allocations are made, a unit structure is created
63  * and the watermarks are updated.  The status is then changed to "okay"
64  * on the unit structure to commit the transaction.  If updating the
65  * watermarks fails, the unit structure is in an intermediate state and
66  * the driver will not allow access to the device.
67  *
68  * A typical sequence of events is:
69  *     1. Fetch the list of names for all soft partitions on a volume
70  *         meta_sp_get_by_component()
71  *     2. Construct an extent list from the name list
72  *         meta_sp_extlist_from_namelist()
73  *     3. Fill the gaps in the extent list with free extents
74  *         meta_sp_list_freefill()
75  *     4. Allocate from the free extents
76  *         meta_sp_alloc_by_len()
77  *         meta_sp_alloc_by_list()
78  *     5. Create the unit structure from the extent list
79  *         meta_sp_createunit()
80  *         meta_sp_updateunit()
81  *     6. Write out the watermarks
82  *         meta_sp_update_wm()
83  *     7. Set the status to "Okay"
84  *         meta_sp_setstatus()
85  *
86  */
87 
88 #include <stdio.h>
89 #include <meta.h>
90 #include "meta_repartition.h"
91 #include <sys/lvm/md_sp.h>
92 #include <sys/lvm/md_crc.h>
93 #include <strings.h>
94 #include <sys/lvm/md_mirror.h>
95 #include <sys/bitmap.h>
96 
97 extern int	md_in_daemon;
98 
99 typedef struct sp_ext_node {
100 	struct sp_ext_node	*ext_next;	/* next element */
101 	struct sp_ext_node	*ext_prev;	/* previous element */
102 	sp_ext_type_t		ext_type;	/* type of extent */
103 	sp_ext_offset_t		ext_offset;	/* starting offset */
104 	sp_ext_length_t		ext_length;	/* length of this node */
105 	uint_t			ext_flags;	/* extent flags */
106 	uint32_t		ext_seq;	/* watermark seq no */
107 	mdname_t		*ext_namep;	/* name pointer */
108 	mdsetname_t		*ext_setp;	/* set pointer */
109 } sp_ext_node_t;
110 
111 /* extent flags */
112 #define	EXTFLG_UPDATE	(1)
113 
114 /* Extent node compare function for list sorting */
115 typedef int (*ext_cmpfunc_t)(sp_ext_node_t *, sp_ext_node_t *);
116 
117 
118 /* Function Prototypes */
119 
120 /* Debugging Functions */
121 static void meta_sp_debug(char *format, ...);
122 static void meta_sp_printunit(mp_unit_t *mp);
123 
124 /* Misc Support Functions */
125 int meta_sp_parsesize(char *s, sp_ext_length_t *szp);
126 static int meta_sp_parsesizestring(char *s, sp_ext_length_t *szp);
127 static int meta_sp_setgeom(mdname_t *np, mdname_t *compnp, mp_unit_t *mp,
128 	md_error_t *ep);
129 static int meta_sp_get_by_component(mdsetname_t *sp, mdname_t *compnp,
130     mdnamelist_t **nlpp, int force, md_error_t *ep);
131 static sp_ext_length_t meta_sp_get_default_alignment(mdsetname_t *sp,
132     mdname_t *compnp, md_error_t *ep);
133 
134 /* Extent List Manipulation Functions */
135 static int meta_sp_cmp_by_nameseq(sp_ext_node_t *e1, sp_ext_node_t *e2);
136 static int meta_sp_cmp_by_offset(sp_ext_node_t *e1, sp_ext_node_t *e2);
137 static void meta_sp_list_insert(mdsetname_t *sp, mdname_t *np,
138     sp_ext_node_t **head, sp_ext_offset_t offset, sp_ext_length_t length,
139     sp_ext_type_t type, uint_t seq, uint_t flags, ext_cmpfunc_t compare);
140 static void meta_sp_list_free(sp_ext_node_t **head);
141 static void meta_sp_list_remove(sp_ext_node_t **head, sp_ext_node_t *ext);
142 static sp_ext_length_t meta_sp_list_size(sp_ext_node_t *head,
143     sp_ext_type_t exttype, int exclude_wm);
144 static sp_ext_node_t *meta_sp_list_find(sp_ext_node_t *head,
145     sp_ext_offset_t offset);
146 static void meta_sp_list_freefill(sp_ext_node_t **extlist,
147     sp_ext_length_t size);
148 static void meta_sp_list_dump(sp_ext_node_t *head);
149 static int meta_sp_list_overlaps(sp_ext_node_t *head);
150 
151 /* Extent List Query Functions */
152 static boolean_t meta_sp_enough_space(int desired_number_of_sps,
153 	blkcnt_t desired_sp_size, sp_ext_node_t **extent_listpp,
154 	sp_ext_length_t alignment);
155 static boolean_t meta_sp_get_extent_list(mdsetname_t *mdsetnamep,
156 	mdname_t *device_mdnamep, sp_ext_node_t **extent_listpp,
157 	md_error_t *ep);
158 static boolean_t meta_sp_get_extent_list_for_drive(mdsetname_t *mdsetnamep,
159 	mddrivename_t *mddrivenamep, sp_ext_node_t **extent_listpp);
160 
161 
162 /* Extent Allocation Functions */
163 static void meta_sp_alloc_by_ext(mdsetname_t *sp, mdname_t *np,
164     sp_ext_node_t **extlist, sp_ext_node_t *free_ext,
165     sp_ext_offset_t alloc_offset, sp_ext_length_t alloc_length, uint_t seq);
166 static int meta_sp_alloc_by_len(mdsetname_t *sp, mdname_t *np,
167     sp_ext_node_t **extlist, sp_ext_length_t *lp,
168     sp_ext_offset_t last_off, sp_ext_length_t alignment);
169 static int meta_sp_alloc_by_list(mdsetname_t *sp, mdname_t *np,
170     sp_ext_node_t **extlist, sp_ext_node_t *oblist);
171 
172 /* Extent List Population Functions */
173 static int meta_sp_extlist_from_namelist(mdsetname_t *sp, mdnamelist_t *spnlp,
174     sp_ext_node_t **extlist, md_error_t *ep);
175 static int meta_sp_extlist_from_wm(mdsetname_t *sp, mdname_t *compnp,
176     sp_ext_node_t **extlist, ext_cmpfunc_t compare, md_error_t *ep);
177 
178 /* Print (metastat) Functions */
179 static int meta_sp_short_print(md_sp_t *msp, char *fname, FILE *fp,
180     mdprtopts_t options, md_error_t *ep);
181 static char *meta_sp_status_to_name(xsp_status_t xsp_status, uint_t tstate);
182 static int meta_sp_report(mdsetname_t *sp, md_sp_t *msp, mdnamelist_t **nlpp,
183     char *fname, FILE *fp, mdprtopts_t options, md_error_t *ep);
184 
185 /* Watermark Manipulation Functions */
186 static int meta_sp_update_wm(mdsetname_t *sp, md_sp_t *msp,
187     sp_ext_node_t *extlist, md_error_t *ep);
188 static int meta_sp_clear_wm(mdsetname_t *sp, md_sp_t *msp, md_error_t *ep);
189 static int meta_sp_read_wm(mdsetname_t *sp, mdname_t *compnp,
190     mp_watermark_t *wm, sp_ext_offset_t offset,  md_error_t *ep);
191 static diskaddr_t meta_sp_get_start(mdsetname_t *sp, mdname_t *compnp,
192     md_error_t *ep);
193 
194 /* Unit Structure Manipulation Functions */
195 static void meta_sp_fillextarray(mp_unit_t *mp, sp_ext_node_t *extlist);
196 static mp_unit_t *meta_sp_createunit(mdname_t *np, mdname_t *compnp,
197     sp_ext_node_t *extlist, int numexts, sp_ext_length_t len,
198     sp_status_t status, md_error_t *ep);
199 static mp_unit_t *meta_sp_updateunit(mdname_t *np,  mp_unit_t *old_un,
200     sp_ext_node_t *extlist, sp_ext_length_t grow_len, int numexts,
201     md_error_t *ep);
202 static int meta_create_sp(mdsetname_t *sp, md_sp_t *msp, sp_ext_node_t *oblist,
203     mdcmdopts_t options, sp_ext_length_t alignment, md_error_t *ep);
204 static int meta_check_sp(mdsetname_t *sp, md_sp_t *msp, mdcmdopts_t options,
205     int *repart_options, md_error_t *ep);
206 
207 /* Reset (metaclear) Functions */
208 static int meta_sp_reset_common(mdsetname_t *sp, mdname_t *np, md_sp_t *msp,
209     md_sp_reset_t reset_params, mdcmdopts_t options, md_error_t *ep);
210 
211 /* Recovery (metarecover) Functions */
212 static void meta_sp_display_exthdr(void);
213 static void meta_sp_display_ext(sp_ext_node_t *ext);
214 static int meta_sp_checkseq(sp_ext_node_t *extlist);
215 static int meta_sp_resolve_name_conflict(mdsetname_t *, mdname_t *,
216     mdname_t **, md_error_t *);
217 static int meta_sp_validate_wm(mdsetname_t *sp, mdname_t *np,
218     mdcmdopts_t options, md_error_t *ep);
219 static int meta_sp_validate_unit(mdsetname_t *sp, mdname_t *compnp,
220     mdcmdopts_t options, md_error_t *ep);
221 static int meta_sp_validate_wm_and_unit(mdsetname_t *sp, mdname_t *np,
222     mdcmdopts_t options, md_error_t *ep);
223 static int meta_sp_validate_exts(mdname_t *np, sp_ext_node_t *wmext,
224     sp_ext_node_t *unitext, md_error_t *ep);
225 static int meta_sp_recover_from_wm(mdsetname_t *sp, mdname_t *compnp,
226     mdcmdopts_t options, md_error_t *ep);
227 static int meta_sp_recover_from_unit(mdsetname_t *sp, mdname_t *np,
228     mdcmdopts_t options, md_error_t *ep);
229 
230 /*
231  * Private Constants
232  */
233 
234 static const int FORCE_RELOAD_CACHE = 1;
235 static const uint_t NO_FLAGS = 0;
236 static const sp_ext_offset_t NO_OFFSET = 0ULL;
237 static const uint_t NO_SEQUENCE_NUMBER = 0;
238 static const int ONE_SOFT_PARTITION = 1;
239 
240 static unsigned long sp_parent_printed[BT_BITOUL(MD_MAXUNITS)];
241 
242 #define	TEST_SOFT_PARTITION_NAMEP NULL
243 #define	TEST_SETNAMEP NULL
244 
245 #define	EXCLUDE_WM	(1)
246 #define	INCLUDE_WM	(0)
247 
248 #define	SP_UNALIGNED	(0LL)
249 
250 /*
251  * **************************************************************************
252  *                          Debugging Functions                             *
253  * **************************************************************************
254  */
255 
256 /*PRINTFLIKE1*/
257 static void
258 meta_sp_debug(char *format, ...)
259 {
260 	static int debug;
261 	static int debug_set = 0;
262 	va_list ap;
263 
264 	if (!debug_set) {
265 		debug = getenv(META_SP_DEBUG) ? 1 : 0;
266 		debug_set = 1;
267 	}
268 
269 	if (debug) {
270 		va_start(ap, format);
271 		(void) vfprintf(stderr, format, ap);
272 		va_end(ap);
273 	}
274 }
275 
276 static void
277 meta_sp_printunit(mp_unit_t *mp)
278 {
279 	int i;
280 
281 	if (mp == NULL)
282 		return;
283 
284 	/* print the common fields we know about */
285 	(void) fprintf(stderr, "\tmp->c.un_type: %d\n", mp->c.un_type);
286 	(void) fprintf(stderr, "\tmp->c.un_size: %u\n", mp->c.un_size);
287 	(void) fprintf(stderr, "\tmp->c.un_self_id: %lu\n", MD_SID(mp));
288 
289 	/* sp-specific fields */
290 	(void) fprintf(stderr, "\tmp->un_status: %u\n", mp->un_status);
291 	(void) fprintf(stderr, "\tmp->un_numexts: %u\n", mp->un_numexts);
292 	(void) fprintf(stderr, "\tmp->un_length: %llu\n", mp->un_length);
293 	(void) fprintf(stderr, "\tmp->un_dev(32): 0x%llx\n", mp->un_dev);
294 	(void) fprintf(stderr, "\tmp->un_dev(64): 0x%llx\n", mp->un_dev);
295 	(void) fprintf(stderr, "\tmp->un_key: %d\n", mp->un_key);
296 
297 	/* print extent information */
298 	(void) fprintf(stderr, "\tExt#\tvoff\t\tpoff\t\tLen\n");
299 	for (i = 0; i < mp->un_numexts; i++) {
300 		(void) fprintf(stderr, "\t%d\t%llu\t\t%llu\t\t%llu\n", i,
301 		    mp->un_ext[i].un_voff, mp->un_ext[i].un_poff,
302 		    mp->un_ext[i].un_len);
303 	}
304 }
305 
306 /*
307  * FUNCTION:    meta_sp_parsesize()
308  * INPUT:       s       - the string to parse
309  * OUTPUT:      *szp    - disk block count (0 for "all")
310  * RETURNS:     -1 for error, 0 for success
311  * PURPOSE:     parses the command line parameter that specifies the
312  *              requested size of a soft partition.  The input string
313  *              is either the literal "all" or a numeric value
314  *              followed by a single character, b for disk blocks, k
315  *              for kilobytes, m for megabytes, g for gigabytes, or t
316  *              for terabytes.  p for petabytes and e for exabytes
317  *              have been added as undocumented features for future
318  *              expansion.  For example, 100m is 100 megabytes, while
319  *              50g is 50 gigabytes.  All values are rounded up to the
320  *              nearest block size.
321  */
322 int
323 meta_sp_parsesize(char *s, sp_ext_length_t *szp)
324 {
325 	if (s == NULL || szp == NULL) {
326 		return (-1);
327 	}
328 
329 	/* Check for literal "all" */
330 	if (strcasecmp(s, "all") == 0) {
331 		*szp = 0;
332 		return (0);
333 	}
334 
335 	return (meta_sp_parsesizestring(s, szp));
336 }
337 
338 /*
339  * FUNCTION:	meta_sp_parsesizestring()
340  * INPUT:	s	- the string to parse
341  * OUTPUT:	*szp	- disk block count
342  * RETURNS:	-1 for error, 0 for success
343  * PURPOSE:	parses a string that specifies size. The input string is a
344  *		numeric value followed by a single character, b for disk blocks,
345  *		k for kilobytes, m for megabytes, g for gigabytes, or t for
346  *		terabytes.  p for petabytes and e for exabytes have been added
347  *		as undocumented features for future expansion.  For example,
348  *		100m is 100 megabytes, while 50g is 50 gigabytes.  All values
349  *		are rounded up to the nearest block size.
350  */
351 static int
352 meta_sp_parsesizestring(char *s, sp_ext_length_t *szp)
353 {
354 	sp_ext_length_t	len = 0;
355 	char		len_type[2];
356 
357 	if (s == NULL || szp == NULL) {
358 		return (-1);
359 	}
360 
361 	/*
362 	 * make sure block offset does not overflow 2^64 bytes.
363 	 */
364 	if ((sscanf(s, "%llu%1[BbKkMmGgTt]", &len, len_type) != 2) ||
365 	    (len == 0LL) ||
366 	    (len > (1LL << (64 - DEV_BSHIFT))))
367 		return (-1);
368 
369 	switch (len_type[0]) {
370 	case 'B':
371 	case 'b':
372 		len = lbtodb(roundup(len * DEV_BSIZE, DEV_BSIZE));
373 		break;
374 	case 'K':
375 	case 'k':
376 		len = lbtodb(roundup(len * 1024ULL, DEV_BSIZE));
377 		break;
378 	case 'M':
379 	case 'm':
380 		len = lbtodb(roundup(len * 1024ULL*1024ULL, DEV_BSIZE));
381 		break;
382 	case 'g':
383 	case 'G':
384 		len = lbtodb(roundup(len * 1024ULL*1024ULL*1024ULL, DEV_BSIZE));
385 		break;
386 	case 't':
387 	case 'T':
388 		len = lbtodb(roundup(len * 1024ULL*1024ULL*1024ULL*1024ULL,
389 		    DEV_BSIZE));
390 		break;
391 	case 'p':
392 	case 'P':
393 		len = lbtodb(roundup(
394 		    len * 1024ULL*1024ULL*1024ULL*1024ULL*1024ULL,
395 		    DEV_BSIZE));
396 		break;
397 	case 'e':
398 	case 'E':
399 		len = lbtodb(roundup(
400 		    len * 1024ULL*1024ULL*1024ULL*1024ULL*1024ULL*1024ULL,
401 		    DEV_BSIZE));
402 		break;
403 	default:
404 		/* error */
405 		return (-1);
406 	}
407 
408 	*szp = len;
409 	return (0);
410 }
411 
412 /*
413  * FUNCTION:	meta_sp_setgeom()
414  * INPUT:	np      - the underlying device to setup geometry for
415  *		compnp	- the underlying device to setup geometry for
416  *		mp	- the unit structure to set the geometry for
417  * OUTPUT:	ep	- return error pointer
418  * RETURNS:	int	- -1 if error, 0 otherwise
419  * PURPOSE:	establishes geometry information for a device
420  */
421 static int
422 meta_sp_setgeom(
423 	mdname_t	*np,
424 	mdname_t	*compnp,
425 	mp_unit_t	*mp,
426 	md_error_t	*ep
427 )
428 {
429 	mdgeom_t	*geomp;
430 	uint_t		round_cyl = 0;
431 
432 	if ((geomp = metagetgeom(compnp, ep)) == NULL)
433 		return (-1);
434 	if (meta_setup_geom((md_unit_t *)mp, np, geomp, geomp->write_reinstruct,
435 	    geomp->read_reinstruct, round_cyl, ep) != 0)
436 		return (-1);
437 
438 	return (0);
439 }
440 
441 /*
442  * FUNCTION:	meta_sp_setstatus()
443  * INPUT:	sp	- the set name for the devices to set the status on
444  *		minors	- an array of minor numbers of devices to set status on
445  *		num_units - number of entries in the array
446  *		status	- status value to set all units to
447  * OUTPUT:	ep	- return error pointer
448  * RETURNS:	int	- -1 if error, 0 success
449  * PURPOSE:	sets the status of one or more soft partitions to the
450  *		requested value
451  */
452 int
453 meta_sp_setstatus(
454 	mdsetname_t	*sp,
455 	minor_t		*minors,
456 	int		num_units,
457 	sp_status_t	status,
458 	md_error_t	*ep
459 )
460 {
461 	md_sp_statusset_t	status_params;
462 
463 	assert(minors != NULL);
464 
465 	/* update status of all soft partitions to the status passed in */
466 	(void) memset(&status_params, 0, sizeof (status_params));
467 	status_params.num_units = num_units;
468 	status_params.new_status = status;
469 	status_params.size = num_units * sizeof (minor_t);
470 	status_params.minors = (uintptr_t)minors;
471 	MD_SETDRIVERNAME(&status_params, MD_SP, sp->setno);
472 	if (metaioctl(MD_IOC_SPSTATUS, &status_params, &status_params.mde,
473 	    NULL) != 0) {
474 		(void) mdstealerror(ep, &status_params.mde);
475 		return (-1);
476 	}
477 	return (0);
478 }
479 
480 /*
481  * FUNCTION:	meta_get_sp_names()
482  * INPUT:	sp	- the set name to get soft partitions from
483  *		options	- options from the command line
484  * OUTPUT:	nlpp	- list of all soft partition names
485  *		ep	- return error pointer
486  * RETURNS:	int	- -1 if error, 0 success
487  * PURPOSE:	returns a list of all soft partitions in the metadb
488  *		for all devices in the specified set
489  */
490 int
491 meta_get_sp_names(
492 	mdsetname_t	*sp,
493 	mdnamelist_t	**nlpp,
494 	int		options,
495 	md_error_t	*ep
496 )
497 {
498 	return (meta_get_names(MD_SP, sp, nlpp, options, ep));
499 }
500 
501 /*
502  * FUNCTION:	meta_get_by_component()
503  * INPUT:	sp	- the set name to get soft partitions from
504  *		compnp	- the name of the device containing the soft
505  *			  partitions that will be returned
506  *		force	- 0 - reads cached namelist if available,
507  *			  1 - reloads cached namelist, frees old namelist
508  * OUTPUT:	nlpp	- list of all soft partition names
509  *		ep	- return error pointer
510  * RETURNS:	int	- -1 error, otherwise the number of soft partitions
511  *			  found on the component (0 = none found).
512  * PURPOSE:	returns a list of all soft partitions on a given device
513  *		from the metadb information
514  */
515 static int
516 meta_sp_get_by_component(
517 	mdsetname_t	*sp,
518 	mdname_t	*compnp,
519 	mdnamelist_t	**nlpp,
520 	int		force,
521 	md_error_t	*ep
522 )
523 {
524 	static mdnamelist_t	*cached_list = NULL;	/* cached namelist */
525 	static int		cached_count = 0;	/* cached count */
526 	mdnamelist_t		*spnlp = NULL;		/* all sp names */
527 	mdnamelist_t		*namep;			/* list iterator */
528 	mdnamelist_t		**tailpp = nlpp;	/* namelist tail */
529 	mdnamelist_t		**cachetailpp;		/* cache tail */
530 	md_sp_t			*msp;			/* unit structure */
531 	int			count = 0;		/* count of sp's */
532 	int			err;
533 	mdname_t		*curnp;
534 
535 	if ((cached_list != NULL) && (!force)) {
536 		/* return a copy of the cached list */
537 		for (namep = cached_list; namep != NULL; namep = namep->next)
538 			tailpp = meta_namelist_append_wrapper(tailpp,
539 			    namep->namep);
540 		return (cached_count);
541 	}
542 
543 	/* free the cache and reset values to zeros to prepare for a new list */
544 	metafreenamelist(cached_list);
545 	cached_count = 0;
546 	cached_list = NULL;
547 	cachetailpp = &cached_list;
548 	*nlpp = NULL;
549 
550 	/* get all the softpartitions first of all */
551 	if (meta_get_sp_names(sp, &spnlp, 0, ep) < 0)
552 		return (-1);
553 
554 	/*
555 	 * Now for each sp, see if it resides on the component we
556 	 * are interested in, if so then add it to our list
557 	 */
558 	for (namep = spnlp; namep != NULL; namep = namep->next) {
559 		curnp = namep->namep;
560 
561 		/* get the unit structure */
562 		if ((msp = meta_get_sp_common(sp, curnp, 0, ep)) == NULL)
563 			continue;
564 
565 		/*
566 		 * If the current soft partition is not on the same
567 		 * component, continue the search.  If it is on the same
568 		 * component, add it to our namelist.
569 		 */
570 		err = meta_check_samedrive(compnp, msp->compnamep, ep);
571 		if (err <= 0) {
572 			/* not on the same device, check the next one */
573 			continue;
574 		}
575 
576 		/* it's on the same drive */
577 
578 		/*
579 		 * Check for overlapping partitions if the component is not
580 		 * a metadevice.
581 		 */
582 		if (!metaismeta(msp->compnamep)) {
583 			/*
584 			 * if they're on the same drive, neither
585 			 * should be a metadevice if one isn't
586 			 */
587 			assert(!metaismeta(compnp));
588 
589 			if (meta_check_overlap(msp->compnamep->cname,
590 			    compnp, 0, -1, msp->compnamep, 0, -1, ep) == 0)
591 				continue;
592 
593 			/* in this case it's not an error for them to overlap */
594 			mdclrerror(ep);
595 		}
596 
597 		/* Component is on the same device, add to the used list */
598 		tailpp = meta_namelist_append_wrapper(tailpp, curnp);
599 		cachetailpp = meta_namelist_append_wrapper(cachetailpp,
600 		    curnp);
601 
602 		++count;
603 		++cached_count;
604 	}
605 
606 	assert(count == cached_count);
607 	return (count);
608 
609 out:
610 	metafreenamelist(*nlpp);
611 	*nlpp = NULL;
612 	return (-1);
613 }
614 
615 /*
616  * FUNCTION:    meta_sp_get_default_alignment()
617  * INPUT:       sp      - the pertinent set name
618  *              compnp  - the name of the underlying component
619  * OUTPUT:      ep      - return error pointer
620  * RETURNS:     sp_ext_length_t =0: no default alignment
621  *                              >0: default alignment
622  * PURPOSE:     returns the default alignment for soft partitions to
623  *              be built on top of the specified component or
624  *              metadevice
625  */
626 static sp_ext_length_t
627 meta_sp_get_default_alignment(
628 	mdsetname_t	*sp,
629 	mdname_t	*compnp,
630 	md_error_t	*ep
631 )
632 {
633 	sp_ext_length_t	a = SP_UNALIGNED;
634 	char		*mname;
635 
636 	assert(compnp != NULL);
637 
638 	/*
639 	 * We treat raw devices as opaque, and assume nothing about
640 	 * their alignment requirements.
641 	 */
642 	if (!metaismeta(compnp))
643 		return (SP_UNALIGNED);
644 
645 	/*
646 	 * We already know it's a metadevice from the previous test;
647 	 * metagetmiscname() will tell us which metadevice type we
648 	 * have
649 	 */
650 	mname = metagetmiscname(compnp, ep);
651 	if (mname == NULL)
652 		goto out;
653 
654 	/*
655 	 * For a mirror, we want to deal with the stripe that is the
656 	 * primary side.  If it happens to be asymmetrically
657 	 * configured, there is no simple way to fake a universal
658 	 * alignment.  There's a chance that the least common
659 	 * denominator of the set of interlaces from all stripes of
660 	 * all submirrors would do it, but nobody that really cared
661 	 * that much about this issue would create an asymmetric
662 	 * config to start with.
663 	 *
664 	 * If the component underlying the soft partition is a mirror,
665 	 * then at the exit of this loop, compnp will have been
666 	 * updated to describe the first active submirror.
667 	 */
668 	if (strcmp(mname, MD_MIRROR) == 0) {
669 		md_mirror_t	*mp;
670 		int		smi;
671 		md_submirror_t	*smp;
672 
673 		mp = meta_get_mirror(sp, compnp, ep);
674 		if (mp == NULL)
675 			goto out;
676 
677 		for (smi = 0; smi < NMIRROR; smi++) {
678 
679 			smp = &mp->submirrors[smi];
680 			if (smp->state == SMS_UNUSED)
681 				continue;
682 
683 			compnp = smp->submirnamep;
684 			assert(compnp != NULL);
685 
686 			mname = metagetmiscname(compnp, ep);
687 			if (mname == NULL)
688 				goto out;
689 
690 			break;
691 		}
692 
693 		if (smi == NMIRROR)
694 			goto out;
695 	}
696 
697 	/*
698 	 * Handle stripes and submirrors identically; just return the
699 	 * interlace of the first row.
700 	 */
701 	if (strcmp(mname, MD_STRIPE) == 0) {
702 		md_stripe_t	*stp;
703 
704 		stp = meta_get_stripe(sp, compnp, ep);
705 		if (stp == NULL)
706 			goto out;
707 
708 		a = stp->rows.rows_val[0].interlace;
709 		goto out;
710 	}
711 
712 	/*
713 	 * Raid is even more straightforward; the interlace applies to
714 	 * the entire device.
715 	 */
716 	if (strcmp(mname, MD_RAID) == 0) {
717 		md_raid_t	*rp;
718 
719 		rp = meta_get_raid(sp, compnp, ep);
720 		if (rp == NULL)
721 			goto out;
722 
723 		a = rp->interlace;
724 		goto out;
725 	}
726 
727 	/*
728 	 * If we have arrived here with the alignment still not set,
729 	 * then we expect the error to have been set by one of the
730 	 * routines we called.  If neither is the case, something has
731 	 * really gone wrong above.  (Probably the submirror walk
732 	 * failed to produce a valid submirror, but that would be
733 	 * really bad...)
734 	 */
735 out:
736 	meta_sp_debug("meta_sp_get_default_alignment: miscname %s, "
737 	    "alignment %lld\n", (mname == NULL) ? "NULL" : mname, a);
738 
739 	if (getenv(META_SP_DEBUG) && !mdisok(ep)) {
740 		mde_perror(ep, NULL);
741 	}
742 
743 	assert((a > 0) || (!mdisok(ep)));
744 
745 	return (a);
746 }
747 
748 
749 
750 /*
751  * FUNCTION:	meta_check_insp()
752  * INPUT:	sp	- the set name for the device to check
753  *		np	- the name of the device to check
754  *		slblk	- the starting offset of the device to check
755  *		nblks	- the number of blocks in the device to check
756  * OUTPUT:	ep	- return error pointer
757  * RETURNS:	int	-  0 - device contains soft partitions
758  *			  -1 - device does not contain soft partitions
759  * PURPOSE:	determines whether a device contains any soft partitions
760  */
761 /* ARGSUSED */
762 int
763 meta_check_insp(
764 	mdsetname_t	*sp,
765 	mdname_t	*np,
766 	diskaddr_t	slblk,
767 	diskaddr_t	nblks,
768 	md_error_t	*ep
769 )
770 {
771 	mdnamelist_t	*spnlp = NULL;	/* soft partition name list */
772 	int		count;
773 	int		rval;
774 
775 	/* check set pointer */
776 	assert(sp != NULL);
777 
778 	/*
779 	 * Get a list of the soft partitions that currently reside on
780 	 * the component.  We should ALWAYS force reload the cache,
781 	 * because if we're using the md.tab, we must rebuild
782 	 * the list because it won't contain the previous (if any)
783 	 * soft partition.
784 	 */
785 	/* find all soft partitions on the component */
786 	count = meta_sp_get_by_component(sp, np, &spnlp, 1, ep);
787 
788 	if (count == -1) {
789 		rval = -1;
790 	} else if (count > 0) {
791 		rval = mduseerror(ep, MDE_ALREADY, np->dev,
792 		    spnlp->namep->cname, np->cname);
793 	} else {
794 		rval = 0;
795 	}
796 
797 	metafreenamelist(spnlp);
798 	return (rval);
799 }
800 
801 /*
802  * **************************************************************************
803  *                    Extent List Manipulation Functions                    *
804  * **************************************************************************
805  */
806 
807 /*
808  * FUNCTION:	meta_sp_cmp_by_nameseq()
809  * INPUT:	e1	- first node to compare
810  *		e2	- second node to compare
811  * OUTPUT:	none
812  * RETURNS:	int	- =0 - nodes are equal
813  *			  <0 - e1 should go before e2
814  *			  >0 - e1 should go after e2
815  * PURPOSE:	used for sorted list inserts to build a list sorted by
816  *		name first and sequence number second.
817  */
818 static int
819 meta_sp_cmp_by_nameseq(sp_ext_node_t *e1, sp_ext_node_t *e2)
820 {
821 	int rval;
822 
823 	if (e1->ext_namep == NULL)
824 		return (1);
825 	if (e2->ext_namep == NULL)
826 		return (-1);
827 	if ((rval = strcmp(e1->ext_namep->cname, e2->ext_namep->cname)) != 0)
828 		return (rval);
829 
830 	/* the names are equal, compare sequence numbers */
831 	if (e1->ext_seq > e2->ext_seq)
832 		return (1);
833 	if (e1->ext_seq < e2->ext_seq)
834 		return (-1);
835 	/* sequence numbers are also equal */
836 	return (0);
837 }
838 
839 /*
840  * FUNCTION:	meta_sp_cmp_by_offset()
841  * INPUT:	e1	- first node to compare
842  *		e2	- second node to compare
843  * OUTPUT:	none
844  * RETURNS:	int	- =0 - nodes are equal
845  *			  <0 - e1 should go before e2
846  *			  >0 - e1 should go after e2
847  * PURPOSE:	used for sorted list inserts to build a list sorted by offset
848  */
849 static int
850 meta_sp_cmp_by_offset(sp_ext_node_t *e1, sp_ext_node_t *e2)
851 {
852 	if (e1->ext_offset > e2->ext_offset)
853 		return (1);
854 	if (e1->ext_offset < e2->ext_offset)
855 		return (-1);
856 	/* offsets are equal */
857 	return (0);
858 }
859 
860 /*
861  * FUNCTION:	meta_sp_list_insert()
862  * INPUT:	sp	- the set name for the device the node belongs to
863  *		np	- the name of the device the node belongs to
864  *		head	- the head of the list, must be NULL for empty list
865  *		offset	- the physical offset of this extent in sectors
866  *		length	- the length of this extent in sectors
867  *		type	- the type of the extent being inserted
868  *		seq	- the sequence number of the extent being inserted
869  *		flags	- extent flags (eg. whether it needs to be updated)
870  *		compare	- the compare function to use
871  * OUTPUT:	head	- points to the new head if a node was inserted
872  *			  at the beginning
873  * RETURNS:	void
874  * PURPOSE:	inserts an extent node into a sorted doubly linked list.
875  *		The sort order is determined by the compare function.
876  *		Memory is allocated for the node in this function and it
877  *		is up to the caller to free it, possibly using
878  *		meta_sp_list_free().  If a node is inserted at the
879  *		beginning of the list, the head pointer is updated to
880  *		point to the new first node.
881  */
882 static void
883 meta_sp_list_insert(
884 	mdsetname_t	*sp,
885 	mdname_t	*np,
886 	sp_ext_node_t	**head,
887 	sp_ext_offset_t	offset,
888 	sp_ext_length_t	length,
889 	sp_ext_type_t	type,
890 	uint_t		seq,
891 	uint_t		flags,
892 	ext_cmpfunc_t	compare
893 )
894 {
895 	sp_ext_node_t	*newext;
896 	sp_ext_node_t	*curext;
897 
898 	assert(head != NULL);
899 
900 	/* Don't bother adding zero length nodes */
901 	if (length == 0ULL)
902 		return;
903 
904 	/* allocate and fill in new ext_node */
905 	newext = Zalloc(sizeof (sp_ext_node_t));
906 
907 	newext->ext_offset = offset;
908 	newext->ext_length = length;
909 	newext->ext_flags = flags;
910 	newext->ext_type = type;
911 	newext->ext_seq = seq;
912 	newext->ext_setp = sp;
913 	newext->ext_namep = np;
914 
915 	/* first node in the list */
916 	if (*head == NULL) {
917 		newext->ext_next = newext->ext_prev = NULL;
918 		*head = newext;
919 	} else if ((*compare)(*head, newext) >= 0) {
920 		/* the first node has a bigger offset, so insert before it */
921 		assert((*head)->ext_prev == NULL);
922 
923 		newext->ext_prev = NULL;
924 		newext->ext_next = *head;
925 		(*head)->ext_prev = newext;
926 		*head = newext;
927 	} else {
928 		/*
929 		 * find the next node whose offset is greater than
930 		 * the one we want to insert, or the end of the list.
931 		 */
932 		for (curext = *head;
933 		    (curext->ext_next != NULL) &&
934 		    ((*compare)(curext->ext_next, newext) < 0);
935 		    (curext = curext->ext_next))
936 			;
937 
938 		/* link the new node in after the current node */
939 		newext->ext_next = curext->ext_next;
940 		newext->ext_prev = curext;
941 
942 		if (curext->ext_next != NULL)
943 			curext->ext_next->ext_prev = newext;
944 
945 		curext->ext_next = newext;
946 	}
947 }
948 
949 /*
950  * FUNCTION:	meta_sp_list_free()
951  * INPUT:	head	- the head of the list, must be NULL for empty list
952  * OUTPUT:	head	- points to NULL on return
953  * RETURNS:	void
954  * PURPOSE:	walks a double linked extent list and frees each node
955  */
956 static void
957 meta_sp_list_free(sp_ext_node_t **head)
958 {
959 	sp_ext_node_t	*ext;
960 	sp_ext_node_t	*next;
961 
962 	assert(head != NULL);
963 
964 	ext = *head;
965 	while (ext) {
966 		next = ext->ext_next;
967 		Free(ext);
968 		ext = next;
969 	}
970 	*head = NULL;
971 }
972 
973 /*
974  * FUNCTION:	meta_sp_list_remove()
975  * INPUT:	head	- the head of the list, must be NULL for empty list
976  *		ext	- the extent to remove, must be a member of the list
977  * OUTPUT:	head	- points to the new head of the list
978  * RETURNS:	void
979  * PURPOSE:	unlinks the node specified by ext from the list and
980  *		frees it, possibly moving the head pointer forward if
981  *		the head is the node being removed.
982  */
983 static void
984 meta_sp_list_remove(sp_ext_node_t **head, sp_ext_node_t *ext)
985 {
986 	assert(head != NULL);
987 	assert(*head != NULL);
988 
989 	if (*head == ext)
990 		*head = ext->ext_next;
991 
992 	if (ext->ext_prev != NULL)
993 		ext->ext_prev->ext_next = ext->ext_next;
994 	if (ext->ext_next != NULL)
995 		ext->ext_next->ext_prev = ext->ext_prev;
996 	Free(ext);
997 }
998 
999 /*
1000  * FUNCTION:	meta_sp_list_size()
1001  * INPUT:	head	- the head of the list, must be NULL for empty list
1002  *		exttype	- the type of the extents to sum
1003  *		exclude_wm - subtract space for extent headers from total
1004  * OUTPUT:	none
1005  * RETURNS:	sp_ext_length_t	- the sum of all of the lengths
1006  * PURPOSE:	sums the lengths of all extents in the list matching the
1007  *		specified type.  This could be used for computing the
1008  *		amount of free or used space, for example.
1009  */
1010 static sp_ext_length_t
1011 meta_sp_list_size(sp_ext_node_t *head, sp_ext_type_t exttype, int exclude_wm)
1012 {
1013 	sp_ext_node_t	*ext;
1014 	sp_ext_length_t	size = 0LL;
1015 
1016 	for (ext = head; ext != NULL; ext = ext->ext_next)
1017 		if (ext->ext_type == exttype)
1018 			size += ext->ext_length -
1019 			    ((exclude_wm) ? MD_SP_WMSIZE : 0);
1020 
1021 	return (size);
1022 }
1023 
1024 /*
1025  * FUNCTION:	meta_sp_list_find()
1026  * INPUT:	head	- the head of the list, must be NULL for empty list
1027  *		offset	- the offset contained by the node to find
1028  * OUTPUT:	none
1029  * RETURNS:	sp_ext_node_t *	- the node containing the requested offset
1030  *				  or NULL if no such nodes were found.
1031  * PURPOSE:	finds a node in a list containing the requested offset
1032  *		(inclusive).  If multiple nodes contain this offset then
1033  *		only the first will be returned, though typically these
1034  *		lists are managed with non-overlapping nodes.
1035  *
1036  *		*The list MUST be sorted by offset for this function to work.*
1037  */
1038 static sp_ext_node_t *
1039 meta_sp_list_find(
1040 	sp_ext_node_t	*head,
1041 	sp_ext_offset_t	offset
1042 )
1043 {
1044 	sp_ext_node_t	*ext;
1045 
1046 	for (ext = head; ext != NULL; ext = ext->ext_next) {
1047 		/* check if the offset lies within this extent */
1048 		if ((offset >= ext->ext_offset) &&
1049 		    (offset < ext->ext_offset + ext->ext_length)) {
1050 			/*
1051 			 * the requested extent should always be a
1052 			 * subset of an extent in the list.
1053 			 */
1054 			return (ext);
1055 		}
1056 	}
1057 	return (NULL);
1058 }
1059 
1060 /*
1061  * FUNCTION:	meta_sp_list_freefill()
1062  * INPUT:	head	- the head of the list, must be NULL for empty list
1063  *		size	- the size of the volume this extent list is
1064  *			  representing
1065  * OUTPUT:	head	- the new head of the list
1066  * RETURNS:	void
1067  * PURPOSE:	finds gaps in the extent list and fills them with a free
1068  *		node.  If there is a gap at the beginning the head
1069  *		pointer will be changed to point to the new free node.
1070  *		If there is free space at the end, the last free extent
1071  *		will extend all the way out to the size specified.
1072  *
1073  *		*The list MUST be sorted by offset for this function to work.*
1074  */
1075 static void
1076 meta_sp_list_freefill(
1077 	sp_ext_node_t	**head,
1078 	sp_ext_length_t	size
1079 )
1080 {
1081 	sp_ext_node_t	*ext;
1082 	sp_ext_offset_t	curoff = 0LL;
1083 
1084 	for (ext = *head; ext != NULL; ext = ext->ext_next) {
1085 		if (curoff < ext->ext_offset)
1086 			meta_sp_list_insert(NULL, NULL, head,
1087 			    curoff, ext->ext_offset - curoff,
1088 			    EXTTYP_FREE, 0, 0, meta_sp_cmp_by_offset);
1089 		curoff = ext->ext_offset + ext->ext_length;
1090 	}
1091 
1092 	/* pad inverse list out to the end */
1093 	if (curoff < size)
1094 		meta_sp_list_insert(NULL, NULL, head, curoff, size - curoff,
1095 		    EXTTYP_FREE, 0, 0, meta_sp_cmp_by_offset);
1096 
1097 	if (getenv(META_SP_DEBUG)) {
1098 		meta_sp_debug("meta_sp_list_freefill: Extent list with "
1099 		    "holes freefilled:\n");
1100 		meta_sp_list_dump(*head);
1101 	}
1102 }
1103 
1104 /*
1105  * FUNCTION:	meta_sp_list_dump()
1106  * INPUT:	head	- the head of the list, must be NULL for empty list
1107  * OUTPUT:	none
1108  * RETURNS:	void
1109  * PURPOSE:	dumps the entire extent list to stdout for easy debugging
1110  */
1111 static void
1112 meta_sp_list_dump(sp_ext_node_t *head)
1113 {
1114 	sp_ext_node_t	*ext;
1115 
1116 	meta_sp_debug("meta_sp_list_dump: dumping extent list:\n");
1117 	meta_sp_debug("%5s %10s %5s %7s %10s %10s %5s %10s %10s\n", "Name",
1118 	    "Addr", "Seq#", "Type", "Offset", "Length", "Flags", "Prev",
1119 	    "Next");
1120 	for (ext = head; ext != NULL; ext = ext->ext_next) {
1121 		if (ext->ext_namep != NULL)
1122 			meta_sp_debug("%5s", ext->ext_namep->cname);
1123 		else
1124 			meta_sp_debug("%5s", "NONE");
1125 
1126 		meta_sp_debug("%10p %5u ", (void *) ext, ext->ext_seq);
1127 		switch (ext->ext_type) {
1128 		case EXTTYP_ALLOC:
1129 			meta_sp_debug("%7s ", "ALLOC");
1130 			break;
1131 		case EXTTYP_FREE:
1132 			meta_sp_debug("%7s ", "FREE");
1133 			break;
1134 		case EXTTYP_END:
1135 			meta_sp_debug("%7s ", "END");
1136 			break;
1137 		case EXTTYP_RESERVED:
1138 			meta_sp_debug("%7s ", "RESV");
1139 			break;
1140 		default:
1141 			meta_sp_debug("%7s ", "INVLD");
1142 			break;
1143 		}
1144 
1145 		meta_sp_debug("%10llu %10llu %5u %10p %10p\n",
1146 		    ext->ext_offset, ext->ext_length,
1147 		    ext->ext_flags, (void *) ext->ext_prev,
1148 		    (void *) ext->ext_next);
1149 	}
1150 	meta_sp_debug("\n");
1151 }
1152 
1153 /*
1154  * FUNCTION:	meta_sp_list_overlaps()
1155  * INPUT:	head	- the head of the list, must be NULL for empty list
1156  * OUTPUT:	none
1157  * RETURNS:	int	- 1 if extents overlap, 0 if ok
1158  * PURPOSE:	checks a list for overlaps.  The list MUST be sorted by
1159  *		offset for this function to work properly.
1160  */
1161 static int
1162 meta_sp_list_overlaps(sp_ext_node_t *head)
1163 {
1164 	sp_ext_node_t	*ext;
1165 
1166 	for (ext = head; ext->ext_next != NULL; ext = ext->ext_next) {
1167 		if (ext->ext_offset + ext->ext_length >
1168 		    ext->ext_next->ext_offset)
1169 			return (1);
1170 	}
1171 	return (0);
1172 }
1173 
1174 /*
1175  * **************************************************************************
1176  *                        Extent Allocation Functions                       *
1177  * **************************************************************************
1178  */
1179 
1180 /*
1181  * FUNCTION:	meta_sp_alloc_by_ext()
1182  * INPUT:	sp	- the set name for the device the node belongs to
1183  *		np	- the name of the device the node belongs to
1184  *		head	- the head of the list, must be NULL for empty list
1185  *		free_ext	- the free extent being allocated from
1186  *		alloc_offset	- the offset of the allocation
1187  *		alloc_len	- the length of the allocation
1188  *		seq		- the sequence number of the allocation
1189  * OUTPUT:	head	- the new head pointer
1190  * RETURNS:	void
1191  * PURPOSE:	allocates a portion of the free extent free_ext.  The
1192  *		allocated portion starts at alloc_offset and is
1193  *		alloc_length long.  Both (alloc_offset) and (alloc_offset +
1194  *		alloc_length) must be contained within the free extent.
1195  *
1196  *		The free extent is split into as many as 3 pieces - a
1197  *		free extent containing [ free_offset .. alloc_offset ), an
1198  *		allocated extent containing the range [ alloc_offset ..
1199  *		alloc_end ], and another free extent containing the
1200  *		range ( alloc_end .. free_end ].  If either of the two
1201  *		new free extents would be zero length, they are not created.
1202  *
1203  *		Finally, the original free extent is removed.  All newly
1204  *		created extents have the EXTFLG_UPDATE flag set.
1205  */
1206 static void
1207 meta_sp_alloc_by_ext(
1208 	mdsetname_t	*sp,
1209 	mdname_t	*np,
1210 	sp_ext_node_t	**head,
1211 	sp_ext_node_t	*free_ext,
1212 	sp_ext_offset_t	alloc_offset,
1213 	sp_ext_length_t	alloc_length,
1214 	uint_t		seq
1215 )
1216 {
1217 	sp_ext_offset_t	free_offset = free_ext->ext_offset;
1218 	sp_ext_length_t	free_length = free_ext->ext_length;
1219 
1220 	sp_ext_offset_t	alloc_end = alloc_offset + alloc_length;
1221 	sp_ext_offset_t	free_end  = free_offset  + free_length;
1222 
1223 	/* allocated extent must be a subset of the free extent */
1224 	assert(free_offset <= alloc_offset);
1225 	assert(free_end >= alloc_end);
1226 
1227 	meta_sp_list_remove(head, free_ext);
1228 
1229 	if (free_offset < alloc_offset) {
1230 		meta_sp_list_insert(NULL, NULL, head, free_offset,
1231 		    (alloc_offset - free_offset), EXTTYP_FREE, 0,
1232 		    EXTFLG_UPDATE, meta_sp_cmp_by_offset);
1233 	}
1234 
1235 	if (free_end > alloc_end) {
1236 		meta_sp_list_insert(NULL, NULL, head, alloc_end,
1237 		    (free_end - alloc_end), EXTTYP_FREE, 0, EXTFLG_UPDATE,
1238 		    meta_sp_cmp_by_offset);
1239 	}
1240 
1241 	meta_sp_list_insert(sp, np, head, alloc_offset, alloc_length,
1242 	    EXTTYP_ALLOC, seq, EXTFLG_UPDATE, meta_sp_cmp_by_offset);
1243 
1244 	if (getenv(META_SP_DEBUG)) {
1245 		meta_sp_debug("meta_sp_alloc_by_ext: extent list:\n");
1246 		meta_sp_list_dump(*head);
1247 	}
1248 }
1249 
1250 /*
1251  * FUNCTION:	meta_sp_alloc_by_len()
1252  * INPUT:	sp	- the set name for the device the node belongs to
1253  *		np	- the name of the device the node belongs to
1254  *		head	- the head of the list, must be NULL for empty list
1255  *		*lp	- the requested length to allocate
1256  *		last_off	- the last offset already allocated.
1257  *		alignment	- the desired extent alignmeent
1258  * OUTPUT:	head	- the new head pointer
1259  *		*lp	- the length allocated
1260  * RETURNS:	int	- -1 if error, the number of new extents on success
1261  * PURPOSE:	allocates extents from free space to satisfy the requested
1262  *		length.  If requested length is zero, allocates all
1263  *		remaining free space.  This function provides the meat
1264  *		of the extent allocation algorithm.  Allocation is a
1265  *		three tier process:
1266  *
1267  *		1. If last_off is nonzero and there is free space following
1268  *		   that node, then it is extended to allocate as much of that
1269  *		   free space as possible.  This is useful for metattach.
1270  *		2. If a free extent can be found to satisfy the remaining
1271  *		   requested space, then satisfy the rest of the request
1272  *		   from that extent.
1273  *		3. Start allocating space from any remaining free extents until
1274  *		   the remainder of the request is satisified.
1275  *
1276  *              If alignment is non-zero, then every extent modified
1277  *              or newly allocated will be aligned modulo alignment,
1278  *              with a length that is an integer multiple of
1279  *              alignment.
1280  *
1281  *		The EXTFLG_UPDATE flag is set for all nodes (free and
1282  *		allocated) that require updated watermarks.
1283  *
1284  *		This algorithm may have a negative impact on fragmentation
1285  *		in pathological cases and may be improved if it turns out
1286  *		to be a problem.  This may be exacerbated by particularly
1287  *		large alignments.
1288  *
1289  * NOTE:	It's confusing, so it demands an explanation:
1290  *		- len is used to represent requested data space; it
1291  *		  does not include room for a watermark.  On each full
1292  *		  or partial allocation, len will be decremented by
1293  *		  alloc_len (see next paragraph) until it reaches
1294  *		  zero.
1295  *		- alloc_len is used to represent data space allocated
1296  *		  from a particular extent; it does not include space
1297  *		  for a watermark.  In the rare event that a_length
1298  *		  (see next paragraph) is equal to MD_SP_WMSIZE,
1299  *		  alloc_len will be zero and the resulting MD_SP_WMSIZE
1300  *		  fragment of space will be utterly unusable.
1301  *		- a_length is used to represent all space to be
1302  *		  allocated from a particular extent; it DOES include
1303  *		  space for a watermark.
1304  */
1305 static int
1306 meta_sp_alloc_by_len(
1307 	mdsetname_t	*sp,
1308 	mdname_t	*np,
1309 	sp_ext_node_t	**head,
1310 	sp_ext_length_t	*lp,
1311 	sp_ext_offset_t	last_off,
1312 	sp_ext_offset_t	alignment
1313 )
1314 {
1315 	sp_ext_node_t	*free_ext;
1316 	sp_ext_node_t	*alloc_ext;
1317 	uint_t		last_seq = 0;
1318 	uint_t		numexts = 0;
1319 	sp_ext_length_t	freespace;
1320 	sp_ext_length_t	alloc_len;
1321 	sp_ext_length_t	len;
1322 
1323 	/* We're DOA if we can't read *lp */
1324 	assert(lp != NULL);
1325 	len = *lp;
1326 
1327 	/*
1328 	 * Process the nominal case first: we've been given an actual
1329 	 * size argument, rather than the literal "all"
1330 	 */
1331 
1332 	if (len != 0) {
1333 
1334 		/*
1335 		 * Short circuit the check for free space.  This may
1336 		 * tell us we have enough space when we really don't
1337 		 * because each extent loses space to a watermark, but
1338 		 * it will always tell us there isn't enough space
1339 		 * correctly.  Worst case we do some extra work.
1340 		 */
1341 		freespace = meta_sp_list_size(*head, EXTTYP_FREE,
1342 		    INCLUDE_WM);
1343 
1344 		if (freespace < len)
1345 			return (-1);
1346 
1347 		/*
1348 		 * First see if we can extend the last extent for an
1349 		 * attach.
1350 		 */
1351 		if (last_off != 0LL) {
1352 			int align = 0;
1353 
1354 			alloc_ext =
1355 			    meta_sp_list_find(*head, last_off);
1356 			assert(alloc_ext != NULL);
1357 
1358 			/*
1359 			 * The offset test reflects the
1360 			 * inclusion of the watermark in the extent
1361 			 */
1362 			align = (alignment > 0) &&
1363 			    (((alloc_ext->ext_offset + MD_SP_WMSIZE) %
1364 			    alignment) == 0);
1365 
1366 			/*
1367 			 * If we decided not to align here, we should
1368 			 * also reset "alignment" so we don't bother
1369 			 * later, either.
1370 			 */
1371 			if (!align) {
1372 				alignment = 0;
1373 			}
1374 
1375 			last_seq = alloc_ext->ext_seq;
1376 
1377 			free_ext = meta_sp_list_find(*head,
1378 			    alloc_ext->ext_offset +
1379 			    alloc_ext->ext_length);
1380 
1381 			/*
1382 			 * If a free extent follows our last allocated
1383 			 * extent, then remove the last allocated
1384 			 * extent and increase the size of the free
1385 			 * extent to overlap it, then allocate the
1386 			 * total space from the new free extent.
1387 			 */
1388 			if (free_ext != NULL &&
1389 			    free_ext->ext_type == EXTTYP_FREE) {
1390 				assert(free_ext->ext_offset ==
1391 				    alloc_ext->ext_offset +
1392 				    alloc_ext->ext_length);
1393 
1394 				alloc_len =
1395 				    MIN(len, free_ext->ext_length);
1396 
1397 				if (align && (alloc_len < len)) {
1398 					/* No watermark space needed */
1399 					alloc_len -= alloc_len % alignment;
1400 				}
1401 
1402 				if (alloc_len > 0) {
1403 					free_ext->ext_offset -=
1404 					    alloc_ext->ext_length;
1405 					free_ext->ext_length +=
1406 					    alloc_ext->ext_length;
1407 
1408 					meta_sp_alloc_by_ext(sp, np, head,
1409 					    free_ext, free_ext->ext_offset,
1410 					    alloc_ext->ext_length + alloc_len,
1411 					    last_seq);
1412 
1413 					/*
1414 					 * now remove the original allocated
1415 					 * node.  We may have overlapping
1416 					 * extents for a short time before
1417 					 * this node is removed.
1418 					 */
1419 					meta_sp_list_remove(head, alloc_ext);
1420 					len -= alloc_len;
1421 				}
1422 			}
1423 			last_seq++;
1424 		}
1425 
1426 		if (len == 0LL)
1427 			goto out;
1428 
1429 		/*
1430 		 * Next, see if we can find a single allocation for
1431 		 * the remainder.  This may make fragmentation worse
1432 		 * in some cases, but there's no good way to allocate
1433 		 * that doesn't have a highly fragmented corner case.
1434 		 */
1435 		for (free_ext = *head; free_ext != NULL;
1436 		    free_ext = free_ext->ext_next) {
1437 			sp_ext_offset_t	a_offset;
1438 			sp_ext_offset_t	a_length;
1439 
1440 			if (free_ext->ext_type != EXTTYP_FREE)
1441 				continue;
1442 
1443 			/*
1444 			 * The length test should include space for
1445 			 * the watermark
1446 			 */
1447 
1448 			a_offset = free_ext->ext_offset;
1449 			a_length = free_ext->ext_length;
1450 
1451 			if (alignment > 0) {
1452 
1453 				/*
1454 				 * Shortcut for extents that have been
1455 				 * previously added to pad out the
1456 				 * data space
1457 				 */
1458 				if (a_length < alignment) {
1459 					continue;
1460 				}
1461 
1462 				/*
1463 				 * Round up so the data space begins
1464 				 * on a properly aligned boundary.
1465 				 */
1466 				a_offset += alignment -
1467 				    (a_offset % alignment) - MD_SP_WMSIZE;
1468 
1469 				/*
1470 				 * This is only necessary in case the
1471 				 * watermark size is ever greater than
1472 				 * one.  It'll never happen, of
1473 				 * course; we'll get rid of watermarks
1474 				 * before we make 'em bigger.
1475 				 */
1476 				if (a_offset < free_ext->ext_offset) {
1477 					a_offset += alignment;
1478 				}
1479 
1480 				/*
1481 				 * Adjust the length to account for
1482 				 * the space lost above (if any)
1483 				 */
1484 				a_length -=
1485 				    (a_offset - free_ext->ext_offset);
1486 			}
1487 
1488 			if (a_length >= len + MD_SP_WMSIZE) {
1489 				meta_sp_alloc_by_ext(sp, np, head,
1490 				    free_ext, a_offset,
1491 				    len + MD_SP_WMSIZE, last_seq);
1492 
1493 				len = 0LL;
1494 				numexts++;
1495 				break;
1496 			}
1497 		}
1498 
1499 		if (len == 0LL)
1500 			goto out;
1501 
1502 
1503 		/*
1504 		 * If the request could not be satisfied by extending
1505 		 * the last extent or by a single extent, then put
1506 		 * multiple smaller extents together until the request
1507 		 * is satisfied.
1508 		 */
1509 		for (free_ext = *head; (free_ext != NULL) && (len > 0);
1510 		    free_ext = free_ext->ext_next) {
1511 			sp_ext_offset_t a_offset;
1512 			sp_ext_length_t a_length;
1513 
1514 			if (free_ext->ext_type != EXTTYP_FREE)
1515 				continue;
1516 
1517 			a_offset = free_ext->ext_offset;
1518 			a_length = free_ext->ext_length;
1519 
1520 			if (alignment > 0) {
1521 
1522 				/*
1523 				 * Shortcut for extents that have been
1524 				 * previously added to pad out the
1525 				 * data space
1526 				 */
1527 				if (a_length < alignment) {
1528 					continue;
1529 				}
1530 
1531 				/*
1532 				 * Round up so the data space begins
1533 				 * on a properly aligned boundary.
1534 				 */
1535 				a_offset += alignment -
1536 				    (a_offset % alignment) - MD_SP_WMSIZE;
1537 
1538 				/*
1539 				 * This is only necessary in case the
1540 				 * watermark size is ever greater than
1541 				 * one.  It'll never happen, of
1542 				 * course; we'll get rid of watermarks
1543 				 * before we make 'em bigger.
1544 				 */
1545 				if (a_offset < free_ext->ext_offset) {
1546 					a_offset += alignment;
1547 				}
1548 
1549 				/*
1550 				 * Adjust the length to account for
1551 				 * the space lost above (if any)
1552 				 */
1553 				a_length -=
1554 				    (a_offset - free_ext->ext_offset);
1555 
1556 				/*
1557 				 * Adjust the length to be properly
1558 				 * aligned if it is NOT to be the
1559 				 * last extent in the soft partition.
1560 				 */
1561 				if ((a_length - MD_SP_WMSIZE) < len)
1562 					a_length -=
1563 					    (a_length - MD_SP_WMSIZE)
1564 					    % alignment;
1565 			}
1566 
1567 			alloc_len = MIN(len, a_length - MD_SP_WMSIZE);
1568 			if (alloc_len == 0)
1569 				continue;
1570 
1571 			/*
1572 			 * meta_sp_alloc_by_ext() expects the
1573 			 * allocation length to include the watermark
1574 			 * size, which is why we don't simply pass in
1575 			 * alloc_len here.
1576 			 */
1577 			meta_sp_alloc_by_ext(sp, np, head, free_ext,
1578 			    a_offset, MIN(len + MD_SP_WMSIZE, a_length),
1579 			    last_seq);
1580 
1581 			len -= alloc_len;
1582 			numexts++;
1583 			last_seq++;
1584 		}
1585 
1586 
1587 		/*
1588 		 * If there was not enough space we can throw it all
1589 		 * away since no real work has been done yet.
1590 		 */
1591 		if (len != 0) {
1592 			meta_sp_list_free(head);
1593 			return (-1);
1594 		}
1595 	}
1596 
1597 	/*
1598 	 * Otherwise, the literal "all" was specified: allocate all
1599 	 * available free space.  Don't bother with alignment.
1600 	 */
1601 	else {
1602 		/* First, extend the last extent if this is a grow */
1603 		if (last_off != 0LL) {
1604 			alloc_ext =
1605 			    meta_sp_list_find(*head, last_off);
1606 			assert(alloc_ext != NULL);
1607 
1608 			last_seq = alloc_ext->ext_seq;
1609 
1610 			free_ext = meta_sp_list_find(*head,
1611 			    alloc_ext->ext_offset +
1612 			    alloc_ext->ext_length);
1613 
1614 			/*
1615 			 * If a free extent follows our last allocated
1616 			 * extent, then remove the last allocated
1617 			 * extent and increase the size of the free
1618 			 * extent to overlap it, then allocate the
1619 			 * total space from the new free extent.
1620 			 */
1621 			if (free_ext != NULL &&
1622 			    free_ext->ext_type == EXTTYP_FREE) {
1623 				assert(free_ext->ext_offset ==
1624 				    alloc_ext->ext_offset +
1625 				    alloc_ext->ext_length);
1626 
1627 				len = alloc_len =
1628 				    free_ext->ext_length;
1629 
1630 				free_ext->ext_offset -=
1631 				    alloc_ext->ext_length;
1632 				free_ext->ext_length +=
1633 				    alloc_ext->ext_length;
1634 
1635 				meta_sp_alloc_by_ext(sp, np, head,
1636 				    free_ext, free_ext->ext_offset,
1637 				    alloc_ext->ext_length + alloc_len,
1638 				    last_seq);
1639 
1640 				/*
1641 				 * now remove the original allocated
1642 				 * node.  We may have overlapping
1643 				 * extents for a short time before
1644 				 * this node is removed.
1645 				 */
1646 				meta_sp_list_remove(head, alloc_ext);
1647 			}
1648 
1649 			last_seq++;
1650 		}
1651 
1652 		/* Next, grab all remaining free space */
1653 		for (free_ext = *head; free_ext != NULL;
1654 		    free_ext = free_ext->ext_next) {
1655 
1656 			if (free_ext->ext_type == EXTTYP_FREE) {
1657 				alloc_len =
1658 				    free_ext->ext_length - MD_SP_WMSIZE;
1659 				if (alloc_len == 0)
1660 					continue;
1661 
1662 				/*
1663 				 * meta_sp_alloc_by_ext() expects the
1664 				 * allocation length to include the
1665 				 * watermark size, which is why we
1666 				 * don't simply pass in alloc_len
1667 				 * here.
1668 				 */
1669 				meta_sp_alloc_by_ext(sp, np, head,
1670 				    free_ext, free_ext->ext_offset,
1671 				    free_ext->ext_length,
1672 				    last_seq);
1673 
1674 				len += alloc_len;
1675 				numexts++;
1676 				last_seq++;
1677 			}
1678 		}
1679 	}
1680 
1681 out:
1682 	if (getenv(META_SP_DEBUG)) {
1683 		meta_sp_debug("meta_sp_alloc_by_len: Extent list after "
1684 		    "allocation:\n");
1685 		meta_sp_list_dump(*head);
1686 	}
1687 
1688 	if (*lp == 0) {
1689 		*lp = len;
1690 
1691 		/*
1692 		 * Make sure the callers hit a no space error if we
1693 		 * didn't actually find anything.
1694 		 */
1695 		if (len == 0) {
1696 			return (-1);
1697 		}
1698 	}
1699 
1700 	return (numexts);
1701 }
1702 
1703 /*
1704  * FUNCTION:	meta_sp_alloc_by_list()
1705  * INPUT:	sp	- the set name for the device the node belongs to
1706  *		np	- the name of the device the node belongs to
1707  *		head	- the head of the list, must be NULL for empty list
1708  *		oblist	- an extent list containing requested nodes to allocate
1709  * OUTPUT:	head	- the new head pointer
1710  * RETURNS:	int	- -1 if error, the number of new extents on success
1711  * PURPOSE:	allocates extents from free space to satisfy the requested
1712  *		extent list.  This is primarily used for the -o/-b options
1713  *		where the user may specifically request extents to allocate.
1714  *		Each extent in the oblist must be a subset (inclusive) of a
1715  *		free extent and may not overlap each other.  This
1716  *		function sets the EXTFLG_UPDATE flag for each node that
1717  *		requires a watermark update after allocating.
1718  */
1719 static int
1720 meta_sp_alloc_by_list(
1721 	mdsetname_t	*sp,
1722 	mdname_t	*np,
1723 	sp_ext_node_t	**head,
1724 	sp_ext_node_t	*oblist
1725 )
1726 {
1727 	sp_ext_node_t	*ext;
1728 	sp_ext_node_t	*free_ext;
1729 	uint_t		numexts = 0;
1730 
1731 	for (ext = oblist; ext != NULL; ext = ext->ext_next) {
1732 
1733 		free_ext = meta_sp_list_find(*head,
1734 		    ext->ext_offset - MD_SP_WMSIZE);
1735 
1736 		/* Make sure the allocation is within the free extent */
1737 		if ((free_ext == NULL) ||
1738 		    (ext->ext_offset + ext->ext_length >
1739 		    free_ext->ext_offset + free_ext->ext_length) ||
1740 		    (free_ext->ext_type != EXTTYP_FREE))
1741 			return (-1);
1742 
1743 		meta_sp_alloc_by_ext(sp, np, head, free_ext,
1744 		    ext->ext_offset - MD_SP_WMSIZE,
1745 		    ext->ext_length + MD_SP_WMSIZE, ext->ext_seq);
1746 
1747 		numexts++;
1748 	}
1749 
1750 	assert(meta_sp_list_overlaps(*head) == 0);
1751 
1752 	if (getenv(META_SP_DEBUG)) {
1753 		meta_sp_debug("meta_sp_alloc_by_list: Extent list after "
1754 		    "allocation:\n");
1755 		meta_sp_list_dump(*head);
1756 	}
1757 
1758 	return (numexts);
1759 }
1760 
1761 /*
1762  * **************************************************************************
1763  *                     Extent List Population Functions                     *
1764  * **************************************************************************
1765  */
1766 
1767 /*
1768  * FUNCTION:	meta_sp_extlist_from_namelist()
1769  * INPUT:	sp	- the set name for the device the node belongs to
1770  *		spnplp	- the namelist of soft partitions to build a list from
1771  * OUTPUT:	extlist	- the extent list built from the SPs in the namelist
1772  *		ep	- return error pointer
1773  * RETURNS:	int	- -1 if error, 0 on success
1774  * PURPOSE:	builds an extent list representing the soft partitions
1775  *		specified in the namelist.  Each extent in each soft
1776  *		partition is added to the list with the type EXTTYP_ALLOC.
1777  *		The EXTFLG_UPDATE flag is not set on any nodes.  Each
1778  *		extent in the list includes the space occupied by the
1779  *		watermark, which is not included in the unit structures.
1780  */
1781 static int
1782 meta_sp_extlist_from_namelist(
1783 	mdsetname_t	*sp,
1784 	mdnamelist_t	*spnlp,
1785 	sp_ext_node_t	**extlist,
1786 	md_error_t	*ep
1787 )
1788 {
1789 	int		extn;
1790 	md_sp_t		*msp;		/* unit structure of the sp's */
1791 	mdnamelist_t	*namep;
1792 
1793 	assert(sp != NULL);
1794 
1795 	/*
1796 	 * Now go through the soft partitions and add a node to the used
1797 	 * list for each allocated extent.
1798 	 */
1799 	for (namep = spnlp; namep != NULL; namep = namep->next) {
1800 		mdname_t	*curnp = namep->namep;
1801 
1802 		/* get the unit structure */
1803 		if ((msp = meta_get_sp_common(sp, curnp, 0, ep)) == NULL)
1804 			return (-1);
1805 
1806 		for (extn = 0; (extn < msp->ext.ext_len); extn++) {
1807 			md_sp_ext_t	*extp = &msp->ext.ext_val[extn];
1808 
1809 			/*
1810 			 * subtract from offset and add to the length
1811 			 * to account for the watermark, which is not
1812 			 * contained in the extents in the unit structure.
1813 			 */
1814 			meta_sp_list_insert(sp, curnp, extlist,
1815 			    extp->poff - MD_SP_WMSIZE, extp->len + MD_SP_WMSIZE,
1816 			    EXTTYP_ALLOC, extn, 0, meta_sp_cmp_by_offset);
1817 		}
1818 	}
1819 	return (0);
1820 }
1821 
1822 /*
1823  * FUNCTION:	meta_sp_extlist_from_wm()
1824  * INPUT:	sp	- the set name for the device the node belongs to
1825  *		compnp	- the name of the device to scan watermarks on
1826  * OUTPUT:	extlist	- the extent list built from the SPs in the namelist
1827  *		ep	- return error pointer
1828  * RETURNS:	int	- -1 if error, 0 on success
1829  * PURPOSE:	builds an extent list representing the soft partitions
1830  *		specified in the namelist.  Each extent in each soft
1831  *		partition is added to the list with the type EXTTYP_ALLOC.
1832  *		The EXTFLG_UPDATE flag is not set on any nodes.  Each
1833  *		extent in the list includes the space occupied by the
1834  *		watermark, which is not included in the unit structures.
1835  */
1836 static int
1837 meta_sp_extlist_from_wm(
1838 	mdsetname_t	*sp,
1839 	mdname_t	*compnp,
1840 	sp_ext_node_t	**extlist,
1841 	ext_cmpfunc_t	compare,
1842 	md_error_t	*ep
1843 )
1844 {
1845 	mp_watermark_t	wm;
1846 	mdname_t	*np = NULL;
1847 	mdsetname_t	*spsetp = NULL;
1848 	sp_ext_offset_t	cur_off;
1849 	md_set_desc	*sd;
1850 	int		init = 0;
1851 	mdkey_t		key;
1852 	minor_t		mnum;
1853 
1854 	if (!metaislocalset(sp)) {
1855 		if ((sd = metaget_setdesc(sp, ep)) == NULL)
1856 			return (-1);
1857 	}
1858 
1859 	if ((cur_off = meta_sp_get_start(sp, compnp, ep)) == MD_DISKADDR_ERROR)
1860 		return (-1);
1861 
1862 	for (;;) {
1863 		if (meta_sp_read_wm(sp, compnp, &wm, cur_off, ep) != 0) {
1864 			return (-1);
1865 		}
1866 
1867 		/* get the set and name pointers */
1868 		if (strcmp(wm.wm_setname, MD_SP_LOCALSETNAME) != 0) {
1869 			if ((spsetp = metasetname(wm.wm_setname, ep)) == NULL) {
1870 				return (-1);
1871 			}
1872 		}
1873 
1874 		/*
1875 		 * For the MN set, meta_init_make_device needs to
1876 		 * be run on all the nodes so the entries for the
1877 		 * softpart device name and its comp can be created
1878 		 * in the same order in the replica namespace.  If
1879 		 * we have it run on mdmn_do_iocset then the mddbs
1880 		 * will be out of sync between master node and slave
1881 		 * nodes.
1882 		 */
1883 		if (strcmp(wm.wm_mdname, MD_SP_FREEWMNAME) != 0) {
1884 
1885 			if (!metaislocalset(sp) && MD_MNSET_DESC(sd)) {
1886 				md_mn_msg_addmdname_t	*send_params;
1887 				int			result;
1888 				md_mn_result_t		*resp = NULL;
1889 				int			message_size;
1890 
1891 				message_size =  sizeof (*send_params) +
1892 				    strlen(wm.wm_mdname) + 1;
1893 				send_params = Zalloc(message_size);
1894 				send_params->addmdname_setno = sp->setno;
1895 				(void) strcpy(&send_params->addmdname_name[0],
1896 				    wm.wm_mdname);
1897 				result = mdmn_send_message(sp->setno,
1898 				    MD_MN_MSG_ADDMDNAME,
1899 				    MD_MSGF_PANIC_WHEN_INCONSISTENT, 0,
1900 				    (char *)send_params, message_size, &resp,
1901 				    ep);
1902 				Free(send_params);
1903 				if (resp != NULL) {
1904 					if (resp->mmr_exitval != 0) {
1905 						free_result(resp);
1906 						return (-1);
1907 					}
1908 					free_result(resp);
1909 				}
1910 				if (result != 0)
1911 					return (-1);
1912 			} else {
1913 
1914 				if (!is_existing_meta_hsp(sp, wm.wm_mdname)) {
1915 					if ((key = meta_init_make_device(&sp,
1916 					    wm.wm_mdname, ep)) <= 0) {
1917 						return (-1);
1918 					}
1919 					init = 1;
1920 				}
1921 			}
1922 
1923 			np = metaname(&spsetp, wm.wm_mdname, META_DEVICE, ep);
1924 			if (np == NULL) {
1925 				if (init) {
1926 					if (meta_getnmentbykey(sp->setno,
1927 					    MD_SIDEWILD, key, NULL, &mnum,
1928 					    NULL, ep) != NULL) {
1929 						(void) metaioctl(MD_IOCREM_DEV,
1930 						    &mnum, ep, NULL);
1931 					}
1932 					(void) del_self_name(sp, key, ep);
1933 				}
1934 				return (-1);
1935 			}
1936 		}
1937 
1938 		/* insert watermark into extent list */
1939 		meta_sp_list_insert(spsetp, np, extlist, cur_off,
1940 		    wm.wm_length + MD_SP_WMSIZE, wm.wm_type, wm.wm_seq,
1941 		    EXTFLG_UPDATE, compare);
1942 
1943 		/* if we see the end watermark, we're done */
1944 		if (wm.wm_type == EXTTYP_END)
1945 			break;
1946 
1947 		cur_off += wm.wm_length + 1;
1948 
1949 		/* clear out set and name pointers for next iteration */
1950 		np = NULL;
1951 		spsetp = NULL;
1952 	}
1953 
1954 	return (0);
1955 }
1956 
1957 /*
1958  * **************************************************************************
1959  *                        Print (metastat) Functions                        *
1960  * **************************************************************************
1961  */
1962 
1963 /*
1964  * FUNCTION:	meta_sp_short_print()
1965  * INPUT:	msp	- the unit structure to display
1966  *		fp	- the file pointer to send output to
1967  *		options	- print options from the command line processor
1968  * OUTPUT:	ep	- return error pointer
1969  * RETURNS:	int	- -1 if error, 0 on success
1970  * PURPOSE:	display a short report of the soft partition in md.tab
1971  *		form, primarily used for metastat -p.
1972  */
1973 static int
1974 meta_sp_short_print(
1975 	md_sp_t		*msp,
1976 	char		*fname,
1977 	FILE		*fp,
1978 	mdprtopts_t	options,
1979 	md_error_t	*ep
1980 )
1981 {
1982 	int	extn;
1983 
1984 	if (options & PRINT_LARGEDEVICES) {
1985 		if ((msp->common.revision & MD_64BIT_META_DEV) == 0)
1986 			return (0);
1987 	}
1988 
1989 	if (options & PRINT_FN) {
1990 		if ((msp->common.revision & MD_FN_META_DEV) == 0)
1991 			return (0);
1992 	}
1993 
1994 	/* print name and -p */
1995 	if (fprintf(fp, "%s -p", msp->common.namep->cname) == EOF)
1996 		return (mdsyserror(ep, errno, fname));
1997 
1998 	/* print the component */
1999 	/*
2000 	 * Always print the full path name
2001 	 */
2002 	if (fprintf(fp, " %s", msp->compnamep->rname) == EOF)
2003 		return (mdsyserror(ep, errno, fname));
2004 
2005 	/* print out each extent */
2006 	for (extn = 0; (extn < msp->ext.ext_len); extn++) {
2007 		md_sp_ext_t	*extp = &msp->ext.ext_val[extn];
2008 		if (fprintf(fp, " -o %llu -b %llu ", extp->poff,
2009 		    extp->len) == EOF)
2010 			return (mdsyserror(ep, errno, fname));
2011 	}
2012 
2013 	if (fprintf(fp, "\n") == EOF)
2014 		return (mdsyserror(ep, errno, fname));
2015 
2016 	/* success */
2017 	return (0);
2018 }
2019 
2020 /*
2021  * FUNCTION:	meta_sp_status_to_name()
2022  * INPUT:	xsp_status	- the status value to convert to a string
2023  *		tstate		- transient errored device state. If set the
2024  *				  device is Unavailable
2025  * OUTPUT:	none
2026  * RETURNS:	char *	- a pointer to the string representing the status value
2027  * PURPOSE:	return an internationalized string representing the
2028  *		status value for a soft partition.  The strings are
2029  *		strdup'd and must be freed by the caller.
2030  */
2031 static char *
2032 meta_sp_status_to_name(
2033 	xsp_status_t	xsp_status,
2034 	uint_t		tstate
2035 )
2036 {
2037 	char *rval = NULL;
2038 
2039 	/*
2040 	 * Check to see if we have MD_INACCESSIBLE set. This is the only valid
2041 	 * value for an 'Unavailable' return. tstate can be set because of
2042 	 * other multi-node reasons (e.g. ABR being set)
2043 	 */
2044 	if (tstate & MD_INACCESSIBLE) {
2045 		return (Strdup(dgettext(TEXT_DOMAIN, "Unavailable")));
2046 	}
2047 
2048 	switch (xsp_status) {
2049 	case MD_SP_CREATEPEND:
2050 		rval = Strdup(dgettext(TEXT_DOMAIN, "Creating"));
2051 		break;
2052 	case MD_SP_GROWPEND:
2053 		rval = Strdup(dgettext(TEXT_DOMAIN, "Growing"));
2054 		break;
2055 	case MD_SP_DELPEND:
2056 		rval = Strdup(dgettext(TEXT_DOMAIN, "Deleting"));
2057 		break;
2058 	case MD_SP_OK:
2059 		rval = Strdup(dgettext(TEXT_DOMAIN, "Okay"));
2060 		break;
2061 	case MD_SP_ERR:
2062 		rval = Strdup(dgettext(TEXT_DOMAIN, "Errored"));
2063 		break;
2064 	case MD_SP_RECOVER:
2065 		rval = Strdup(dgettext(TEXT_DOMAIN, "Recovering"));
2066 		break;
2067 	}
2068 
2069 	if (rval == NULL)
2070 		rval = Strdup(dgettext(TEXT_DOMAIN, "Invalid"));
2071 
2072 	return (rval);
2073 }
2074 
2075 /*
2076  * FUNCTION:	meta_sp_report()
2077  * INPUT:	sp	- the set name for the unit being displayed
2078  *		msp	- the unit structure to display
2079  *		nlpp	- pass back the large devs
2080  *		fp	- the file pointer to send output to
2081  *		options	- print options from the command line processor
2082  * OUTPUT:	ep	- return error pointer
2083  * RETURNS:	int	- -1 if error, 0 on success
2084  * PURPOSE:	print a full report of the device specified
2085  */
2086 static int
2087 meta_sp_report(
2088 	mdsetname_t	*sp,
2089 	md_sp_t		*msp,
2090 	mdnamelist_t	**nlpp,
2091 	char		*fname,
2092 	FILE		*fp,
2093 	mdprtopts_t	options,
2094 	md_error_t	*ep
2095 )
2096 {
2097 	uint_t		extn;
2098 	char		*status;
2099 	char		*devid = "";
2100 	mdname_t	*didnp = NULL;
2101 	ddi_devid_t	dtp;
2102 	int		len;
2103 	uint_t		tstate = 0;
2104 
2105 	if (options & PRINT_LARGEDEVICES) {
2106 		if ((msp->common.revision & MD_64BIT_META_DEV) == 0) {
2107 			return (0);
2108 		} else {
2109 			if (meta_getdevs(sp, msp->common.namep, nlpp, ep) != 0)
2110 				return (-1);
2111 		}
2112 	}
2113 
2114 	if (options & PRINT_FN) {
2115 		if ((msp->common.revision & MD_FN_META_DEV) == 0) {
2116 			return (0);
2117 		} else {
2118 			if (meta_getdevs(sp, msp->common.namep, nlpp, ep) != 0)
2119 				return (-1);
2120 		}
2121 	}
2122 
2123 	if (options & PRINT_HEADER) {
2124 		if (fprintf(fp, dgettext(TEXT_DOMAIN, "%s: Soft Partition\n"),
2125 		    msp->common.namep->cname) == EOF)
2126 			return (mdsyserror(ep, errno, fname));
2127 	}
2128 
2129 	if (fprintf(fp, dgettext(TEXT_DOMAIN, "    Device: %s\n"),
2130 	    msp->compnamep->cname) == EOF)
2131 		return (mdsyserror(ep, errno, fname));
2132 
2133 	/* Determine if device is available before displaying status */
2134 	if (metaismeta(msp->common.namep)) {
2135 		if (meta_get_tstate(msp->common.namep->dev, &tstate, ep) != 0)
2136 			return (-1);
2137 	}
2138 	status = meta_sp_status_to_name(msp->status, tstate & MD_DEV_ERRORED);
2139 
2140 	/* print out "State" to be consistent with other metadevices */
2141 	if (tstate & MD_ABR_CAP) {
2142 		if (fprintf(fp, dgettext(TEXT_DOMAIN,
2143 		    "    State: %s - Application Based Recovery (ABR)\n"),
2144 		    status) == EOF) {
2145 			Free(status);
2146 			return (mdsyserror(ep, errno, fname));
2147 		}
2148 	} else {
2149 		if (fprintf(fp, dgettext(TEXT_DOMAIN,
2150 		    "    State: %s\n"), status) == EOF) {
2151 			Free(status);
2152 			return (mdsyserror(ep, errno, fname));
2153 		}
2154 	}
2155 	free(status);
2156 
2157 	if (fprintf(fp, dgettext(TEXT_DOMAIN, "    Size: %llu blocks (%s)\n"),
2158 	    msp->common.size,
2159 	    meta_number_to_string(msp->common.size, DEV_BSIZE)) == EOF)
2160 		return (mdsyserror(ep, errno, fname));
2161 
2162 	/* print component details */
2163 	if (! metaismeta(msp->compnamep)) {
2164 		diskaddr_t	start_blk;
2165 		int		has_mddb;
2166 		char		*has_mddb_str;
2167 
2168 		/* print header */
2169 		/*
2170 		 * Building a format string on the fly that will
2171 		 * be used in (f)printf. This allows the length
2172 		 * of the ctd to vary from small to large without
2173 		 * looking horrible.
2174 		 */
2175 		len = strlen(msp->compnamep->cname);
2176 		len = max(len, strlen(dgettext(TEXT_DOMAIN, "Device")));
2177 		len += 2;
2178 		if (fprintf(fp,
2179 		    "\t%-*.*s %-12.12s %-5.5s %s\n",
2180 		    len, len,
2181 		    dgettext(TEXT_DOMAIN, "Device"),
2182 		    dgettext(TEXT_DOMAIN, "Start Block"),
2183 		    dgettext(TEXT_DOMAIN, "Dbase"),
2184 		    dgettext(TEXT_DOMAIN, "Reloc")) == EOF) {
2185 			return (mdsyserror(ep, errno, fname));
2186 		}
2187 
2188 
2189 		/* get info */
2190 		if ((start_blk = meta_sp_get_start(sp, msp->compnamep, ep)) ==
2191 		    MD_DISKADDR_ERROR)
2192 			return (-1);
2193 
2194 		if ((has_mddb = metahasmddb(sp, msp->compnamep, ep)) < 0)
2195 			return (-1);
2196 
2197 		if (has_mddb)
2198 			has_mddb_str = dgettext(TEXT_DOMAIN, "Yes");
2199 		else
2200 			has_mddb_str = dgettext(TEXT_DOMAIN, "No");
2201 
2202 		/* populate the key in the name_p structure */
2203 		didnp = metadevname(&sp, msp->compnamep->dev, ep);
2204 		if (didnp == NULL) {
2205 			return (-1);
2206 		}
2207 
2208 		/* determine if devid does NOT exist */
2209 		if (options & PRINT_DEVID) {
2210 			if ((dtp = meta_getdidbykey(sp->setno,
2211 			    getmyside(sp, ep), didnp->key, ep)) == NULL)
2212 				devid = dgettext(TEXT_DOMAIN, "No ");
2213 			else {
2214 				devid = dgettext(TEXT_DOMAIN, "Yes");
2215 				free(dtp);
2216 			}
2217 		}
2218 
2219 		/* print info */
2220 		/*
2221 		 * This allows the length
2222 		 * of the ctd to vary from small to large without
2223 		 * looking horrible.
2224 		 */
2225 		if (fprintf(fp, "\t%-*s %8lld     %-5.5s %s\n",
2226 		    len, msp->compnamep->cname,
2227 		    start_blk, has_mddb_str, devid) == EOF) {
2228 			return (mdsyserror(ep, errno, fname));
2229 		}
2230 		(void) fprintf(fp, "\n");
2231 	}
2232 
2233 
2234 	/* print the headers */
2235 	if (fprintf(fp, "\t%6.6s %24.24s %24.24s\n",
2236 	    dgettext(TEXT_DOMAIN, "Extent"),
2237 	    dgettext(TEXT_DOMAIN, "Start Block"),
2238 	    dgettext(TEXT_DOMAIN, "Block count")) == EOF)
2239 		return (mdsyserror(ep, errno, fname));
2240 
2241 	/* print out each extent */
2242 	for (extn = 0; (extn < msp->ext.ext_len); extn++) {
2243 		md_sp_ext_t	*extp = &msp->ext.ext_val[extn];
2244 
2245 		/* If PRINT_TIMES option is ever supported, add output here */
2246 		if (fprintf(fp, "\t%6u %24llu %24llu\n",
2247 		    extn, extp->poff, extp->len) == EOF)
2248 			return (mdsyserror(ep, errno, fname));
2249 	}
2250 
2251 	/* separate records with a newline */
2252 	(void) fprintf(fp, "\n");
2253 	return (0);
2254 }
2255 
2256 /*
2257  * FUNCTION:	meta_sp_print()
2258  * INPUT:	sp	- the set name for the unit being displayed
2259  *		np	- the name of the device to print
2260  *		fname	- ??? not used
2261  *		fp	- the file pointer to send output to
2262  *		options	- print options from the command line processor
2263  * OUTPUT:	ep	- return error pointer
2264  * RETURNS:	int	- -1 if error, 0 on success
2265  * PURPOSE:	print a full report of the device specified by metastat.
2266  *		This is the main entry point for printing.
2267  */
2268 int
2269 meta_sp_print(
2270 	mdsetname_t	*sp,
2271 	mdname_t	*np,
2272 	mdnamelist_t	**nlpp,
2273 	char		*fname,
2274 	FILE		*fp,
2275 	mdprtopts_t	options,
2276 	md_error_t	*ep
2277 )
2278 {
2279 	md_sp_t		*msp;
2280 	md_unit_t	*mdp;
2281 	int		rval = 0;
2282 
2283 	/* should always have the same set */
2284 	assert(sp != NULL);
2285 
2286 	/* print all the soft partitions */
2287 	if (np == NULL) {
2288 		mdnamelist_t	*nlp = NULL;
2289 		mdnamelist_t	*p;
2290 		int		cnt;
2291 
2292 		if ((cnt = meta_get_sp_names(sp, &nlp, options, ep)) < 0)
2293 			return (-1);
2294 		else if (cnt == 0)
2295 			return (0);
2296 
2297 		/* recusively print them out */
2298 		for (p = nlp; (p != NULL); p = p->next) {
2299 			mdname_t	*curnp = p->namep;
2300 
2301 			/*
2302 			 * one problem with the rval of -1 here is that
2303 			 * the error gets "lost" when the next device is
2304 			 * printed, but we want to print them all anyway.
2305 			 */
2306 			rval = meta_sp_print(sp, curnp, nlpp, fname, fp,
2307 			    options, ep);
2308 		}
2309 
2310 		/* clean up, return success */
2311 		metafreenamelist(nlp);
2312 		return (rval);
2313 	}
2314 
2315 	/* get the unit structure */
2316 	if ((msp = meta_get_sp_common(sp, np,
2317 	    ((options & PRINT_FAST) ? 1 : 0), ep)) == NULL)
2318 		return (-1);
2319 
2320 	/* check for parented */
2321 	if ((! (options & PRINT_SUBDEVS)) &&
2322 	    (MD_HAS_PARENT(msp->common.parent))) {
2323 		return (0);
2324 	}
2325 
2326 	/* print appropriate detail */
2327 	if (options & PRINT_SHORT) {
2328 		if (meta_sp_short_print(msp, fname, fp, options, ep) != 0)
2329 			return (-1);
2330 	} else {
2331 		if (meta_sp_report(sp, msp, nlpp, fname, fp, options, ep) != 0)
2332 			return (-1);
2333 	}
2334 
2335 	/*
2336 	 * Print underlying metadevices if they are parented to us and
2337 	 * if the info for the underlying metadevice has not been printed.
2338 	 */
2339 	if (metaismeta(msp->compnamep)) {
2340 		/* get the unit structure for the subdevice */
2341 		if ((mdp = meta_get_mdunit(sp, msp->compnamep, ep)) == NULL)
2342 			return (-1);
2343 
2344 		/* If info not already printed, recurse */
2345 		if (!BT_TEST(sp_parent_printed, MD_MIN2UNIT(MD_SID(mdp)))) {
2346 			if (meta_print_name(sp, msp->compnamep, nlpp, fname, fp,
2347 			    (options | PRINT_HEADER | PRINT_SUBDEVS),
2348 			    NULL, ep) != 0) {
2349 				return (-1);
2350 			}
2351 			BT_SET(sp_parent_printed, MD_MIN2UNIT(MD_SID(mdp)));
2352 		}
2353 	}
2354 	return (0);
2355 }
2356 
2357 /*
2358  * **************************************************************************
2359  *                     Watermark Manipulation Functions                     *
2360  * **************************************************************************
2361  */
2362 
2363 /*
2364  * FUNCTION:	meta_sp_get_start()
2365  * INPUT:	sp	- the operating set
2366  *		np 	- device upon which the sp is being built
2367  * OUTPUT:	ep	- return error pointer
2368  * RETURNS:	daddr_t	- -1 if error, otherwise the start block
2369  * PURPOSE:	Encapsulate the determination of the start block of the
2370  *		device upon which the sp is built or being built.
2371  */
2372 static diskaddr_t
2373 meta_sp_get_start(
2374 	mdsetname_t	*sp,
2375 	mdname_t	*np,
2376 	md_error_t	*ep
2377 )
2378 {
2379 	daddr_t		start_block;
2380 
2381 	if ((start_block = metagetstart(sp, np, ep)) != MD_DISKADDR_ERROR)
2382 		start_block += MD_SP_START;
2383 
2384 	return (start_block);
2385 }
2386 
2387 /*
2388  * FUNCTION:	meta_sp_update_wm_common()
2389  * INPUT:	sp	- the operating set
2390  *		msp	- a pointer to the XDR unit structure
2391  *		extlist	- the extent list specifying watermarks to update
2392  *		iocval	- either MD_IOC_SPUPDATEWM or MD_MN_IOC_SPUPDATEWM
2393  * OUTPUT:	ep	- return error pointer
2394  * RETURNS:	int	- -1 if error, 0 on success
2395  * PURPOSE:	steps backwards through the extent list updating
2396  *		watermarks for all extents with the EXTFLG_UPDATE flag
2397  *		set.  Writing the watermarks guarantees consistency when
2398  *		extents must be broken into pieces since the original
2399  *		watermark will be the last to be updated, and will be
2400  *		changed to point to a new watermark that is already
2401  *		known to be consistent.  If one of the writes fails, the
2402  *		original watermark stays intact and none of the changes
2403  *		are realized.
2404  */
2405 static int
2406 meta_sp_update_wm_common(
2407 	mdsetname_t	*sp,
2408 	md_sp_t		*msp,
2409 	sp_ext_node_t	*extlist,
2410 	int		iocval,
2411 	md_error_t	*ep
2412 )
2413 {
2414 	sp_ext_node_t	*ext;
2415 	sp_ext_node_t	*tail;
2416 	mp_watermark_t	*wmp, *watermarks;
2417 	xsp_offset_t	*osp, *offsets;
2418 	int		update_count = 0;
2419 	int		rval = 0;
2420 	md_unit_t	*mdp;
2421 	md_sp_update_wm_t	update_params;
2422 
2423 	if (getenv(META_SP_DEBUG)) {
2424 		meta_sp_debug("meta_sp_update_wm: Updating watermarks:\n");
2425 		meta_sp_list_dump(extlist);
2426 	}
2427 
2428 	/*
2429 	 * find the last node so we can write the watermarks backwards
2430 	 * and count watermarks to update so we can allocate space
2431 	 */
2432 	for (ext = extlist; ext != NULL; ext = ext->ext_next) {
2433 		if ((ext->ext_flags & EXTFLG_UPDATE) != 0) {
2434 			update_count++;
2435 		}
2436 
2437 		if (ext->ext_next == NULL) {
2438 			tail = ext;
2439 		}
2440 	}
2441 	ext = tail;
2442 
2443 	wmp = watermarks =
2444 	    Zalloc(update_count * sizeof (mp_watermark_t));
2445 	osp = offsets =
2446 	    Zalloc(update_count * sizeof (sp_ext_offset_t));
2447 
2448 	while (ext != NULL) {
2449 		if ((ext->ext_flags & EXTFLG_UPDATE) != 0) {
2450 			/* update watermark */
2451 			wmp->wm_magic = MD_SP_MAGIC;
2452 			wmp->wm_version = MD_SP_VERSION;
2453 			wmp->wm_type = ext->ext_type;
2454 			wmp->wm_seq = ext->ext_seq;
2455 			wmp->wm_length = ext->ext_length - MD_SP_WMSIZE;
2456 
2457 			/* fill in the volume name and set name */
2458 			if (ext->ext_namep != NULL)
2459 				(void) strcpy(wmp->wm_mdname,
2460 				    ext->ext_namep->cname);
2461 			else
2462 				(void) strcpy(wmp->wm_mdname, MD_SP_FREEWMNAME);
2463 			if (ext->ext_setp != NULL &&
2464 			    ext->ext_setp->setno != MD_LOCAL_SET)
2465 				(void) strcpy(wmp->wm_setname,
2466 				    ext->ext_setp->setname);
2467 			else
2468 				(void) strcpy(wmp->wm_setname,
2469 				    MD_SP_LOCALSETNAME);
2470 
2471 			/* Generate the checksum */
2472 			wmp->wm_checksum = 0;
2473 			crcgen((uchar_t *)wmp, (uint_t *)&wmp->wm_checksum,
2474 			    sizeof (*wmp), NULL);
2475 
2476 			/* record the extent offset */
2477 			*osp = ext->ext_offset;
2478 
2479 			/* Advance the placeholders */
2480 			osp++; wmp++;
2481 		}
2482 		ext = ext->ext_prev;
2483 	}
2484 
2485 	mdp = meta_get_mdunit(sp, msp->common.namep, ep);
2486 	if (mdp == NULL) {
2487 		rval = -1;
2488 		goto out;
2489 	}
2490 
2491 	(void) memset(&update_params, 0, sizeof (update_params));
2492 	update_params.mnum = MD_SID(mdp);
2493 	update_params.count = update_count;
2494 	update_params.wmp = (uintptr_t)watermarks;
2495 	update_params.osp = (uintptr_t)offsets;
2496 	MD_SETDRIVERNAME(&update_params, MD_SP,
2497 	    MD_MIN2SET(update_params.mnum));
2498 
2499 	if (metaioctl(iocval, &update_params, &update_params.mde,
2500 	    msp->common.namep->cname) != 0) {
2501 		(void) mdstealerror(ep, &update_params.mde);
2502 		rval = -1;
2503 		goto out;
2504 	}
2505 
2506 out:
2507 	Free(watermarks);
2508 	Free(offsets);
2509 
2510 	return (rval);
2511 }
2512 
2513 static int
2514 meta_sp_update_wm(
2515 	mdsetname_t	*sp,
2516 	md_sp_t		*msp,
2517 	sp_ext_node_t	*extlist,
2518 	md_error_t	*ep
2519 )
2520 {
2521 	return (meta_sp_update_wm_common(sp, msp, extlist, MD_IOC_SPUPDATEWM,
2522 	    ep));
2523 }
2524 
2525 static int
2526 meta_mn_sp_update_wm(
2527 	mdsetname_t	*sp,
2528 	md_sp_t		*msp,
2529 	sp_ext_node_t	*extlist,
2530 	md_error_t	*ep
2531 )
2532 {
2533 	return (meta_sp_update_wm_common(sp, msp, extlist, MD_MN_IOC_SPUPDATEWM,
2534 	    ep));
2535 }
2536 
2537 /*
2538  * FUNCTION:	meta_sp_clear_wm()
2539  * INPUT:	sp	- the operating set
2540  *		msp	- the unit structure for the soft partition to clear
2541  * OUTPUT:	ep	- return error pointer
2542  * RETURNS:	int	- -1 if error, 0 on success
2543  * PURPOSE:	steps through the extents for a soft partition unit and
2544  *		creates an extent list designed to mark all of the
2545  *		watermarks for those extents as free.  The extent list
2546  *		is then passed to meta_sp_update_wm() to actually write
2547  *		the watermarks out.
2548  */
2549 static int
2550 meta_sp_clear_wm(
2551 	mdsetname_t	*sp,
2552 	md_sp_t		*msp,
2553 	md_error_t	*ep
2554 )
2555 {
2556 	sp_ext_node_t	*extlist = NULL;
2557 	int		numexts = msp->ext.ext_len;
2558 	uint_t		i;
2559 	int		rval = 0;
2560 
2561 	/* for each watermark must set the flag to SP_FREE */
2562 	for (i = 0; i < numexts; i++) {
2563 		md_sp_ext_t	*extp = &msp->ext.ext_val[i];
2564 
2565 		meta_sp_list_insert(NULL, NULL, &extlist,
2566 		    extp->poff - MD_SP_WMSIZE, extp->len + MD_SP_WMSIZE,
2567 		    EXTTYP_FREE, 0, EXTFLG_UPDATE, meta_sp_cmp_by_offset);
2568 	}
2569 
2570 	/* update watermarks */
2571 	rval = meta_sp_update_wm(sp, msp, extlist, ep);
2572 
2573 	meta_sp_list_free(&extlist);
2574 	return (rval);
2575 }
2576 
2577 /*
2578  * FUNCTION:	meta_sp_read_wm()
2579  * INPUT:	sp	- setname for component
2580  *		compnp	- mdname_t for component
2581  *		offset	- the offset of the watermark to read (sectors)
2582  * OUTPUT:	wm	- the watermark structure to read into
2583  *		ep	- return error pointer
2584  * RETURNS:	int	- -1 if error, 0 on success
2585  * PURPOSE:	seeks out to the requested offset and reads a watermark.
2586  *		It then verifies that the magic number is correct and
2587  *		that the checksum is valid, returning an error if either
2588  *		is wrong.
2589  */
2590 static int
2591 meta_sp_read_wm(
2592 	mdsetname_t	*sp,
2593 	mdname_t	*compnp,
2594 	mp_watermark_t	*wm,
2595 	sp_ext_offset_t	offset,
2596 	md_error_t	*ep
2597 )
2598 {
2599 	md_sp_read_wm_t	read_params;
2600 
2601 	/*
2602 	 * make sure block offset does not overflow 2^64 bytes and it's a
2603 	 * multiple of the block size.
2604 	 */
2605 	assert(offset <= (1LL << (64 - DEV_BSHIFT)));
2606 	/* LINTED */
2607 	assert((sizeof (*wm) % DEV_BSIZE) == 0);
2608 
2609 	(void) memset(wm, 0, sizeof (*wm));
2610 
2611 	(void) memset(&read_params, 0, sizeof (read_params));
2612 	read_params.rdev = compnp->dev;
2613 	read_params.wmp = (uintptr_t)wm;
2614 	read_params.offset = offset;
2615 	MD_SETDRIVERNAME(&read_params, MD_SP, sp->setno);
2616 
2617 	if (metaioctl(MD_IOC_SPREADWM, &read_params,
2618 	    &read_params.mde, compnp->cname) != 0) {
2619 
2620 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
2621 		    "Extent header read failed, block %llu.\n"), offset);
2622 		return (mdstealerror(ep, &read_params.mde));
2623 	}
2624 
2625 	/* make sure magic number is correct */
2626 	if (wm->wm_magic != MD_SP_MAGIC) {
2627 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
2628 		    "found incorrect magic number %x, expected %x.\n"),
2629 		    wm->wm_magic, MD_SP_MAGIC);
2630 		/*
2631 		 * Pass NULL for the device name as we don't have
2632 		 * valid watermark contents.
2633 		 */
2634 		return (mdmderror(ep, MDE_SP_BADWMMAGIC, 0, NULL));
2635 	}
2636 
2637 	if (crcchk((uchar_t *)wm, (uint_t *)&wm->wm_checksum,
2638 	    sizeof (*wm), NULL)) {
2639 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
2640 		    "found incorrect checksum %x.\n"),
2641 		    wm->wm_checksum);
2642 		return (mdmderror(ep, MDE_SP_BADWMCRC, 0, wm->wm_mdname));
2643 	}
2644 
2645 	return (0);
2646 }
2647 
2648 /*
2649  * **************************************************************************
2650  *                  Query Functions
2651  * **************************************************************************
2652  */
2653 
2654 /*
2655  * IMPORTANT NOTE: This is a static function that assumes that
2656  *		   its input parameters have been checked and
2657  *		   have valid values that lie within acceptable
2658  *		   ranges.
2659  *
2660  * FUNCTION:	meta_sp_enough_space()
2661  * INPUT:	desired_number_of_sps - the number of soft partitions desired;
2662  *					must be > 0
2663  *		desired_sp_size - the desired soft partition size in blocks;
2664  *				  must be > 0
2665  *		extent_listpp - a reference to a reference to an extent
2666  *				list that lists the extents on a device;
2667  *				must be a reference to a reference to a
2668  *				valid extent list
2669  *		alignment - the desired data space alignment for the sp's
2670  * OUTPUT:	boolean_t return value
2671  * RETURNS:	boolean_t - B_TRUE if there's enough space in the extent
2672  *			    list to create the desired soft partitions,
2673  *			    B_FALSE if there's not enough space
2674  * PURPOSE:	determines whether there's enough free space in an extent
2675  *		list to allow creation of a set of soft partitions
2676  */
2677 static boolean_t
2678 meta_sp_enough_space(
2679 	int		desired_number_of_sps,
2680 	blkcnt_t	desired_sp_size,
2681 	sp_ext_node_t	**extent_listpp,
2682 	sp_ext_length_t	alignment
2683 )
2684 {
2685 	boolean_t		enough_space;
2686 	int			number_of_sps;
2687 	int			number_of_extents_used;
2688 	sp_ext_length_t		desired_ext_length = desired_sp_size;
2689 
2690 	enough_space = B_TRUE;
2691 	number_of_sps = 0;
2692 	while ((enough_space == B_TRUE) &&
2693 	    (number_of_sps < desired_number_of_sps)) {
2694 		/*
2695 		 * Use the extent allocation algorithm implemented by
2696 		 * meta_sp_alloc_by_len() to test whether the free
2697 		 * extents in the extent list referenced by *extent_listpp
2698 		 * contain enough space to accomodate a soft partition
2699 		 * of size desired_ext_length.
2700 		 *
2701 		 * Repeat the test <desired_number_of_sps> times
2702 		 * or until it fails, whichever comes first,
2703 		 * each time allocating the extents required to
2704 		 * create the soft partition without actually
2705 		 * creating the soft partition.
2706 		 */
2707 		number_of_extents_used = meta_sp_alloc_by_len(
2708 		    TEST_SETNAMEP, TEST_SOFT_PARTITION_NAMEP,
2709 		    extent_listpp, &desired_ext_length,
2710 		    NO_OFFSET, alignment);
2711 		if (number_of_extents_used == -1) {
2712 			enough_space = B_FALSE;
2713 		} else {
2714 			number_of_sps++;
2715 		}
2716 	}
2717 	return (enough_space);
2718 }
2719 
2720 /*
2721  * IMPORTANT NOTE: This is a static function that calls other functions
2722  *		   that check its mdsetnamep and device_mdnamep
2723  *		   input parameters, but expects extent_listpp to
2724  *		   be a initialized to a valid address to which
2725  *		   it can write a reference to the extent list that
2726  *		   it creates.
2727  *
2728  * FUNCTION:	meta_sp_get_extent_list()
2729  * INPUT:	mdsetnamep - a reference to the mdsetname_t structure
2730  *			     for the set containing the device for
2731  *			     which the extents are to be listed
2732  *		device_mdnamep - a reference to the mdname_t structure
2733  *				 for the device for which the extents
2734  *				 are to be listed
2735  * OUTPUT:	*extent_listpp - a reference to the extent list for
2736  *				 the device; NULL if the function fails
2737  *		*ep - the libmeta error encountered, if any
2738  * RETURNS:	boolean_t - B_TRUE if the function call was successful,
2739  *			    B_FALSE if not
2740  * PURPOSE:	gets the extent list for a device
2741  */
2742 static boolean_t
2743 meta_sp_get_extent_list(
2744 	mdsetname_t	*mdsetnamep,
2745 	mdname_t	*device_mdnamep,
2746 	sp_ext_node_t	**extent_listpp,
2747 	md_error_t	*ep
2748 )
2749 {
2750 	diskaddr_t		device_size_in_blocks;
2751 	mdnamelist_t		*sp_name_listp;
2752 	diskaddr_t		start_block_address_in_blocks;
2753 
2754 	*extent_listpp = NULL;
2755 	sp_name_listp = NULL;
2756 
2757 	start_block_address_in_blocks = meta_sp_get_start(mdsetnamep,
2758 	    device_mdnamep, ep);
2759 	if (start_block_address_in_blocks == MD_DISKADDR_ERROR) {
2760 		if (getenv(META_SP_DEBUG)) {
2761 			mde_perror(ep,
2762 			    "meta_sp_get_extent_list:meta_sp_get_start");
2763 		}
2764 		return (B_FALSE);
2765 	}
2766 
2767 	device_size_in_blocks = metagetsize(device_mdnamep, ep);
2768 	if (device_size_in_blocks == MD_DISKADDR_ERROR) {
2769 		if (getenv(META_SP_DEBUG)) {
2770 			mde_perror(ep,
2771 			    "meta_sp_get_extent_list:metagetsize");
2772 		}
2773 		return (B_FALSE);
2774 	}
2775 
2776 	/*
2777 	 * Sanity check: the start block will have skipped an integer
2778 	 * number of cylinders, C.  C will usually be zero.  If (C > 0),
2779 	 * and the disk slice happens to only be C cylinders in total
2780 	 * size, we'll fail this check.
2781 	 */
2782 	if (device_size_in_blocks <=
2783 	    (start_block_address_in_blocks + MD_SP_WMSIZE)) {
2784 		(void) mdmderror(ep, MDE_SP_NOSPACE, 0, device_mdnamep->cname);
2785 		return (B_FALSE);
2786 	}
2787 
2788 	/*
2789 	 * After this point, we will have allocated resources, so any
2790 	 * failure returns must be through the supplied "fail" label
2791 	 * to properly deallocate things.
2792 	 */
2793 
2794 	/*
2795 	 * Create an empty extent list that starts one watermark past
2796 	 * the start block of the device and ends one watermark before
2797 	 * the end of the device.
2798 	 */
2799 	meta_sp_list_insert(TEST_SETNAMEP, TEST_SOFT_PARTITION_NAMEP,
2800 	    extent_listpp, NO_OFFSET,
2801 	    (sp_ext_length_t)start_block_address_in_blocks,
2802 	    EXTTYP_RESERVED, NO_SEQUENCE_NUMBER, NO_FLAGS,
2803 	    meta_sp_cmp_by_offset);
2804 	meta_sp_list_insert(TEST_SETNAMEP, TEST_SOFT_PARTITION_NAMEP,
2805 	    extent_listpp, (sp_ext_offset_t)(device_size_in_blocks -
2806 	    MD_SP_WMSIZE), MD_SP_WMSIZE, EXTTYP_END, NO_SEQUENCE_NUMBER,
2807 	    NO_FLAGS, meta_sp_cmp_by_offset);
2808 
2809 	/*
2810 	 * Get the list of soft partitions that are already on the
2811 	 * device.
2812 	 */
2813 	if (meta_sp_get_by_component(mdsetnamep, device_mdnamep,
2814 	    &sp_name_listp, FORCE_RELOAD_CACHE, ep) < 1) {
2815 		if (getenv(META_SP_DEBUG)) {
2816 			mde_perror(ep,
2817 			    "meta_sp_get_extent_list:meta_sp_get_by_component");
2818 		}
2819 		goto fail;
2820 	}
2821 
2822 	if (sp_name_listp != NULL) {
2823 		/*
2824 		 * If there are soft partitions on the device, add the
2825 		 * extents used in them to the extent list.
2826 		 */
2827 		if (meta_sp_extlist_from_namelist(mdsetnamep, sp_name_listp,
2828 		    extent_listpp, ep) == -1) {
2829 			if (getenv(META_SP_DEBUG)) {
2830 				mde_perror(ep, "meta_sp_get_extent_list:"
2831 				    "meta_sp_extlist_from_namelist");
2832 			}
2833 			goto fail;
2834 		}
2835 		metafreenamelist(sp_name_listp);
2836 	}
2837 
2838 	/*
2839 	 * Add free extents to the extent list to represent
2840 	 * the remaining regions of free space on the
2841 	 * device.
2842 	 */
2843 	meta_sp_list_freefill(extent_listpp, device_size_in_blocks);
2844 	return (B_TRUE);
2845 
2846 fail:
2847 	if (sp_name_listp != NULL) {
2848 		metafreenamelist(sp_name_listp);
2849 	}
2850 
2851 	if (*extent_listpp != NULL) {
2852 		/*
2853 		 * meta_sp_list_free sets *extent_listpp to NULL.
2854 		 */
2855 		meta_sp_list_free(extent_listpp);
2856 	}
2857 	return (B_FALSE);
2858 }
2859 
2860 /*
2861  * IMPORTANT NOTE: This is a static function that calls other functions
2862  *		   that check its mdsetnamep and mddrivenamep
2863  *		   input parameters, but expects extent_listpp to
2864  *		   be a initialized to a valid address to which
2865  *		   it can write a reference to the extent list that
2866  *		   it creates.
2867  *
2868  * FUNCTION:	meta_sp_get_extent_list_for_drive()
2869  * INPUT:	mdsetnamep - a reference to the mdsetname_t structure
2870  *			     for the set containing the drive for
2871  *			     which the extents are to be listed
2872  *		mddrivenamep   - a reference to the mddrivename_t structure
2873  *				 for the drive for which the extents
2874  *				 are to be listed
2875  * OUTPUT:	*extent_listpp - a reference to the extent list for
2876  *				 the drive; NULL if the function fails
2877  * RETURNS:	boolean_t - B_TRUE if the function call was successful,
2878  *			    B_FALSE if not
2879  * PURPOSE:	gets the extent list for a drive when the entire drive
2880  *		is to be soft partitioned
2881  */
2882 static boolean_t
2883 meta_sp_get_extent_list_for_drive(
2884 	mdsetname_t	*mdsetnamep,
2885 	mddrivename_t	*mddrivenamep,
2886 	sp_ext_node_t	**extent_listpp
2887 )
2888 {
2889 	boolean_t		can_use;
2890 	diskaddr_t		free_space;
2891 	md_error_t		mderror;
2892 	mdvtoc_t		proposed_vtoc;
2893 	int			repartition_options;
2894 	int			return_value;
2895 	md_sp_t			test_sp_struct;
2896 
2897 	can_use = B_TRUE;
2898 	*extent_listpp = NULL;
2899 	mderror = mdnullerror;
2900 	test_sp_struct.compnamep = metaslicename(mddrivenamep, MD_SLICE0,
2901 	    &mderror);
2902 	if (test_sp_struct.compnamep == NULL) {
2903 		can_use = B_FALSE;
2904 	}
2905 
2906 	if (can_use == B_TRUE) {
2907 		mderror = mdnullerror;
2908 		repartition_options = 0;
2909 		return_value = meta_check_sp(mdsetnamep, &test_sp_struct,
2910 		    MDCMD_USE_WHOLE_DISK, &repartition_options, &mderror);
2911 		if (return_value != 0) {
2912 			can_use = B_FALSE;
2913 		}
2914 	}
2915 
2916 	if (can_use == B_TRUE) {
2917 		mderror = mdnullerror;
2918 		repartition_options = repartition_options |
2919 		    (MD_REPART_FORCE | MD_REPART_DONT_LABEL);
2920 		return_value = meta_repartition_drive(mdsetnamep, mddrivenamep,
2921 		    repartition_options, &proposed_vtoc, &mderror);
2922 		if (return_value != 0) {
2923 			can_use = B_FALSE;
2924 		}
2925 	}
2926 
2927 	if (can_use == B_TRUE) {
2928 		free_space = proposed_vtoc.parts[MD_SLICE0].size;
2929 		if (free_space <= (MD_SP_START + MD_SP_WMSIZE)) {
2930 			can_use = B_FALSE;
2931 		}
2932 	}
2933 
2934 	if (can_use == B_TRUE) {
2935 		/*
2936 		 * Create an extent list that starts with
2937 		 * a reserved extent that ends at the start
2938 		 * of the usable space on slice zero of the
2939 		 * proposed VTOC, ends with an extent that
2940 		 * reserves space for a watermark at the end
2941 		 * of slice zero, and contains a single free
2942 		 * extent that occupies the rest of the space
2943 		 * on the slice.
2944 		 *
2945 		 * NOTE:
2946 		 *
2947 		 * Don't use metagetstart() or metagetsize() to
2948 		 * find the usable space.  They query the mdname_t
2949 		 * structure that represents an actual device to
2950 		 * determine the amount of space on the device that
2951 		 * contains metadata and the total amount of space
2952 		 * on the device.  Since this function creates a
2953 		 * proposed extent list that doesn't reflect the
2954 		 * state of an actual device, there's no mdname_t
2955 		 * structure to be queried.
2956 		 *
2957 		 * When a drive is reformatted to prepare for
2958 		 * soft partitioning, all of slice seven is
2959 		 * reserved for metadata, all of slice zero is
2960 		 * available for soft partitioning, and all other
2961 		 * slices on the drive are empty.  The proposed
2962 		 * extent list for the drive therefore contains
2963 		 * only three extents: a reserved extent that ends
2964 		 * at the start of the usable space on slice zero,
2965 		 * a single free extent that occupies all the usable
2966 		 * space on slice zero, and an ending extent that
2967 		 * reserves space for a watermark at the end of
2968 		 * slice zero.
2969 		 */
2970 		meta_sp_list_insert(TEST_SETNAMEP, TEST_SOFT_PARTITION_NAMEP,
2971 		    extent_listpp, NO_OFFSET, (sp_ext_length_t)(MD_SP_START),
2972 		    EXTTYP_RESERVED, NO_SEQUENCE_NUMBER, NO_FLAGS,
2973 		    meta_sp_cmp_by_offset);
2974 		meta_sp_list_insert(TEST_SETNAMEP, TEST_SOFT_PARTITION_NAMEP,
2975 		    extent_listpp, (sp_ext_offset_t)(free_space - MD_SP_WMSIZE),
2976 		    MD_SP_WMSIZE, EXTTYP_END, NO_SEQUENCE_NUMBER, NO_FLAGS,
2977 		    meta_sp_cmp_by_offset);
2978 		meta_sp_list_freefill(extent_listpp, free_space);
2979 	}
2980 	return (can_use);
2981 }
2982 
2983 /*
2984  * FUNCTION:	meta_sp_can_create_sps()
2985  * INPUT:	mdsetnamep - a reference to the mdsetname_t structure
2986  *			     for the set containing the device for
2987  *			     which the extents are to be listed
2988  *		mdnamep - a reference to the mdname_t of the device
2989  *			  on which the soft parititions are to be created
2990  *		number_of_sps - the desired number of soft partitions
2991  *		sp_size - the desired soft partition size
2992  * OUTPUT:	boolean_t return value
2993  * RETURNS:	boolean_t - B_TRUE if the soft partitionns can be created,
2994  *			    B_FALSE if not
2995  * PURPOSE:	determines whether a set of soft partitions can be created
2996  *		on a device
2997  */
2998 boolean_t
2999 meta_sp_can_create_sps(
3000 	mdsetname_t	*mdsetnamep,
3001 	mdname_t	*mdnamep,
3002 	int		number_of_sps,
3003 	blkcnt_t	sp_size
3004 )
3005 {
3006 	sp_ext_node_t	*extent_listp;
3007 	boolean_t	succeeded;
3008 	md_error_t	mde;
3009 
3010 	if ((number_of_sps > 0) && (sp_size > 0)) {
3011 		succeeded = meta_sp_get_extent_list(mdsetnamep, mdnamep,
3012 		    &extent_listp, &mde);
3013 	} else {
3014 		succeeded = B_FALSE;
3015 	}
3016 
3017 	/*
3018 	 * We don't really care about an error return from the
3019 	 * alignment call; that will just result in passing zero,
3020 	 * which will be interpreted as no alignment.
3021 	 */
3022 
3023 	if (succeeded == B_TRUE) {
3024 		succeeded = meta_sp_enough_space(number_of_sps,
3025 		    sp_size, &extent_listp,
3026 		    meta_sp_get_default_alignment(mdsetnamep, mdnamep, &mde));
3027 		meta_sp_list_free(&extent_listp);
3028 	}
3029 	return (succeeded);
3030 }
3031 
3032 /*
3033  * FUNCTION:	meta_sp_can_create_sps_on_drive()
3034  * INPUT:	mdsetnamep - a reference to the mdsetname_t structure
3035  *			     for the set containing the drive for
3036  *			     which the extents are to be listed
3037  *		mddrivenamep - a reference to the mddrivename_t of the drive
3038  *			       on which the soft parititions are to be created
3039  *		number_of_sps - the desired number of soft partitions
3040  *		sp_size - the desired soft partition size
3041  * OUTPUT:	boolean_t return value
3042  * RETURNS:	boolean_t - B_TRUE if the soft partitionns can be created,
3043  *			    B_FALSE if not
3044  * PURPOSE:	determines whether a set of soft partitions can be created
3045  *		on a drive if the entire drive is soft partitioned
3046  */
3047 boolean_t
3048 meta_sp_can_create_sps_on_drive(
3049 	mdsetname_t	*mdsetnamep,
3050 	mddrivename_t	*mddrivenamep,
3051 	int		number_of_sps,
3052 	blkcnt_t	sp_size
3053 )
3054 {
3055 	sp_ext_node_t	*extent_listp;
3056 	boolean_t	succeeded;
3057 
3058 	if ((number_of_sps > 0) && (sp_size > 0)) {
3059 		succeeded = meta_sp_get_extent_list_for_drive(mdsetnamep,
3060 		    mddrivenamep, &extent_listp);
3061 	} else {
3062 		succeeded = B_FALSE;
3063 	}
3064 
3065 	/*
3066 	 * We don't care about alignment on the space call because
3067 	 * we're specifically dealing with a drive, which will have no
3068 	 * inherent alignment.
3069 	 */
3070 
3071 	if (succeeded == B_TRUE) {
3072 		succeeded = meta_sp_enough_space(number_of_sps, sp_size,
3073 		    &extent_listp, SP_UNALIGNED);
3074 		meta_sp_list_free(&extent_listp);
3075 	}
3076 	return (succeeded);
3077 }
3078 
3079 /*
3080  * FUNCTION:	meta_sp_get_free_space()
3081  * INPUT:	mdsetnamep - a reference to the mdsetname_t structure
3082  *			     for the set containing the device for
3083  *			     which the free space is to be returned
3084  *		mdnamep - a reference to the mdname_t of the device
3085  *			  for which the free space is to be returned
3086  * OUTPUT:	blkcnt_t return value
3087  * RETURNS:	blkcnt_t - the number of blocks of free space on the device
3088  * PURPOSE:	returns the number of blocks of free space on a device
3089  */
3090 blkcnt_t
3091 meta_sp_get_free_space(
3092 	mdsetname_t	*mdsetnamep,
3093 	mdname_t	*mdnamep
3094 )
3095 {
3096 	sp_ext_node_t		*extent_listp;
3097 	sp_ext_length_t		free_blocks;
3098 	boolean_t		succeeded;
3099 	md_error_t		mde;
3100 
3101 	extent_listp = NULL;
3102 	free_blocks = 0;
3103 	succeeded = meta_sp_get_extent_list(mdsetnamep, mdnamep,
3104 	    &extent_listp, &mde);
3105 	if (succeeded == B_TRUE) {
3106 		free_blocks = meta_sp_list_size(extent_listp,
3107 		    EXTTYP_FREE, INCLUDE_WM);
3108 		meta_sp_list_free(&extent_listp);
3109 		if (free_blocks > (10 * MD_SP_WMSIZE)) {
3110 			/*
3111 			 * Subtract a safety margin for watermarks when
3112 			 * computing the number of blocks available for
3113 			 * use.  The actual number of watermarks can't
3114 			 * be calculated without knowing the exact numbers
3115 			 * and sizes of both the free extents and the soft
3116 			 * partitions to be created.  The calculation is
3117 			 * highly complex and error-prone even if those
3118 			 * quantities are known.  The approximate value
3119 			 * 10 * MD_SP_WMSIZE is within a few blocks of the
3120 			 * correct value in all practical cases.
3121 			 */
3122 			free_blocks = free_blocks - (10 * MD_SP_WMSIZE);
3123 		} else {
3124 			free_blocks = 0;
3125 		}
3126 	} else {
3127 		mdclrerror(&mde);
3128 	}
3129 
3130 	return (free_blocks);
3131 }
3132 
3133 /*
3134  * FUNCTION:	meta_sp_get_free_space_on_drive()
3135  * INPUT:	mdsetnamep - a reference to the mdsetname_t structure
3136  *			     for the set containing the drive for
3137  *			     which the free space is to be returned
3138  *		mddrivenamep - a reference to the mddrivename_t of the drive
3139  *			       for which the free space is to be returned
3140  * OUTPUT:	blkcnt_t return value
3141  * RETURNS:	blkcnt_t - the number of blocks of free space on the drive
3142  * PURPOSE:	returns the number of blocks of space usable for soft
3143  *		partitions on an entire drive, if the entire drive is
3144  *		soft partitioned
3145  */
3146 blkcnt_t
3147 meta_sp_get_free_space_on_drive(
3148 	mdsetname_t	*mdsetnamep,
3149 	mddrivename_t	*mddrivenamep
3150 )
3151 {
3152 	sp_ext_node_t		*extent_listp;
3153 	sp_ext_length_t		free_blocks;
3154 	boolean_t		succeeded;
3155 
3156 	extent_listp = NULL;
3157 	free_blocks = 0;
3158 	succeeded = meta_sp_get_extent_list_for_drive(mdsetnamep,
3159 	    mddrivenamep, &extent_listp);
3160 	if (succeeded == B_TRUE) {
3161 		free_blocks = meta_sp_list_size(extent_listp,
3162 		    EXTTYP_FREE, INCLUDE_WM);
3163 		meta_sp_list_free(&extent_listp);
3164 		if (free_blocks > (10 * MD_SP_WMSIZE)) {
3165 			/*
3166 			 * Subtract a safety margin for watermarks when
3167 			 * computing the number of blocks available for
3168 			 * use.  The actual number of watermarks can't
3169 			 * be calculated without knowing the exact numbers
3170 			 * and sizes of both the free extents and the soft
3171 			 * partitions to be created.  The calculation is
3172 			 * highly complex and error-prone even if those
3173 			 * quantities are known.  The approximate value
3174 			 * 10 * MD_SP_WMSIZE is within a few blocks of the
3175 			 * correct value in all practical cases.
3176 			 */
3177 			free_blocks = free_blocks - (10 * MD_SP_WMSIZE);
3178 		} else {
3179 			free_blocks = 0;
3180 		}
3181 	}
3182 	return (free_blocks);
3183 }
3184 
3185 /*
3186  * FUNCTION:	meta_sp_get_number_of_possible_sps()
3187  * INPUT:	mdsetnamep - a reference to the mdsetname_t structure
3188  *			     for the set containing the device for
3189  *			     which the number of possible soft partitions
3190  *			     is to be returned
3191  *		mdnamep - a reference to the mdname_t of the device
3192  *			  for which the number of possible soft partitions
3193  *			  is to be returned
3194  * OUTPUT:	int return value
3195  * RETURNS:	int - the number of soft partitions of the desired size
3196  *		      that can be created on the device
3197  * PURPOSE:	returns the number of soft partitions of a given size
3198  *		that can be created on a device
3199  */
3200 int
3201 meta_sp_get_number_of_possible_sps(
3202 	mdsetname_t	*mdsetnamep,
3203 	mdname_t	*mdnamep,
3204 	blkcnt_t	sp_size
3205 )
3206 {
3207 	sp_ext_node_t	*extent_listp;
3208 	int		number_of_possible_sps;
3209 	boolean_t	succeeded;
3210 	md_error_t	mde;
3211 	sp_ext_length_t	alignment;
3212 
3213 	extent_listp = NULL;
3214 	number_of_possible_sps = 0;
3215 	if (sp_size > 0) {
3216 		if ((succeeded = meta_sp_get_extent_list(mdsetnamep,
3217 		    mdnamep, &extent_listp, &mde)) == B_FALSE)
3218 			mdclrerror(&mde);
3219 	} else {
3220 		succeeded = B_FALSE;
3221 	}
3222 
3223 	if (succeeded == B_TRUE) {
3224 		alignment = meta_sp_get_default_alignment(mdsetnamep,
3225 		    mdnamep, &mde);
3226 	}
3227 
3228 	while (succeeded == B_TRUE) {
3229 		/*
3230 		 * Keep allocating space from the extent list
3231 		 * for soft partitions of the desired size until
3232 		 * there's not enough free space left in the list
3233 		 * for another soft partiition of that size.
3234 		 * Add one to the number of possible soft partitions
3235 		 * for each soft partition for which there is
3236 		 * enough free space left.
3237 		 */
3238 		succeeded = meta_sp_enough_space(ONE_SOFT_PARTITION,
3239 		    sp_size, &extent_listp, alignment);
3240 		if (succeeded == B_TRUE) {
3241 			number_of_possible_sps++;
3242 		}
3243 	}
3244 	if (extent_listp != NULL) {
3245 		meta_sp_list_free(&extent_listp);
3246 	}
3247 	return (number_of_possible_sps);
3248 }
3249 
3250 /*
3251  * FUNCTION:	meta_sp_get_number_of_possible_sps_on_drive()
3252  * INPUT:	mdsetnamep - a reference to the mdsetname_t structure
3253  *			     for the set containing the drive for
3254  *			     which the number of possible soft partitions
3255  *			     is to be returned
3256  *		mddrivenamep - a reference to the mddrivename_t of the drive
3257  *			       for which the number of possible soft partitions
3258  *			       is to be returned
3259  *		sp_size - the size in blocks of the proposed soft partitions
3260  * OUTPUT:	int return value
3261  * RETURNS:	int - the number of soft partitions of the desired size
3262  *		      that can be created on the drive
3263  * PURPOSE:	returns the number of soft partitions of a given size
3264  *		that can be created on a drive, if the entire drive is
3265  *		soft partitioned
3266  */
3267 int
3268 meta_sp_get_number_of_possible_sps_on_drive(
3269 	mdsetname_t	*mdsetnamep,
3270 	mddrivename_t	*mddrivenamep,
3271 	blkcnt_t	sp_size
3272 )
3273 {
3274 	sp_ext_node_t	*extent_listp;
3275 	int		number_of_possible_sps;
3276 	boolean_t	succeeded;
3277 
3278 	extent_listp = NULL;
3279 	number_of_possible_sps = 0;
3280 	if (sp_size > 0) {
3281 		succeeded = meta_sp_get_extent_list_for_drive(mdsetnamep,
3282 		    mddrivenamep, &extent_listp);
3283 	} else {
3284 		succeeded = B_FALSE;
3285 	}
3286 	while (succeeded == B_TRUE) {
3287 		/*
3288 		 * Keep allocating space from the extent list
3289 		 * for soft partitions of the desired size until
3290 		 * there's not enough free space left in the list
3291 		 * for another soft partition of that size.
3292 		 * Add one to the number of possible soft partitions
3293 		 * for each soft partition for which there is
3294 		 * enough free space left.
3295 		 *
3296 		 * Since it's a drive, not a metadevice, make no
3297 		 * assumptions about alignment.
3298 		 */
3299 		succeeded = meta_sp_enough_space(ONE_SOFT_PARTITION,
3300 		    sp_size, &extent_listp, SP_UNALIGNED);
3301 		if (succeeded == B_TRUE) {
3302 			number_of_possible_sps++;
3303 		}
3304 	}
3305 	if (extent_listp != NULL) {
3306 		meta_sp_list_free(&extent_listp);
3307 	}
3308 	return (number_of_possible_sps);
3309 }
3310 
3311 /*
3312  * FUNCTION:	meta_sp_get_possible_sp_size()
3313  * INPUT:	mdsetnamep - a reference to the mdsetname_t structure
3314  *			     for the set containing the device for
3315  *			     which the possible soft partition size
3316  *			     is to be returned
3317  *		mdnamep - a reference to the mdname_t of the device
3318  *			  for which the possible soft partition size
3319  *			  is to be returned
3320  *		number_of_sps - the desired number of soft partitions
3321  * OUTPUT:	blkcnt_t return value
3322  * RETURNS:	blkcnt_t - the possible soft partition size in blocks
3323  * PURPOSE:	returns the maximum possible size of each of a given number of
3324  *		soft partitions of equal size that can be created on a device
3325  */
3326 blkcnt_t
3327 meta_sp_get_possible_sp_size(
3328 	mdsetname_t	*mdsetnamep,
3329 	mdname_t	*mdnamep,
3330 	int		number_of_sps
3331 )
3332 {
3333 	blkcnt_t	free_blocks;
3334 	blkcnt_t	sp_size;
3335 	boolean_t	succeeded;
3336 
3337 	sp_size = 0;
3338 	if (number_of_sps > 0) {
3339 		free_blocks = meta_sp_get_free_space(mdsetnamep, mdnamep);
3340 		sp_size = free_blocks / number_of_sps;
3341 		succeeded = meta_sp_can_create_sps(mdsetnamep, mdnamep,
3342 		    number_of_sps, sp_size);
3343 		while ((succeeded == B_FALSE) && (sp_size > 0)) {
3344 			/*
3345 			 * To compensate for space that may have been
3346 			 * occupied by watermarks, reduce sp_size by a
3347 			 * number of blocks equal to the number of soft
3348 			 * partitions desired, and test again to see
3349 			 * whether the desired number of soft partitions
3350 			 * can be created.
3351 			 */
3352 			sp_size = sp_size - ((blkcnt_t)number_of_sps);
3353 			succeeded = meta_sp_can_create_sps(mdsetnamep, mdnamep,
3354 			    number_of_sps, sp_size);
3355 		}
3356 		if (sp_size < 0) {
3357 			sp_size = 0;
3358 		}
3359 	}
3360 	return (sp_size);
3361 }
3362 
3363 /*
3364  * FUNCTION:	meta_sp_get_possible_sp_size_on_drive()
3365  * INPUT:	mdsetnamep - a reference to the mdsetname_t structure
3366  *			     for the set containing the drive for
3367  *			     which the possible soft partition size
3368  *			     is to be returned
3369  *		mddrivenamep - a reference to the mddrivename_t of the drive
3370  *			       for which the possible soft partition size
3371  *			       is to be returned
3372  *		number_of_sps - the desired number of soft partitions
3373  * OUTPUT:	blkcnt_t return value
3374  * RETURNS:	blkcnt_t - the possible soft partition size in blocks
3375  * PURPOSE:	returns the maximum possible size of each of a given number of
3376  *		soft partitions of equal size that can be created on a drive
3377  *              if the entire drive is soft partitioned
3378  */
3379 blkcnt_t
3380 meta_sp_get_possible_sp_size_on_drive(
3381 	mdsetname_t	*mdsetnamep,
3382 	mddrivename_t	*mddrivenamep,
3383 	int		number_of_sps
3384 )
3385 {
3386 	blkcnt_t	free_blocks;
3387 	blkcnt_t	sp_size;
3388 	boolean_t	succeeded;
3389 
3390 	sp_size = 0;
3391 	if (number_of_sps > 0) {
3392 		free_blocks = meta_sp_get_free_space_on_drive(mdsetnamep,
3393 		    mddrivenamep);
3394 		sp_size = free_blocks / number_of_sps;
3395 		succeeded = meta_sp_can_create_sps_on_drive(mdsetnamep,
3396 		    mddrivenamep, number_of_sps, sp_size);
3397 		while ((succeeded == B_FALSE) && (sp_size > 0)) {
3398 			/*
3399 			 * To compensate for space that may have been
3400 			 * occupied by watermarks, reduce sp_size by a
3401 			 * number of blocks equal to the number of soft
3402 			 * partitions desired, and test again to see
3403 			 * whether the desired number of soft partitions
3404 			 * can be created.
3405 			 */
3406 			sp_size = sp_size - ((blkcnt_t)number_of_sps);
3407 			succeeded = meta_sp_can_create_sps_on_drive(mdsetnamep,
3408 			    mddrivenamep, number_of_sps, sp_size);
3409 		}
3410 		if (sp_size < 0) {
3411 			sp_size = 0;
3412 		}
3413 	}
3414 	return (sp_size);
3415 }
3416 
3417 /*
3418  * **************************************************************************
3419  *                  Unit Structure Manipulation Functions                   *
3420  * **************************************************************************
3421  */
3422 
3423 /*
3424  * FUNCTION:	meta_sp_fillextarray()
3425  * INPUT:	mp	- the unit structure to fill
3426  *		extlist	- the list of extents to fill with
3427  * OUTPUT:	none
3428  * RETURNS:	void
3429  * PURPOSE:	fills in the unit structure extent list with the extents
3430  *		specified by extlist.  Only extents in extlist with the
3431  *		EXTFLG_UPDATE flag are changed in the unit structure,
3432  *		and the index into the unit structure is the sequence
3433  *		number in the extent list.  After all of the nodes have
3434  *		been updated the virtual offsets in the unit structure
3435  *		are updated to reflect the new lengths.
3436  */
3437 static void
3438 meta_sp_fillextarray(
3439 	mp_unit_t	*mp,
3440 	sp_ext_node_t	*extlist
3441 )
3442 {
3443 	int	i;
3444 	sp_ext_node_t	*ext;
3445 	sp_ext_offset_t	curvoff = 0LL;
3446 
3447 	assert(mp != NULL);
3448 
3449 	/* go through the allocation list and fill in our unit structure */
3450 	for (ext = extlist; ext != NULL; ext = ext->ext_next) {
3451 		if ((ext->ext_type == EXTTYP_ALLOC) &&
3452 		    (ext->ext_flags & EXTFLG_UPDATE) != 0) {
3453 			mp->un_ext[ext->ext_seq].un_poff =
3454 			    ext->ext_offset + MD_SP_WMSIZE;
3455 			mp->un_ext[ext->ext_seq].un_len =
3456 			    ext->ext_length - MD_SP_WMSIZE;
3457 		}
3458 	}
3459 
3460 	for (i = 0; i < mp->un_numexts; i++) {
3461 		assert(mp->un_ext[i].un_poff != 0);
3462 		assert(mp->un_ext[i].un_len  != 0);
3463 		mp->un_ext[i].un_voff = curvoff;
3464 		curvoff += mp->un_ext[i].un_len;
3465 	}
3466 }
3467 
3468 /*
3469  * FUNCTION:	meta_sp_createunit()
3470  * INPUT:	np	- the name of the device to create a unit structure for
3471  *		compnp	- the name of the device the soft partition is on
3472  *		extlist	- the extent list to populate the new unit with
3473  *		numexts	- the number of extents in the extent list
3474  *		len	- the total size of the soft partition (sectors)
3475  *		status	- the initial status of the unit structure
3476  * OUTPUT:	ep	- return error pointer
3477  * RETURNS:	mp_unit_t * - the new unit structure.
3478  * PURPOSE:	allocates and fills in a new soft partition unit
3479  *		structure to be passed to the soft partitioning driver
3480  *		for creation.
3481  */
3482 static mp_unit_t *
3483 meta_sp_createunit(
3484 	mdname_t	*np,
3485 	mdname_t	*compnp,
3486 	sp_ext_node_t	*extlist,
3487 	int		numexts,
3488 	sp_ext_length_t	len,
3489 	sp_status_t	status,
3490 	md_error_t	*ep
3491 )
3492 {
3493 	mp_unit_t	*mp;
3494 	uint_t		ms_size;
3495 
3496 	ms_size = (sizeof (*mp) - sizeof (mp->un_ext[0])) +
3497 	    (numexts * sizeof (mp->un_ext[0]));
3498 
3499 	mp = Zalloc(ms_size);
3500 
3501 	/* fill in fields in common unit structure */
3502 	mp->c.un_type = MD_METASP;
3503 	mp->c.un_size = ms_size;
3504 	MD_SID(mp) = meta_getminor(np->dev);
3505 	mp->c.un_total_blocks = len;
3506 	mp->c.un_actual_tb = len;
3507 
3508 	/* set up geometry */
3509 	(void) meta_sp_setgeom(np, compnp, mp, ep);
3510 
3511 	/* if we're building on metadevice we can't parent */
3512 	if (metaismeta(compnp))
3513 		MD_CAPAB(mp) = MD_CANT_PARENT;
3514 	else
3515 		MD_CAPAB(mp) = MD_CAN_PARENT;
3516 
3517 	/* fill soft partition-specific fields */
3518 	mp->un_dev = compnp->dev;
3519 	mp->un_key = compnp->key;
3520 
3521 	/* mdname_t start_blk field is not 64-bit! */
3522 	mp->un_start_blk = (sp_ext_offset_t)compnp->start_blk;
3523 	mp->un_status = status;
3524 	mp->un_numexts = numexts;
3525 	mp->un_length = len;
3526 
3527 	/* fill in the extent array */
3528 	meta_sp_fillextarray(mp, extlist);
3529 
3530 	return (mp);
3531 }
3532 
3533 /*
3534  * FUNCTION:	meta_sp_updateunit()
3535  * INPUT:	np       - name structure for the metadevice being updated
3536  *		old_un	 - the original unit structure that is being updated
3537  *		extlist	 - the extent list to populate the new unit with
3538  *		grow_len - the amount by which the partition is being grown
3539  *		numexts	 - the number of extents in the extent list
3540  *		ep       - return error pointer
3541  * OUTPUT:	none
3542  * RETURNS:	mp_unit_t * - the updated unit structure
3543  * PURPOSE:	allocates and fills in a new soft partition unit structure to
3544  *		be passed to the soft partitioning driver for creation.  The
3545  *		old unit structure is first copied in, and then the updated
3546  *		extents are changed in the new unit structure.  This is
3547  *		typically used when the size of an existing unit is changed.
3548  */
3549 static mp_unit_t *
3550 meta_sp_updateunit(
3551 	mdname_t	*np,
3552 	mp_unit_t	*old_un,
3553 	sp_ext_node_t	*extlist,
3554 	sp_ext_length_t	grow_len,
3555 	int		numexts,
3556 	md_error_t	*ep
3557 )
3558 {
3559 	mp_unit_t	*new_un;
3560 	sp_ext_length_t	new_len;
3561 	uint_t		new_size;
3562 
3563 	assert(old_un != NULL);
3564 	assert(extlist != NULL);
3565 
3566 	/* allocate new unit structure and copy in old unit */
3567 	new_size = (sizeof (*old_un) - sizeof (old_un->un_ext[0])) +
3568 	    ((old_un->un_numexts + numexts) * sizeof (old_un->un_ext[0]));
3569 	new_len = old_un->un_length + grow_len;
3570 	new_un = Zalloc(new_size);
3571 	bcopy(old_un, new_un, old_un->c.un_size);
3572 
3573 	/* update size and geometry information */
3574 	new_un->c.un_size = new_size;
3575 	new_un->un_length = new_len;
3576 	new_un->c.un_total_blocks = new_len;
3577 	new_un->c.un_actual_tb = new_len;
3578 	if (meta_adjust_geom((md_unit_t *)new_un, np,
3579 	    old_un->c.un_wr_reinstruct, old_un->c.un_rd_reinstruct,
3580 	    0, ep) != 0) {
3581 		Free(new_un);
3582 		return (NULL);
3583 	}
3584 
3585 	/* update extent information */
3586 	new_un->un_numexts += numexts;
3587 
3588 	meta_sp_fillextarray(new_un, extlist);
3589 
3590 	return (new_un);
3591 }
3592 
3593 /*
3594  * FUNCTION:	meta_get_sp()
3595  * INPUT:	sp	- the set name for the device to get
3596  *		np	- the name of the device to get
3597  * OUTPUT:	ep	- return error pointer
3598  * RETURNS:	md_sp_t * - the XDR unit structure for the soft partition
3599  * PURPOSE:	interface to the rest of libmeta for fetching a unit structure
3600  *		for the named device.  Just a wrapper for meta_get_sp_common().
3601  */
3602 md_sp_t *
3603 meta_get_sp(
3604 	mdsetname_t	*sp,
3605 	mdname_t	*np,
3606 	md_error_t	*ep
3607 )
3608 {
3609 	return (meta_get_sp_common(sp, np, 0, ep));
3610 }
3611 
3612 /*
3613  * FUNCTION:	meta_get_sp_common()
3614  * INPUT:	sp	- the set name for the device to get
3615  *		np	- the name of the device to get
3616  *		fast	- whether to use the cache or not (NOT IMPLEMENTED!)
3617  * OUTPUT:	ep	- return error pointer
3618  * RETURNS:	md_sp_t * - the XDR unit structure for the soft partition,
3619  *			    NULL if np is not a soft partition
3620  * PURPOSE:	common routine for fetching a soft partition unit structure
3621  */
3622 md_sp_t *
3623 meta_get_sp_common(
3624 	mdsetname_t	*sp,
3625 	mdname_t	*np,
3626 	int		fast,
3627 	md_error_t	*ep
3628 )
3629 {
3630 	mddrivename_t	*dnp = np->drivenamep;
3631 	char		*miscname;
3632 	mp_unit_t	*mp;
3633 	md_sp_t		*msp;
3634 	int		i;
3635 
3636 	/* must have set */
3637 	assert(sp != NULL);
3638 
3639 	/* short circuit */
3640 	if (dnp->unitp != NULL) {
3641 		if (dnp->unitp->type != MD_METASP)
3642 			return (NULL);
3643 		return ((md_sp_t *)dnp->unitp);
3644 	}
3645 	/* get miscname and unit */
3646 	if ((miscname = metagetmiscname(np, ep)) == NULL)
3647 		return (NULL);
3648 
3649 	if (strcmp(miscname, MD_SP) != 0) {
3650 		(void) mdmderror(ep, MDE_NOT_SP, 0, np->cname);
3651 		return (NULL);
3652 	}
3653 
3654 	if ((mp = (mp_unit_t *)meta_get_mdunit(sp, np, ep)) == NULL)
3655 		return (NULL);
3656 
3657 	assert(mp->c.un_type == MD_METASP);
3658 
3659 	/* allocate soft partition */
3660 	msp = Zalloc(sizeof (*msp));
3661 
3662 	/* get the common information */
3663 	msp->common.namep = np;
3664 	msp->common.type = mp->c.un_type;
3665 	msp->common.state = mp->c.un_status;
3666 	msp->common.capabilities = mp->c.un_capabilities;
3667 	msp->common.parent = mp->c.un_parent;
3668 	msp->common.size = mp->c.un_total_blocks;
3669 	msp->common.user_flags = mp->c.un_user_flags;
3670 	msp->common.revision = mp->c.un_revision;
3671 
3672 	/* get soft partition information */
3673 	if ((msp->compnamep = metakeyname(&sp, mp->un_key, fast, ep)) == NULL)
3674 		goto out;
3675 
3676 	/*
3677 	 * Fill in the key and the start block.  Note that the start
3678 	 * block in the unit structure is 64 bits but the name pointer
3679 	 * only supports 32 bits.
3680 	 */
3681 	msp->compnamep->key = mp->un_key;
3682 	msp->compnamep->start_blk = mp->un_start_blk;
3683 
3684 	/* fill in status field */
3685 	msp->status = mp->un_status;
3686 
3687 	/* allocate the extents */
3688 	msp->ext.ext_val = Zalloc(mp->un_numexts * sizeof (*msp->ext.ext_val));
3689 	msp->ext.ext_len = mp->un_numexts;
3690 
3691 	/* do the extents for this soft partition */
3692 	for (i = 0; i < mp->un_numexts; i++) {
3693 		struct mp_ext	*mde = &mp->un_ext[i];
3694 		md_sp_ext_t	*extp = &msp->ext.ext_val[i];
3695 
3696 		extp->voff = mde->un_voff;
3697 		extp->poff = mde->un_poff;
3698 		extp->len = mde->un_len;
3699 	}
3700 
3701 	/* cleanup, return success */
3702 	Free(mp);
3703 	dnp->unitp = (md_common_t *)msp;
3704 	return (msp);
3705 
3706 out:
3707 	/* clean up and return error */
3708 	Free(mp);
3709 	Free(msp);
3710 	return (NULL);
3711 }
3712 
3713 
3714 /*
3715  * FUNCTION:	meta_init_sp()
3716  * INPUT:	spp	- the set name for the new device
3717  *		argc	- the remaining argument count for the metainit cmdline
3718  *		argv	- the remainder of the unparsed command line
3719  *		options	- global options parsed by metainit
3720  * OUTPUT:	ep	- return error pointer
3721  * RETURNS:	int	- -1 failure, 0 success
3722  * PURPOSE:	provides the command line parsing and name management overhead
3723  *		for creating a new soft partition.  Ultimately this calls
3724  *		meta_create_sp() which does the real work of allocating space
3725  *		for the new soft partition.
3726  */
3727 int
3728 meta_init_sp(
3729 	mdsetname_t	**spp,
3730 	int		argc,
3731 	char		*argv[],
3732 	mdcmdopts_t	options,
3733 	md_error_t	*ep
3734 )
3735 {
3736 	char		*compname = NULL;
3737 	mdname_t	*spcompnp = NULL;	/* name of component volume */
3738 	char		*devname = argv[0];	/* unit name */
3739 	mdname_t	*np = NULL;		/* name of soft partition */
3740 	md_sp_t		*msp = NULL;
3741 	int		c;
3742 	int		old_optind;
3743 	sp_ext_length_t	len = 0LL;
3744 	int		rval = -1;
3745 	uint_t		seq;
3746 	int		oflag;
3747 	int		failed;
3748 	mddrivename_t	*dnp = NULL;
3749 	sp_ext_length_t	alignment = 0LL;
3750 	sp_ext_node_t	*extlist = NULL;
3751 
3752 	assert(argc > 0);
3753 
3754 	/* expect sp name, -p, optional -e, compname, and size parameters */
3755 	/* grab soft partition name */
3756 	if ((np = metaname(spp, devname, META_DEVICE, ep)) == NULL)
3757 		goto out;
3758 
3759 	/* see if it exists already */
3760 	if (metagetmiscname(np, ep) != NULL) {
3761 		(void) mdmderror(ep, MDE_UNIT_ALREADY_SETUP,
3762 		    meta_getminor(np->dev), devname);
3763 		goto out;
3764 	} else if (! mdismderror(ep, MDE_UNIT_NOT_SETUP)) {
3765 		goto out;
3766 	} else {
3767 		mdclrerror(ep);
3768 	}
3769 	--argc, ++argv;
3770 
3771 	if (argc == 0)
3772 		goto syntax;
3773 
3774 	/* grab -p */
3775 	if (strcmp(argv[0], "-p") != 0)
3776 		goto syntax;
3777 	--argc, ++argv;
3778 
3779 	if (argc == 0)
3780 		goto syntax;
3781 
3782 	/* see if -e is there */
3783 	if (strcmp(argv[0], "-e") == 0) {
3784 		/* use the whole disk */
3785 		options |= MDCMD_USE_WHOLE_DISK;
3786 		--argc, ++argv;
3787 	}
3788 
3789 	if (argc == 0)
3790 		goto syntax;
3791 
3792 	/* get component name */
3793 	compname = Strdup(argv[0]);
3794 
3795 	if (options & MDCMD_USE_WHOLE_DISK) {
3796 		if ((dnp = metadrivename(spp, compname, ep)) == NULL) {
3797 			goto out;
3798 		}
3799 		if ((spcompnp = metaslicename(dnp, 0, ep)) == NULL) {
3800 			goto out;
3801 		}
3802 	} else if ((spcompnp = metaname(spp, compname, UNKNOWN, ep)) == NULL) {
3803 		goto out;
3804 	}
3805 	assert(*spp != NULL);
3806 
3807 	if (!(options & MDCMD_NOLOCK)) {
3808 		/* grab set lock */
3809 		if (meta_lock(*spp, TRUE, ep))
3810 			goto out;
3811 
3812 		if (meta_check_ownership(*spp, ep) != 0)
3813 			goto out;
3814 	}
3815 
3816 	/* allocate the soft partition */
3817 	msp = Zalloc(sizeof (*msp));
3818 
3819 	/* setup common */
3820 	msp->common.namep = np;
3821 	msp->common.type = MD_METASP;
3822 
3823 	compname = spcompnp->cname;
3824 
3825 	assert(spcompnp->rname != NULL);
3826 	--argc, ++argv;
3827 
3828 	if (argc == 0) {
3829 		goto syntax;
3830 	}
3831 
3832 	if (*argv[0] == '-') {
3833 		/*
3834 		 * parse any other command line options, this includes
3835 		 * the recovery options -o and -b. The special thing
3836 		 * with these options is that the len needs to be
3837 		 * kept track of otherwise when the geometry of the
3838 		 * "device" is built it will create an invalid geometry
3839 		 */
3840 		old_optind = optind = 0;
3841 		opterr = 0;
3842 		oflag = 0;
3843 		seq = 0;
3844 		failed = 0;
3845 		while ((c = getopt(argc, argv, "A:o:b:")) != -1) {
3846 			sp_ext_offset_t	offset;
3847 			sp_ext_length_t	length;
3848 			longlong_t	tmp_size;
3849 
3850 			switch (c) {
3851 			case 'A':	/* data alignment */
3852 				if (meta_sp_parsesizestring(optarg,
3853 				    &alignment) == -1) {
3854 					failed = 1;
3855 				}
3856 				break;
3857 			case 'o':	/* offset in the partition */
3858 				if (oflag == 1) {
3859 					failed = 1;
3860 				} else {
3861 					tmp_size = atoll(optarg);
3862 					if (tmp_size <= 0) {
3863 						failed = 1;
3864 					} else {
3865 						oflag = 1;
3866 						options |= MDCMD_DIRECT;
3867 
3868 						offset = tmp_size;
3869 					}
3870 				}
3871 
3872 				break;
3873 			case 'b':	/* number of blocks */
3874 				if (oflag == 0) {
3875 					failed = 1;
3876 				} else {
3877 					tmp_size = atoll(optarg);
3878 					if (tmp_size <= 0) {
3879 						failed = 1;
3880 					} else {
3881 						oflag = 0;
3882 
3883 						length = tmp_size;
3884 
3885 						/* we have a pair of values */
3886 						meta_sp_list_insert(*spp, np,
3887 						    &extlist, offset, length,
3888 						    EXTTYP_ALLOC, seq++,
3889 						    EXTFLG_UPDATE,
3890 						    meta_sp_cmp_by_offset);
3891 						len += length;
3892 					}
3893 				}
3894 
3895 				break;
3896 			default:
3897 				argc -= old_optind;
3898 				argv += old_optind;
3899 				goto options;
3900 			}
3901 
3902 			if (failed) {
3903 				argc -= old_optind;
3904 				argv += old_optind;
3905 				goto syntax;
3906 			}
3907 
3908 			old_optind = optind;
3909 		}
3910 		argc -= optind;
3911 		argv += optind;
3912 
3913 		/*
3914 		 * Must have matching pairs of -o and -b flags
3915 		 */
3916 		if (oflag != 0)
3917 			goto syntax;
3918 
3919 		/*
3920 		 * Can't specify both layout (indicated indirectly by
3921 		 * len being set by the -o/-b cases above) AND
3922 		 * alignment
3923 		 */
3924 		if ((len > 0LL) && (alignment > 0LL))
3925 			goto syntax;
3926 
3927 		/*
3928 		 * sanity check the allocation list
3929 		 */
3930 		if ((extlist != NULL) && meta_sp_list_overlaps(extlist))
3931 			goto syntax;
3932 	}
3933 
3934 	if (len == 0LL) {
3935 		if (argc == 0)
3936 			goto syntax;
3937 		if (meta_sp_parsesize(argv[0], &len) == -1)
3938 			goto syntax;
3939 		--argc, ++argv;
3940 	}
3941 
3942 	msp->ext.ext_val = Zalloc(sizeof (*msp->ext.ext_val));
3943 	msp->ext.ext_val->len = len;
3944 	msp->compnamep = spcompnp;
3945 
3946 	/* we should be at the end */
3947 	if (argc != 0)
3948 		goto syntax;
3949 
3950 	/* create soft partition */
3951 	if (meta_create_sp(*spp, msp, extlist, options, alignment, ep) != 0)
3952 		goto out;
3953 	rval = 0;
3954 
3955 	/* let em know */
3956 	if (options & MDCMD_PRINT) {
3957 		(void) printf(dgettext(TEXT_DOMAIN,
3958 		    "%s: Soft Partition is setup\n"),
3959 		    devname);
3960 		(void) fflush(stdout);
3961 	}
3962 	goto out;
3963 
3964 syntax:
3965 	/* syntax error */
3966 	rval = meta_cook_syntax(ep, MDE_SYNTAX, compname, argc, argv);
3967 	goto out;
3968 
3969 options:
3970 	/* options error */
3971 	rval = meta_cook_syntax(ep, MDE_OPTION, compname, argc, argv);
3972 	goto out;
3973 
3974 out:
3975 	if (msp != NULL) {
3976 		if (msp->ext.ext_val != NULL) {
3977 			Free(msp->ext.ext_val);
3978 		}
3979 		Free(msp);
3980 	}
3981 
3982 	return (rval);
3983 }
3984 
3985 /*
3986  * FUNCTION:	meta_free_sp()
3987  * INPUT:	msp	- the soft partition unit to free
3988  * OUTPUT:	none
3989  * RETURNS:	void
3990  * PURPOSE:	provides an interface from the rest of libmeta for freeing a
3991  *		soft partition unit
3992  */
3993 void
3994 meta_free_sp(md_sp_t *msp)
3995 {
3996 	Free(msp);
3997 }
3998 
3999 /*
4000  * FUNCTION:	meta_sp_issp()
4001  * INPUT:	sp	- the set name to check
4002  *		np	- the name to check
4003  * OUTPUT:	ep	- return error pointer
4004  * RETURNS:	int	- 0 means sp,np is a soft partition
4005  *			  1 means sp,np is not a soft partition
4006  * PURPOSE:	determines whether the given device is a soft partition
4007  *		device.  This is called by other metadevice check routines.
4008  */
4009 int
4010 meta_sp_issp(
4011 	mdsetname_t	*sp,
4012 	mdname_t	*np,
4013 	md_error_t	*ep
4014 )
4015 {
4016 	if (meta_get_sp_common(sp, np, 0, ep) == NULL)
4017 		return (1);
4018 
4019 	return (0);
4020 }
4021 
4022 /*
4023  * FUNCTION:	meta_check_sp()
4024  * INPUT:	sp	- the set name to check
4025  *		msp	- the unit structure to check
4026  *		options	- creation options
4027  * OUTPUT:	repart_options - options to be passed to
4028  *				meta_repartition_drive()
4029  *		ep	- return error pointer
4030  * RETURNS:	int	-  0 ok to create on this component
4031  *			  -1 error or not ok to create on this component
4032  * PURPOSE:	Checks to determine whether the rules for creation of
4033  *		soft partitions allow creation of a soft partition on
4034  *		the device described by the mdname_t structure referred
4035  *		to by msp->compnamep.
4036  *
4037  *		NOTE: Does NOT check to determine whether the extents
4038  *		      described in the md_sp_t structure referred to by
4039  *		      msp will fit on the device described by the mdname_t
4040  *		      structure located at msp->compnamep.
4041  */
4042 static int
4043 meta_check_sp(
4044 	mdsetname_t	*sp,
4045 	md_sp_t		*msp,
4046 	mdcmdopts_t	options,
4047 	int		*repart_options,
4048 	md_error_t	*ep
4049 )
4050 {
4051 	md_common_t	*mdp;
4052 	mdname_t	*compnp = msp->compnamep;
4053 	uint_t		slice;
4054 	mddrivename_t	*dnp;
4055 	mdname_t	*slicenp;
4056 	mdvtoc_t	*vtocp;
4057 
4058 	/* make sure it is in the set */
4059 	if (meta_check_inset(sp, compnp, ep) != 0)
4060 		return (-1);
4061 
4062 	if ((options & MDCMD_USE_WHOLE_DISK) != 0) {
4063 		uint_t	rep_slice;
4064 
4065 		/*
4066 		 * check to make sure we can partition this drive.
4067 		 * we cannot continue if any of the following are
4068 		 * true:
4069 		 * The drive is a metadevice.
4070 		 * The drive contains a mounted slice.
4071 		 * The drive contains a slice being swapped to.
4072 		 * The drive contains slices which are part of other
4073 		 * metadevices.
4074 		 * The drive contains a metadb.
4075 		 */
4076 		if (metaismeta(compnp))
4077 			return (mddeverror(ep, MDE_IS_META, compnp->dev,
4078 			    compnp->cname));
4079 
4080 		assert(compnp->drivenamep != NULL);
4081 
4082 		/*
4083 		 * ensure that we have slice 0 since the disk will be
4084 		 * repartitioned in the USE_WHOLE_DISK case.  this check
4085 		 * is redundant unless the user incorrectly specifies a
4086 		 * a fully qualified drive AND slice name (i.e.,
4087 		 * /dev/dsk/cXtXdXsX), which will be incorrectly
4088 		 * recognized as a drive name by the metaname code.
4089 		 */
4090 
4091 		if ((vtocp = metagetvtoc(compnp, FALSE, &slice, ep)) == NULL)
4092 			return (-1);
4093 		if (slice != MD_SLICE0)
4094 			return (mderror(ep, MDE_NOT_DRIVENAME, compnp->cname));
4095 
4096 		dnp = compnp->drivenamep;
4097 		if (meta_replicaslice(dnp, &rep_slice, ep) != 0)
4098 			return (-1);
4099 
4100 		for (slice = 0; slice < vtocp->nparts; slice++) {
4101 
4102 			/* only check if the slice really exists */
4103 			if (vtocp->parts[slice].size == 0)
4104 				continue;
4105 
4106 			slicenp = metaslicename(dnp, slice, ep);
4107 			if (slicenp == NULL)
4108 				return (-1);
4109 
4110 			/* check to ensure that it is not already in use */
4111 			if (meta_check_inuse(sp,
4112 			    slicenp, MDCHK_INUSE, ep) != 0) {
4113 				return (-1);
4114 			}
4115 
4116 			/*
4117 			 * Up to this point, tests are applied to all
4118 			 * slices uniformly.
4119 			 */
4120 
4121 			if (slice == rep_slice) {
4122 				/*
4123 				 * Tests inside the body of this
4124 				 * conditional are applied only to
4125 				 * slice seven.
4126 				 */
4127 				if (meta_check_inmeta(sp, slicenp,
4128 				    options | MDCHK_ALLOW_MDDB |
4129 				    MDCHK_ALLOW_REPSLICE, 0, -1, ep) != 0)
4130 					return (-1);
4131 
4132 				/*
4133 				 * For slice seven, a metadb is NOT an
4134 				 * automatic failure. It merely means
4135 				 * that we're not allowed to muck
4136 				 * about with the partitioning of that
4137 				 * slice.  We indicate this by masking
4138 				 * in the MD_REPART_LEAVE_REP flag.
4139 				 */
4140 				if (metahasmddb(sp, slicenp, ep)) {
4141 					assert(repart_options !=
4142 					    NULL);
4143 					*repart_options |=
4144 					    MD_REPART_LEAVE_REP;
4145 				}
4146 
4147 				/*
4148 				 * Skip the remaining tests for slice
4149 				 * seven
4150 				 */
4151 				continue;
4152 			}
4153 
4154 			/*
4155 			 * Tests below this point will be applied to
4156 			 * all slices EXCEPT for the replica slice.
4157 			 */
4158 
4159 
4160 			/* check if component is in a metadevice */
4161 			if (meta_check_inmeta(sp, slicenp, options, 0,
4162 			    -1, ep) != 0)
4163 				return (-1);
4164 
4165 			/* check to see if component has a metadb */
4166 			if (metahasmddb(sp, slicenp, ep))
4167 				return (mddeverror(ep, MDE_HAS_MDDB,
4168 				    slicenp->dev, slicenp->cname));
4169 		}
4170 		/*
4171 		 * This should be all of the testing necessary when
4172 		 * the MDCMD_USE_WHOLE_DISK flag is set; the rest of
4173 		 * meta_check_sp() is oriented towards component
4174 		 * arguments instead of disks.
4175 		 */
4176 		goto meta_check_sp_ok;
4177 
4178 	}
4179 
4180 	/* check to ensure that it is not already in use */
4181 	if (meta_check_inuse(sp, compnp, MDCHK_INUSE, ep) != 0) {
4182 		return (-1);
4183 	}
4184 
4185 	if (!metaismeta(compnp)) {	/* handle non-metadevices */
4186 
4187 		/*
4188 		 * The component can have one or more soft partitions on it
4189 		 * already, but can't be part of any other type of metadevice,
4190 		 * so if it is used for a metadevice, but the metadevice
4191 		 * isn't a soft partition, return failure.
4192 		 */
4193 
4194 		if (meta_check_inmeta(sp, compnp, options, 0, -1, ep) != 0 &&
4195 		    meta_check_insp(sp, compnp, 0, -1, ep) == 0) {
4196 			return (-1);
4197 		}
4198 	} else {			/* handle metadevices */
4199 		/* get underlying unit & check capabilities */
4200 		if ((mdp = meta_get_unit(sp, compnp, ep)) == NULL)
4201 			return (-1);
4202 
4203 		if ((! (mdp->capabilities & MD_CAN_PARENT)) ||
4204 		    (! (mdp->capabilities & MD_CAN_SP)))
4205 			return (mdmderror(ep, MDE_INVAL_UNIT,
4206 			    meta_getminor(compnp->dev), compnp->cname));
4207 	}
4208 
4209 meta_check_sp_ok:
4210 	mdclrerror(ep);
4211 	return (0);
4212 }
4213 
4214 /*
4215  * FUNCTION:	meta_create_sp()
4216  * INPUT:	sp	- the set name to create in
4217  *		msp	- the unit structure to create
4218  *		oblist	- an optional list of requested extents (-o/-b options)
4219  *		options	- creation options
4220  *		alignment - data alignment
4221  * OUTPUT:	ep	- return error pointer
4222  * RETURNS:	int	-  0 success, -1 error
4223  * PURPOSE:	does most of the work for creating a soft partition.  If
4224  *		metainit -p -e was used, first partition the drive.  Then
4225  *		create an extent list based on the existing soft partitions
4226  *		and assume all space not used by them is free.  Storage for
4227  *		the new soft partition is allocated from the free extents
4228  *		based on the length specified on the command line or the
4229  *		oblist passed in.  The unit structure is then committed and
4230  *		the watermarks are updated.  Finally, the status is changed to
4231  *		Okay and the process is complete.
4232  */
4233 static int
4234 meta_create_sp(
4235 	mdsetname_t	*sp,
4236 	md_sp_t		*msp,
4237 	sp_ext_node_t	*oblist,
4238 	mdcmdopts_t	options,
4239 	sp_ext_length_t	alignment,
4240 	md_error_t	*ep
4241 )
4242 {
4243 	mdname_t	*np = msp->common.namep;
4244 	mdname_t	*compnp = msp->compnamep;
4245 	mp_unit_t	*mp = NULL;
4246 	mdnamelist_t	*keynlp = NULL, *spnlp = NULL;
4247 	md_set_params_t	set_params;
4248 	int		rval = -1;
4249 	diskaddr_t	comp_size;
4250 	diskaddr_t	sp_start;
4251 	sp_ext_node_t	*extlist = NULL;
4252 	int		numexts = 0;	/* number of extents */
4253 	int		count = 0;
4254 	int		committed = 0;
4255 	int		repart_options = MD_REPART_FORCE;
4256 	int		create_flag = MD_CRO_32BIT;
4257 	int		mn_set_master = 0;
4258 
4259 	md_set_desc	*sd;
4260 	md_set_mmown_params_t	*ownpar = NULL;
4261 	int		comp_is_mirror = 0;
4262 
4263 	/* validate soft partition */
4264 	if (meta_check_sp(sp, msp, options, &repart_options, ep) != 0)
4265 		return (-1);
4266 
4267 	if ((options & MDCMD_USE_WHOLE_DISK) != 0) {
4268 		if ((options & MDCMD_DOIT) != 0) {
4269 			if (meta_repartition_drive(sp,
4270 			    compnp->drivenamep,
4271 			    repart_options,
4272 			    NULL, /* Don't return the VTOC */
4273 			    ep) != 0)
4274 
4275 				return (-1);
4276 		} else {
4277 			/*
4278 			 * If -n and -e are both specified, it doesn't make
4279 			 * sense to continue without actually partitioning
4280 			 * the drive.
4281 			 */
4282 			return (0);
4283 		}
4284 	}
4285 
4286 	/* populate the start_blk field of the component name */
4287 	if ((sp_start = meta_sp_get_start(sp, compnp, ep)) ==
4288 	    MD_DISKADDR_ERROR) {
4289 		rval = -1;
4290 		goto out;
4291 	}
4292 
4293 	if (options & MDCMD_DOIT) {
4294 		/* store name in namespace */
4295 		if (add_key_name(sp, compnp, &keynlp, ep) != 0) {
4296 			rval = -1;
4297 			goto out;
4298 		}
4299 	}
4300 
4301 	/*
4302 	 * Get a list of the soft partitions that currently reside on
4303 	 * the component.  We should ALWAYS force reload the cache,
4304 	 * because if this is a single creation, there will not BE a
4305 	 * cached list, and if we're using the md.tab, we must rebuild
4306 	 * the list because it won't contain the previous (if any)
4307 	 * soft partition.
4308 	 */
4309 	count = meta_sp_get_by_component(sp, compnp, &spnlp, 1, ep);
4310 	if (count < 0) {
4311 		/* error occured */
4312 		rval = -1;
4313 		goto out;
4314 	}
4315 
4316 	/*
4317 	 * get the size of the underlying device.  if the size is smaller
4318 	 * than or equal to the watermark size, we know there isn't
4319 	 * enough space.
4320 	 */
4321 	if ((comp_size = metagetsize(compnp, ep)) == MD_DISKADDR_ERROR) {
4322 		rval = -1;
4323 		goto out;
4324 	} else if (comp_size <= MD_SP_WMSIZE) {
4325 		(void) mdmderror(ep, MDE_SP_NOSPACE, 0, compnp->cname);
4326 		rval = -1;
4327 		goto out;
4328 	}
4329 	/*
4330 	 * seed extlist with reserved space at the beginning of the volume and
4331 	 * enough space for the end watermark.  The end watermark always gets
4332 	 * updated, but if the underlying device changes size it may not be
4333 	 * pointed to until the extent before it is updated.  Since the
4334 	 * end of the reserved space is where the first watermark starts,
4335 	 * the reserved extent should never be marked for updating.
4336 	 */
4337 
4338 	meta_sp_list_insert(NULL, NULL, &extlist,
4339 	    0ULL, sp_start, EXTTYP_RESERVED, 0, 0, meta_sp_cmp_by_offset);
4340 	meta_sp_list_insert(NULL, NULL, &extlist,
4341 	    (sp_ext_offset_t)(comp_size - MD_SP_WMSIZE), MD_SP_WMSIZE,
4342 	    EXTTYP_END, 0, EXTFLG_UPDATE, meta_sp_cmp_by_offset);
4343 
4344 	if (meta_sp_extlist_from_namelist(sp, spnlp, &extlist, ep) == -1) {
4345 		rval = -1;
4346 		goto out;
4347 	}
4348 
4349 	metafreenamelist(spnlp);
4350 
4351 	if (getenv(META_SP_DEBUG)) {
4352 		meta_sp_debug("meta_create_sp: list of used extents:\n");
4353 		meta_sp_list_dump(extlist);
4354 	}
4355 
4356 	meta_sp_list_freefill(&extlist, metagetsize(compnp, ep));
4357 
4358 	/* get extent list from -o/-b options or from free space */
4359 	if (options & MDCMD_DIRECT) {
4360 		if (getenv(META_SP_DEBUG)) {
4361 			meta_sp_debug("meta_create_sp: Dumping -o/-b list:\n");
4362 			meta_sp_list_dump(oblist);
4363 		}
4364 
4365 		numexts = meta_sp_alloc_by_list(sp, np, &extlist, oblist);
4366 		if (numexts == -1) {
4367 			(void) mdmderror(ep, MDE_SP_OVERLAP, 0, np->cname);
4368 			rval = -1;
4369 			goto out;
4370 		}
4371 	} else {
4372 		numexts = meta_sp_alloc_by_len(sp, np, &extlist,
4373 		    &msp->ext.ext_val->len, 0LL, (alignment > 0) ? alignment :
4374 		    meta_sp_get_default_alignment(sp, compnp, ep));
4375 		if (numexts == -1) {
4376 			(void) mdmderror(ep, MDE_SP_NOSPACE, 0, np->cname);
4377 			rval = -1;
4378 			goto out;
4379 		}
4380 	}
4381 
4382 	assert(extlist != NULL);
4383 
4384 	/* create soft partition */
4385 	mp = meta_sp_createunit(msp->common.namep, msp->compnamep,
4386 	    extlist, numexts, msp->ext.ext_val->len, MD_SP_CREATEPEND, ep);
4387 
4388 	create_flag = meta_check_devicesize(mp->c.un_total_blocks);
4389 
4390 	/* if we're not doing anything (metainit -n), return success */
4391 	if (! (options & MDCMD_DOIT)) {
4392 		rval = 0;	/* success */
4393 		goto out;
4394 	}
4395 
4396 	(void) memset(&set_params, 0, sizeof (set_params));
4397 
4398 	if (create_flag == MD_CRO_64BIT) {
4399 		mp->c.un_revision |= MD_64BIT_META_DEV;
4400 		set_params.options = MD_CRO_64BIT;
4401 	} else {
4402 		mp->c.un_revision &= ~MD_64BIT_META_DEV;
4403 		set_params.options = MD_CRO_32BIT;
4404 	}
4405 
4406 	if (getenv(META_SP_DEBUG)) {
4407 		meta_sp_debug("meta_create_sp: printing unit structure\n");
4408 		meta_sp_printunit(mp);
4409 	}
4410 
4411 	/*
4412 	 * Check to see if we're trying to create a partition on a mirror. If so
4413 	 * we may have to enforce an ownership change before writing the
4414 	 * watermark out.
4415 	 */
4416 	if (metaismeta(compnp)) {
4417 		char *miscname;
4418 
4419 		miscname = metagetmiscname(compnp, ep);
4420 		if (miscname != NULL)
4421 			comp_is_mirror = (strcmp(miscname, MD_MIRROR) == 0);
4422 		else
4423 			comp_is_mirror = 0;
4424 	} else {
4425 		comp_is_mirror = 0;
4426 	}
4427 
4428 	/*
4429 	 * For a multi-node environment we have to ensure that the master
4430 	 * node owns an underlying mirror before we issue the MD_IOCSET ioctl.
4431 	 * If the master does not own the device we will deadlock as the
4432 	 * implicit write of the watermarks (in sp_ioctl.c) will cause an
4433 	 * ownership change that will block as the MD_IOCSET is still in
4434 	 * progress. To close this window we force an owner change to occur
4435 	 * before issuing the MD_IOCSET. We cannot simply open the device and
4436 	 * write to it as this will only work for the first soft-partition
4437 	 * creation.
4438 	 */
4439 
4440 	if (comp_is_mirror && !metaislocalset(sp)) {
4441 
4442 		if ((sd = metaget_setdesc(sp, ep)) == NULL) {
4443 			rval = -1;
4444 			goto out;
4445 		}
4446 		if (MD_MNSET_DESC(sd) && sd->sd_mn_am_i_master) {
4447 			mn_set_master = 1;
4448 		}
4449 	}
4450 
4451 	set_params.mnum = MD_SID(mp);
4452 	set_params.size = mp->c.un_size;
4453 	set_params.mdp = (uintptr_t)mp;
4454 	MD_SETDRIVERNAME(&set_params, MD_SP, MD_MIN2SET(set_params.mnum));
4455 
4456 	/* first phase of commit. */
4457 	if (metaioctl(MD_IOCSET, &set_params, &set_params.mde,
4458 	    np->cname) != 0) {
4459 		(void) mdstealerror(ep, &set_params.mde);
4460 		rval = -1;
4461 		goto out;
4462 	}
4463 
4464 	/* we've successfully committed the record */
4465 	committed = 1;
4466 
4467 	/* write watermarks */
4468 	/*
4469 	 * Special-case for Multi-node sets. As we now have a distributed DRL
4470 	 * update mechanism, we _will_ hit the ioctl-within-ioctl deadlock case
4471 	 * unless we use a 'special' MN-capable ioctl to stage the watermark
4472 	 * update. This only affects the master-node in an MN set.
4473 	 */
4474 	if (mn_set_master) {
4475 		if (meta_mn_sp_update_wm(sp, msp, extlist, ep) < 0) {
4476 			rval = -1;
4477 			goto out;
4478 		}
4479 	} else {
4480 		if (meta_sp_update_wm(sp, msp, extlist, ep) < 0) {
4481 			rval = -1;
4482 			goto out;
4483 		}
4484 	}
4485 
4486 	/* second phase of commit, set status to MD_SP_OK */
4487 	if (meta_sp_setstatus(sp, &(MD_SID(mp)), 1, MD_SP_OK, ep) < 0) {
4488 		rval = -1;
4489 		goto out;
4490 	}
4491 	rval = 0;
4492 out:
4493 	Free(mp);
4494 	if (ownpar)
4495 		Free(ownpar);
4496 
4497 	if (extlist != NULL)
4498 		meta_sp_list_free(&extlist);
4499 
4500 	if (rval != 0 && keynlp != NULL && committed != 1)
4501 		(void) del_key_names(sp, keynlp, NULL);
4502 
4503 	metafreenamelist(keynlp);
4504 
4505 	return (rval);
4506 }
4507 
4508 /*
4509  * **************************************************************************
4510  *                      Reset (metaclear) Functions                         *
4511  * **************************************************************************
4512  */
4513 
4514 /*
4515  * FUNCTION:	meta_sp_reset_common()
4516  * INPUT:	sp	- the set name of the device to reset
4517  *		np	- the name of the device to reset
4518  *		msp	- the unit structure to reset
4519  *		options	- metaclear options
4520  * OUTPUT:	ep	- return error pointer
4521  * RETURNS:	int	-  0 success, -1 error
4522  * PURPOSE:	"resets", or more accurately deletes, the soft partition
4523  *		specified.  First the state is set to "deleting" and then the
4524  *		watermarks are all cleared out.  Once the watermarks have been
4525  *		updated, the unit structure is deleted from the metadb.
4526  */
4527 static int
4528 meta_sp_reset_common(
4529 	mdsetname_t	*sp,
4530 	mdname_t	*np,
4531 	md_sp_t		*msp,
4532 	md_sp_reset_t	reset_params,
4533 	mdcmdopts_t	options,
4534 	md_error_t	*ep
4535 )
4536 {
4537 	char	*miscname;
4538 	int	rval = -1;
4539 	int	is_open = 0;
4540 
4541 	/* make sure that nobody owns us */
4542 	if (MD_HAS_PARENT(msp->common.parent))
4543 		return (mdmderror(ep, MDE_IN_USE, meta_getminor(np->dev),
4544 		    np->cname));
4545 
4546 	/* make sure that the soft partition isn't open */
4547 	if ((is_open = meta_isopen(sp, np, ep, options)) < 0)
4548 		return (-1);
4549 	else if (is_open)
4550 		return (mdmderror(ep, MDE_IS_OPEN, meta_getminor(np->dev),
4551 		    np->cname));
4552 
4553 	/* get miscname */
4554 	if ((miscname = metagetmiscname(np, ep)) == NULL)
4555 		return (-1);
4556 
4557 	/* fill in reset params */
4558 	MD_SETDRIVERNAME(&reset_params, miscname, sp->setno);
4559 	reset_params.mnum = meta_getminor(np->dev);
4560 	reset_params.force = (options & MDCMD_FORCE) ? 1 : 0;
4561 
4562 	/*
4563 	 * clear soft partition - phase one.
4564 	 * place the soft partition into the "delete pending" state.
4565 	 */
4566 	if (meta_sp_setstatus(sp, &reset_params.mnum, 1, MD_SP_DELPEND, ep) < 0)
4567 		return (-1);
4568 
4569 	/*
4570 	 * Now clear the watermarks.  If the force flag is specified,
4571 	 * ignore any errors writing the watermarks and delete the unit
4572 	 * structure anyway.  An error may leave the on-disk format in a
4573 	 * corrupt state.  If force is not specified and we fail here,
4574 	 * the soft partition will remain in the "delete pending" state.
4575 	 */
4576 	if ((meta_sp_clear_wm(sp, msp, ep) < 0) &&
4577 	    ((options & MDCMD_FORCE) == 0))
4578 		goto out;
4579 
4580 	/*
4581 	 * clear soft partition - phase two.
4582 	 * the driver removes the soft partition from the metadb and
4583 	 * zeros out incore version.
4584 	 */
4585 	if (metaioctl(MD_IOCRESET, &reset_params,
4586 	    &reset_params.mde, np->cname) != 0) {
4587 		(void) mdstealerror(ep, &reset_params.mde);
4588 		goto out;
4589 	}
4590 
4591 	/*
4592 	 * Wait for the /dev to be cleaned up. Ignore the return
4593 	 * value since there's not much we can do.
4594 	 */
4595 	(void) meta_update_devtree(meta_getminor(np->dev));
4596 
4597 	rval = 0;	/* success */
4598 
4599 	if (options & MDCMD_PRINT) {
4600 		(void) printf(dgettext(TEXT_DOMAIN,
4601 		    "%s: Soft Partition is cleared\n"),
4602 		    np->cname);
4603 		(void) fflush(stdout);
4604 	}
4605 
4606 	/*
4607 	 * if told to recurse and on a metadevice, then attempt to
4608 	 * clear the subdevices.  Indicate failure if the clear fails.
4609 	 */
4610 	if ((options & MDCMD_RECURSE) &&
4611 	    (metaismeta(msp->compnamep)) &&
4612 	    (meta_reset_by_name(sp, msp->compnamep, options, ep) != 0))
4613 		rval = -1;
4614 
4615 out:
4616 	meta_invalidate_name(np);
4617 	return (rval);
4618 }
4619 
4620 /*
4621  * FUNCTION:	meta_sp_reset()
4622  * INPUT:	sp	- the set name of the device to reset
4623  *		np	- the name of the device to reset
4624  *		options	- metaclear options
4625  * OUTPUT:	ep	- return error pointer
4626  * RETURNS:	int	-  0 success, -1 error
4627  * PURPOSE:	provides the entry point to the rest of libmeta for deleting a
4628  *		soft partition.  If np is NULL, then soft partitions are
4629  *		all deleted at the current level and then recursively deleted.
4630  *		Otherwise, if a name is specified either directly or as a
4631  *		result of a recursive operation, it deletes only that name.
4632  *		Since something sitting under a soft partition may be parented
4633  *		to it, we have to reparent that other device to another soft
4634  *		partition on the same component if we're deleting the one it's
4635  *		parented to.
4636  */
4637 int
4638 meta_sp_reset(
4639 	mdsetname_t	*sp,
4640 	mdname_t	*np,
4641 	mdcmdopts_t	options,
4642 	md_error_t	*ep
4643 )
4644 {
4645 	md_sp_t		*msp;
4646 	int		rval = -1;
4647 	mdnamelist_t	*spnlp = NULL, *nlp = NULL;
4648 	md_sp_reset_t	reset_params;
4649 	int		num_sp;
4650 
4651 	assert(sp != NULL);
4652 
4653 	/* reset/delete all soft paritions */
4654 	if (np == NULL) {
4655 		/*
4656 		 * meta_reset_all sets MDCMD_RECURSE, but this behavior
4657 		 * is incorrect for soft partitions.  We want to clear
4658 		 * all soft partitions at a particular level in the
4659 		 * metadevice stack before moving to the next level.
4660 		 * Thus, we clear MDCMD_RECURSE from the options.
4661 		 */
4662 		options &= ~MDCMD_RECURSE;
4663 
4664 		/* for each soft partition */
4665 		rval = 0;
4666 		if (meta_get_sp_names(sp, &spnlp, 0, ep) < 0)
4667 			rval = -1;
4668 
4669 		for (nlp = spnlp; (nlp != NULL); nlp = nlp->next) {
4670 			np = nlp->namep;
4671 			if ((msp = meta_get_sp(sp, np, ep)) == NULL) {
4672 				rval = -1;
4673 				break;
4674 			}
4675 			/*
4676 			 * meta_reset_all calls us twice to get soft
4677 			 * partitions at the top and bottom of the stack.
4678 			 * thus, if we have a parent, we'll get deleted
4679 			 * on the next call.
4680 			 */
4681 			if (MD_HAS_PARENT(msp->common.parent))
4682 				continue;
4683 			/*
4684 			 * If this is a multi-node set, we send a series
4685 			 * of individual metaclear commands.
4686 			 */
4687 			if (meta_is_mn_set(sp, ep)) {
4688 				if (meta_mn_send_metaclear_command(sp,
4689 				    np->cname, options, 0, ep) != 0) {
4690 					rval = -1;
4691 					break;
4692 				}
4693 			} else {
4694 				if (meta_sp_reset(sp, np, options, ep) != 0) {
4695 					rval = -1;
4696 					break;
4697 				}
4698 			}
4699 		}
4700 		/* cleanup return status */
4701 		metafreenamelist(spnlp);
4702 		return (rval);
4703 	}
4704 
4705 	/* check the name */
4706 	if (metachkmeta(np, ep) != 0)
4707 		return (-1);
4708 
4709 	/* get the unit structure */
4710 	if ((msp = meta_get_sp(sp, np, ep)) == NULL)
4711 		return (-1);
4712 
4713 	/* clear out reset parameters */
4714 	(void) memset(&reset_params, 0, sizeof (reset_params));
4715 
4716 	/* if our child is a metadevice, we need to deparent/reparent it */
4717 	if (metaismeta(msp->compnamep)) {
4718 		/* get sp's on this component */
4719 		if ((num_sp = meta_sp_get_by_component(sp, msp->compnamep,
4720 		    &spnlp, 1, ep)) <= 0)
4721 			/* no sp's on this device.  error! */
4722 			return (-1);
4723 		else if (num_sp == 1)
4724 			/* last sp on this device, so we deparent */
4725 			reset_params.new_parent = MD_NO_PARENT;
4726 		else {
4727 			/* have to reparent this metadevice */
4728 			for (nlp = spnlp; nlp != NULL; nlp = nlp->next) {
4729 				if (meta_getminor(nlp->namep->dev) ==
4730 				    meta_getminor(np->dev))
4731 					continue;
4732 				/*
4733 				 * this isn't the softpart we are deleting,
4734 				 * so use this device as the new parent.
4735 				 */
4736 				reset_params.new_parent =
4737 				    meta_getminor(nlp->namep->dev);
4738 				break;
4739 			}
4740 		}
4741 		metafreenamelist(spnlp);
4742 	}
4743 
4744 	if (meta_sp_reset_common(sp, np, msp, reset_params, options, ep) != 0)
4745 		return (-1);
4746 
4747 	return (0);
4748 }
4749 
4750 /*
4751  * FUNCTION:	meta_sp_reset_component()
4752  * INPUT:	sp	- the set name of the device to reset
4753  *		name	- the string name of the device to reset
4754  *		options	- metaclear options
4755  * OUTPUT:	ep	- return error pointer
4756  * RETURNS:	int	-  0 success, -1 error
4757  * PURPOSE:	provides the ability to delete all soft partitions on a
4758  *		specified device (metaclear -p).  It first gets all of the
4759  *		soft partitions on the component and then deletes each one
4760  *		individually.
4761  */
4762 int
4763 meta_sp_reset_component(
4764 	mdsetname_t	*sp,
4765 	char		*name,
4766 	mdcmdopts_t	options,
4767 	md_error_t	*ep
4768 )
4769 {
4770 	mdname_t	*compnp, *np;
4771 	mdnamelist_t	*spnlp = NULL;
4772 	mdnamelist_t	*nlp = NULL;
4773 	md_sp_t		*msp;
4774 	int		count;
4775 	md_sp_reset_t	reset_params;
4776 
4777 	if ((compnp = metaname(&sp, name, UNKNOWN, ep)) == NULL)
4778 		return (-1);
4779 
4780 	/* If we're starting out with no soft partitions, it's an error */
4781 	count = meta_sp_get_by_component(sp, compnp, &spnlp, 1, ep);
4782 	if (count == 0)
4783 		return (mdmderror(ep, MDE_SP_NOSP, 0, compnp->cname));
4784 	else if (count < 0)
4785 		return (-1);
4786 
4787 	/*
4788 	 * clear all soft partitions on this component.
4789 	 * NOTE: we reparent underlying metadevices as we go so that
4790 	 * things stay sane.  Also, if we encounter an error, we stop
4791 	 * and go no further in case recovery might be needed.
4792 	 */
4793 	for (nlp = spnlp; nlp != NULL; nlp = nlp->next) {
4794 		/* clear out reset parameters */
4795 		(void) memset(&reset_params, 0, sizeof (reset_params));
4796 
4797 		/* check the name */
4798 		np = nlp->namep;
4799 
4800 		if (metachkmeta(np, ep) != 0) {
4801 			metafreenamelist(spnlp);
4802 			return (-1);
4803 		}
4804 
4805 		/* get the unit structure */
4806 		if ((msp = meta_get_sp(sp, np, ep)) == NULL) {
4807 			metafreenamelist(spnlp);
4808 			return (-1);
4809 		}
4810 
4811 		/* have to deparent/reparent metadevices */
4812 		if (metaismeta(compnp)) {
4813 			if (nlp->next == NULL)
4814 				reset_params.new_parent = MD_NO_PARENT;
4815 			else
4816 				reset_params.new_parent =
4817 				    meta_getminor(spnlp->next->namep->dev);
4818 		}
4819 
4820 		/* clear soft partition */
4821 		if (meta_sp_reset_common(sp, np, msp, reset_params,
4822 		    options, ep) < 0) {
4823 			metafreenamelist(spnlp);
4824 			return (-1);
4825 		}
4826 	}
4827 	metafreenamelist(spnlp);
4828 	return (0);
4829 }
4830 
4831 /*
4832  * **************************************************************************
4833  *                      Grow (metattach) Functions                          *
4834  * **************************************************************************
4835  */
4836 
4837 /*
4838  * FUNCTION:	meta_sp_attach()
4839  * INPUT:	sp	- the set name of the device to attach to
4840  *		np	- the name of the device to attach to
4841  *		addsize	- the unparsed string holding the amount of space to add
4842  *		options	- metattach options
4843  *		alignment - data alignment
4844  * OUTPUT:	ep	- return error pointer
4845  * RETURNS:	int	-  0 success, -1 error
4846  * PURPOSE:	grows a soft partition by reading in the existing unit
4847  *		structure and setting its state to Growing, allocating more
4848  *		space (similar to meta_create_sp()), updating the watermarks,
4849  *		and then writing out the new unit structure in the Okay state.
4850  */
4851 int
4852 meta_sp_attach(
4853 	mdsetname_t	*sp,
4854 	mdname_t	*np,
4855 	char		*addsize,
4856 	mdcmdopts_t	options,
4857 	sp_ext_length_t	alignment,
4858 	md_error_t	*ep
4859 )
4860 {
4861 	md_grow_params_t	grow_params;
4862 	sp_ext_length_t		grow_len;	/* amount to grow */
4863 	mp_unit_t		*mp, *new_un;
4864 	mdname_t		*compnp = NULL;
4865 
4866 	sp_ext_node_t		*extlist = NULL;
4867 	int			numexts;
4868 	mdnamelist_t		*spnlp = NULL;
4869 	int			count;
4870 	md_sp_t			*msp;
4871 	daddr_t			start_block;
4872 
4873 	/* should have the same set */
4874 	assert(sp != NULL);
4875 	assert(sp->setno == MD_MIN2SET(meta_getminor(np->dev)));
4876 
4877 	/* check name */
4878 	if (metachkmeta(np, ep) != 0)
4879 		return (-1);
4880 
4881 	if (meta_sp_parsesize(addsize, &grow_len) == -1) {
4882 		return (mdmderror(ep, MDE_SP_BAD_LENGTH, 0, np->cname));
4883 	}
4884 
4885 	if ((mp = (mp_unit_t *)meta_get_mdunit(sp, np, ep)) == NULL)
4886 		return (-1);
4887 
4888 	/* make sure we don't have a parent */
4889 	if (MD_HAS_PARENT(mp->c.un_parent)) {
4890 		Free(mp);
4891 		return (mdmderror(ep, MDE_INVAL_UNIT, 0, np->cname));
4892 	}
4893 
4894 	if (getenv(META_SP_DEBUG)) {
4895 		meta_sp_debug("meta_sp_attach: Unit structure before new "
4896 		    "space:\n");
4897 		meta_sp_printunit(mp);
4898 	}
4899 
4900 	/*
4901 	 * NOTE: the fast option to metakeyname is 0 as opposed to 1
4902 	 * If this was not the case we would suffer the following
4903 	 * assertion failure:
4904 	 * Assertion failed: type1 != MDT_FAST_META && type1 != MDT_FAST_COMP
4905 	 * file meta_check.x, line 315
4906 	 * I guess this is because we have not "seen" this drive before
4907 	 * and hence hit the failure - this is of course the attach routine
4908 	 */
4909 	if ((compnp = metakeyname(&sp, mp->un_key, 0, ep)) == NULL) {
4910 		Free(mp);
4911 		return (-1);
4912 	}
4913 
4914 	/* metakeyname does not fill in the key. */
4915 	compnp->key = mp->un_key;
4916 
4917 	/* work out the space on the component that we are dealing with */
4918 	count = meta_sp_get_by_component(sp, compnp, &spnlp, 0, ep);
4919 
4920 	/*
4921 	 * see if the component has been soft partitioned yet, or if an
4922 	 * error occurred.
4923 	 */
4924 	if (count == 0) {
4925 		Free(mp);
4926 		return (mdmderror(ep, MDE_NOT_SP, 0, np->cname));
4927 	} else if (count < 0) {
4928 		Free(mp);
4929 		return (-1);
4930 	}
4931 
4932 	/*
4933 	 * seed extlist with reserved space at the beginning of the volume and
4934 	 * enough space for the end watermark.  The end watermark always gets
4935 	 * updated, but if the underlying device changes size it may not be
4936 	 * pointed to until the extent before it is updated.  Since the
4937 	 * end of the reserved space is where the first watermark starts,
4938 	 * the reserved extent should never be marked for updating.
4939 	 */
4940 	if ((start_block = meta_sp_get_start(sp, compnp, ep)) ==
4941 	    MD_DISKADDR_ERROR) {
4942 		Free(mp);
4943 		return (-1);
4944 	}
4945 
4946 	meta_sp_list_insert(NULL, NULL, &extlist, 0ULL, start_block,
4947 	    EXTTYP_RESERVED, 0, 0, meta_sp_cmp_by_offset);
4948 	meta_sp_list_insert(NULL, NULL, &extlist,
4949 	    metagetsize(compnp, ep) - MD_SP_WMSIZE, MD_SP_WMSIZE,
4950 	    EXTTYP_END, 0, EXTFLG_UPDATE, meta_sp_cmp_by_offset);
4951 
4952 	if (meta_sp_extlist_from_namelist(sp, spnlp, &extlist, ep) == -1) {
4953 		Free(mp);
4954 		return (-1);
4955 	}
4956 
4957 	metafreenamelist(spnlp);
4958 
4959 	if (getenv(META_SP_DEBUG)) {
4960 		meta_sp_debug("meta_sp_attach: list of used extents:\n");
4961 		meta_sp_list_dump(extlist);
4962 	}
4963 
4964 	meta_sp_list_freefill(&extlist, metagetsize(compnp, ep));
4965 
4966 	assert(mp->un_numexts >= 1);
4967 	numexts = meta_sp_alloc_by_len(sp, np, &extlist, &grow_len,
4968 	    mp->un_ext[mp->un_numexts - 1].un_poff,
4969 	    (alignment > 0) ? alignment :
4970 	    meta_sp_get_default_alignment(sp, compnp, ep));
4971 
4972 	if (numexts == -1) {
4973 		Free(mp);
4974 		return (mdmderror(ep, MDE_SP_NOSPACE, 0, np->cname));
4975 	}
4976 
4977 	/* allocate new unit structure and copy in old unit */
4978 	if ((new_un = meta_sp_updateunit(np, mp, extlist,
4979 	    grow_len, numexts, ep)) == NULL) {
4980 		Free(mp);
4981 		return (-1);
4982 	}
4983 	Free(mp);
4984 
4985 	/* If running in dryrun mode (-n option), we're done here */
4986 	if ((options & MDCMD_DOIT) == 0) {
4987 		if (options & MDCMD_PRINT) {
4988 			(void) printf(dgettext(TEXT_DOMAIN,
4989 			    "%s: Soft Partition would grow\n"),
4990 			    np->cname);
4991 			(void) fflush(stdout);
4992 		}
4993 		return (0);
4994 	}
4995 
4996 	if (getenv(META_SP_DEBUG)) {
4997 		meta_sp_debug("meta_sp_attach: updated unit structure:\n");
4998 		meta_sp_printunit(new_un);
4999 	}
5000 
5001 	assert(new_un != NULL);
5002 
5003 	(void) memset(&grow_params, 0, sizeof (grow_params));
5004 	if (new_un->c.un_total_blocks > MD_MAX_BLKS_FOR_SMALL_DEVS) {
5005 		grow_params.options = MD_CRO_64BIT;
5006 		new_un->c.un_revision |= MD_64BIT_META_DEV;
5007 	} else {
5008 		grow_params.options = MD_CRO_32BIT;
5009 		new_un->c.un_revision &= ~MD_64BIT_META_DEV;
5010 	}
5011 	grow_params.mnum = MD_SID(new_un);
5012 	grow_params.size = new_un->c.un_size;
5013 	grow_params.mdp = (uintptr_t)new_un;
5014 	MD_SETDRIVERNAME(&grow_params, MD_SP, MD_MIN2SET(grow_params.mnum));
5015 
5016 	if (metaioctl(MD_IOCGROW, &grow_params, &grow_params.mde,
5017 	    np->cname) != 0) {
5018 		(void) mdstealerror(ep, &grow_params.mde);
5019 		return (-1);
5020 	}
5021 
5022 	/* update all watermarks */
5023 
5024 	if ((msp = meta_get_sp(sp, np, ep)) == NULL)
5025 		return (-1);
5026 	if (meta_sp_update_wm(sp, msp, extlist, ep) < 0)
5027 		return (-1);
5028 
5029 
5030 	/* second phase of commit, set status to MD_SP_OK */
5031 	if (meta_sp_setstatus(sp, &(MD_SID(new_un)), 1, MD_SP_OK, ep) < 0)
5032 		return (-1);
5033 
5034 	meta_invalidate_name(np);
5035 
5036 	if (options & MDCMD_PRINT) {
5037 		(void) printf(dgettext(TEXT_DOMAIN,
5038 		    "%s: Soft Partition has been grown\n"),
5039 		    np->cname);
5040 		(void) fflush(stdout);
5041 	}
5042 
5043 	return (0);
5044 }
5045 
5046 /*
5047  * **************************************************************************
5048  *                    Recovery (metarecover) Functions                      *
5049  * **************************************************************************
5050  */
5051 
5052 /*
5053  * FUNCTION:	meta_recover_sp()
5054  * INPUT:	sp	- the name of the set we are recovering on
5055  *		compnp	- name pointer for device we are recovering on
5056  *		argc	- argument count
5057  *		argv	- left over arguments not parsed by metarecover command
5058  *		options	- metarecover options
5059  * OUTPUT:	ep	- return error pointer
5060  * RETURNS:	int	- 0 - success, -1 - error
5061  * PURPOSE:	parse soft partitioning-specific metarecover options and
5062  *		dispatch to the appropriate function to handle recovery.
5063  */
5064 int
5065 meta_recover_sp(
5066 	mdsetname_t	*sp,
5067 	mdname_t	*compnp,
5068 	int		argc,
5069 	char		*argv[],
5070 	mdcmdopts_t	options,
5071 	md_error_t	*ep
5072 )
5073 {
5074 	md_set_desc	*sd;
5075 
5076 	if (argc > 1) {
5077 		(void) meta_cook_syntax(ep, MDE_SYNTAX, compnp->cname,
5078 		    argc, argv);
5079 		return (-1);
5080 	}
5081 
5082 	/*
5083 	 * For a MN set, this operation must be performed on the master
5084 	 * as it is responsible for maintaining the watermarks
5085 	 */
5086 	if (!metaislocalset(sp)) {
5087 		if ((sd = metaget_setdesc(sp, ep)) == NULL)
5088 			return (-1);
5089 		if (MD_MNSET_DESC(sd) && !sd->sd_mn_am_i_master) {
5090 			(void) mddserror(ep, MDE_DS_MASTER_ONLY, sp->setno,
5091 			    sd->sd_mn_master_nodenm, NULL, NULL);
5092 			return (-1);
5093 		}
5094 	}
5095 	if (argc == 0) {
5096 		/*
5097 		 * if no additional arguments are passed, metarecover should
5098 		 * validate both on-disk and metadb structures as well as
5099 		 * checking that both are consistent with each other
5100 		 */
5101 		if (meta_sp_validate_wm(sp, compnp, options, ep) < 0)
5102 			return (-1);
5103 		if (meta_sp_validate_unit(sp, compnp, options, ep) < 0)
5104 			return (-1);
5105 		if (meta_sp_validate_wm_and_unit(sp, compnp, options, ep) < 0)
5106 			return (-1);
5107 	} else if (strcmp(argv[0], "-d") == 0) {
5108 		/*
5109 		 * Ensure that there is no existing valid record for this
5110 		 * soft-partition. If there is we have nothing to do.
5111 		 */
5112 		if (meta_sp_validate_unit(sp, compnp, options, ep) == 0)
5113 			return (-1);
5114 		/* validate and recover from on-disk structures */
5115 		if (meta_sp_validate_wm(sp, compnp, options, ep) < 0)
5116 			return (-1);
5117 		if (meta_sp_recover_from_wm(sp, compnp, options, ep) < 0)
5118 			return (-1);
5119 	} else if (strcmp(argv[0], "-m") == 0) {
5120 		/* validate and recover from metadb structures */
5121 		if (meta_sp_validate_unit(sp, compnp, options, ep) < 0)
5122 			return (-1);
5123 		if (meta_sp_recover_from_unit(sp, compnp, options, ep) < 0)
5124 			return (-1);
5125 	} else {
5126 		/* syntax error */
5127 		(void) meta_cook_syntax(ep, MDE_SYNTAX, compnp->cname,
5128 		    argc, argv);
5129 		return (-1);
5130 	}
5131 
5132 	return (0);
5133 }
5134 
5135 /*
5136  * FUNCTION:	meta_sp_display_exthdr()
5137  * INPUT:	none
5138  * OUTPUT:	none
5139  * RETURNS:	void
5140  * PURPOSE:	print header line for sp_ext_node_t information.  to be used
5141  *		in conjunction with meta_sp_display_ext().
5142  */
5143 static void
5144 meta_sp_display_exthdr(void)
5145 {
5146 	(void) printf("%20s %5s %7s %20s %20s\n",
5147 	    dgettext(TEXT_DOMAIN, "Name"),
5148 	    dgettext(TEXT_DOMAIN, "Seq#"),
5149 	    dgettext(TEXT_DOMAIN, "Type"),
5150 	    dgettext(TEXT_DOMAIN, "Offset"),
5151 	    dgettext(TEXT_DOMAIN, "Length"));
5152 }
5153 
5154 
5155 /*
5156  * FUNCTION:	meta_sp_display_ext()
5157  * INPUT:	ext	- extent to display
5158  * OUTPUT:	none
5159  * RETURNS:	void
5160  * PURPOSE:	print selected fields from sp_ext_node_t.
5161  */
5162 static void
5163 meta_sp_display_ext(sp_ext_node_t *ext)
5164 {
5165 	/* print extent information */
5166 	if (ext->ext_namep != NULL)
5167 		(void) printf("%20s ", ext->ext_namep->cname);
5168 	else
5169 		(void) printf("%20s ", "NONE");
5170 
5171 	(void) printf("%5u ", ext->ext_seq);
5172 
5173 	switch (ext->ext_type) {
5174 	case EXTTYP_ALLOC:
5175 		(void) printf("%7s ", "ALLOC");
5176 		break;
5177 	case EXTTYP_FREE:
5178 		(void) printf("%7s ", "FREE");
5179 		break;
5180 	case EXTTYP_RESERVED:
5181 		(void) printf("%7s ", "RESV");
5182 		break;
5183 	case EXTTYP_END:
5184 		(void) printf("%7s ", "END");
5185 		break;
5186 	default:
5187 		(void) printf("%7s ", "INVLD");
5188 		break;
5189 	}
5190 
5191 	(void) printf("%20llu %20llu\n", ext->ext_offset, ext->ext_length);
5192 }
5193 
5194 
5195 /*
5196  * FUNCTION:	meta_sp_checkseq()
5197  * INPUT:	extlist	- list of extents to be checked
5198  * OUTPUT:	none
5199  * RETURNS:	int	- 0 - success, -1 - error
5200  * PURPOSE:	check soft partition sequence numbers.  this function assumes
5201  *		that a list of extents representing 1 or more soft partitions
5202  *		is passed in sorted in sequence number order.  within a
5203  *		single soft partition, there may not be any missing or
5204  *		duplicate sequence numbers.
5205  */
5206 static int
5207 meta_sp_checkseq(sp_ext_node_t *extlist)
5208 {
5209 	sp_ext_node_t *ext;
5210 
5211 	assert(extlist != NULL);
5212 
5213 	for (ext = extlist;
5214 	    ext->ext_next != NULL && ext->ext_next->ext_type == EXTTYP_ALLOC;
5215 	    ext = ext->ext_next) {
5216 		if (ext->ext_next->ext_namep != NULL &&
5217 		    strcmp(ext->ext_next->ext_namep->cname,
5218 		    ext->ext_namep->cname) != 0)
5219 				continue;
5220 
5221 		if (ext->ext_next->ext_seq != ext->ext_seq + 1) {
5222 			(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5223 			    "%s: sequence numbers are "
5224 			    "incorrect: %d should be %d\n"),
5225 			    ext->ext_next->ext_namep->cname,
5226 			    ext->ext_next->ext_seq, ext->ext_seq + 1);
5227 			return (-1);
5228 		}
5229 	}
5230 	return (0);
5231 }
5232 
5233 
5234 /*
5235  * FUNCTION:	meta_sp_resolve_name_conflict()
5236  * INPUT:	sp	- name of set we're are recovering in.
5237  *		old_np	- name pointer of soft partition we found on disk.
5238  * OUTPUT:	new_np	- name pointer for new soft partition name.
5239  *		ep	- error pointer returned.
5240  * RETURNS:	int	- 0 - name not replace, 1 - name replaced, -1 - error
5241  * PURPOSE:	Check to see if the name of one of the soft partitions we found
5242  *		on disk already exists in the metadb.  If so, prompt for a new
5243  *		name.  In addition, we keep a static array of names that
5244  *		will be recovered from this device since these names don't
5245  *		exist in the configuration at this point but cannot be
5246  *		recovered more than once.
5247  */
5248 static int
5249 meta_sp_resolve_name_conflict(
5250 	mdsetname_t	*sp,
5251 	mdname_t	*old_np,
5252 	mdname_t	**new_np,
5253 	md_error_t	*ep
5254 )
5255 {
5256 	char		yesno[255];
5257 	char		*yes;
5258 	char		newname[MD_SP_MAX_DEVNAME_PLUS_1];
5259 	int		nunits;
5260 	static int	*used_names = NULL;
5261 
5262 	assert(old_np != NULL);
5263 
5264 	if (used_names == NULL) {
5265 		if ((nunits = meta_get_nunits(ep)) < 0)
5266 			return (-1);
5267 		used_names = Zalloc(nunits * sizeof (int));
5268 	}
5269 
5270 	/* see if it exists already */
5271 	if (used_names[MD_MIN2UNIT(meta_getminor(old_np->dev))] == 0 &&
5272 	    metagetmiscname(old_np, ep) == NULL) {
5273 		if (! mdismderror(ep, MDE_UNIT_NOT_SETUP))
5274 			return (-1);
5275 		else {
5276 			used_names[MD_MIN2UNIT(meta_getminor(old_np->dev))] = 1;
5277 			mdclrerror(ep);
5278 			return (0);
5279 		}
5280 	}
5281 
5282 	/* name exists, ask the user for a new one */
5283 	(void) printf(dgettext(TEXT_DOMAIN,
5284 	    "WARNING: A soft partition named %s was found in the extent\n"
5285 	    "headers, but this name already exists in the metadb "
5286 	    "configuration.\n"
5287 	    "In order to continue recovery you must supply\n"
5288 	    "a new name for this soft partition.\n"), old_np->cname);
5289 	(void) printf(dgettext(TEXT_DOMAIN,
5290 	    "Would you like to continue and supply a new name? (yes/no) "));
5291 
5292 	(void) fflush(stdout);
5293 	if ((fgets(yesno, sizeof (yesno), stdin) == NULL) ||
5294 	    (strlen(yesno) == 1))
5295 		(void) snprintf(yesno, sizeof (yesno), "%s\n",
5296 		    dgettext(TEXT_DOMAIN, "no"));
5297 	yes = dgettext(TEXT_DOMAIN, "yes");
5298 	if (strncasecmp(yesno, yes, strlen(yesno) - 1) != 0) {
5299 		return (-1);
5300 	}
5301 
5302 	(void) fflush(stdin);
5303 
5304 	/* get the new name */
5305 	for (;;) {
5306 		(void) printf(dgettext(TEXT_DOMAIN, "Please enter a new name "
5307 		    "for this soft partition (dXXXX) "));
5308 		(void) fflush(stdout);
5309 		if (fgets(newname, MD_SP_MAX_DEVNAME_PLUS_1, stdin) == NULL)
5310 			(void) strcpy(newname, "");
5311 
5312 		/* remove newline character */
5313 		if (newname[strlen(newname) - 1] == '\n')
5314 			newname[strlen(newname) - 1] = '\0';
5315 
5316 		if (!(is_metaname(newname)) ||
5317 		    (meta_init_make_device(&sp, newname, ep) <= 0)) {
5318 			(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5319 			    "Invalid metadevice name\n"));
5320 			(void) fflush(stderr);
5321 			continue;
5322 		}
5323 
5324 		if ((*new_np = metaname(&sp, newname,
5325 		    META_DEVICE, ep)) == NULL) {
5326 			(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5327 			    "Invalid metadevice name\n"));
5328 			(void) fflush(stderr);
5329 			continue;
5330 		}
5331 
5332 		assert(MD_MIN2UNIT(meta_getminor((*new_np)->dev)) < nunits);
5333 		/* make sure the name isn't already being used */
5334 		if (used_names[MD_MIN2UNIT(meta_getminor((*new_np)->dev))] ||
5335 		    metagetmiscname(*new_np, ep) != NULL) {
5336 			(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5337 			    "That name already exists\n"));
5338 			continue;
5339 		} else if (! mdismderror(ep, MDE_UNIT_NOT_SETUP))
5340 			return (-1);
5341 
5342 		break;
5343 	}
5344 
5345 	/* got a new name, place in used array and return */
5346 	used_names[MD_MIN2UNIT(meta_getminor((*new_np)->dev))] = 1;
5347 	mdclrerror(ep);
5348 	return (1);
5349 }
5350 
5351 /*
5352  * FUNCTION:	meta_sp_validate_wm()
5353  * INPUT:	sp	- set name we are recovering in
5354  *		compnp	- name pointer for device we are recovering from
5355  *		options	- metarecover options
5356  * OUTPUT:	ep	- error pointer returned
5357  * RETURNS:	int	- 0 - success, -1 - error
5358  * PURPOSE:	validate and display watermark configuration.  walk the
5359  *		on-disk watermark structures and validate the information
5360  *		found within.  since a watermark configuration is
5361  *		"self-defining", the act of traversing the watermarks
5362  *		is part of the validation process.
5363  */
5364 static int
5365 meta_sp_validate_wm(
5366 	mdsetname_t	*sp,
5367 	mdname_t	*compnp,
5368 	mdcmdopts_t	options,
5369 	md_error_t	*ep
5370 )
5371 {
5372 	sp_ext_node_t	*extlist = NULL;
5373 	sp_ext_node_t	*ext;
5374 	int		num_sps = 0;
5375 	int		rval;
5376 
5377 	if ((options & MDCMD_VERBOSE) != 0)
5378 		(void) printf(dgettext(TEXT_DOMAIN,
5379 		    "Verifying on-disk structures on %s.\n"),
5380 		    compnp->cname);
5381 
5382 	/*
5383 	 * for each watermark, build an ext_node, place on list.
5384 	 */
5385 	rval = meta_sp_extlist_from_wm(sp, compnp, &extlist,
5386 	    meta_sp_cmp_by_nameseq, ep);
5387 
5388 	if ((options & MDCMD_VERBOSE) != 0) {
5389 		/* print out what we found */
5390 		if (extlist == NULL)
5391 			(void) printf(dgettext(TEXT_DOMAIN,
5392 			    "No extent headers found on %s.\n"),
5393 			    compnp->cname);
5394 		else {
5395 			(void) printf(dgettext(TEXT_DOMAIN,
5396 			    "The following extent headers were found on %s.\n"),
5397 			    compnp->cname);
5398 			meta_sp_display_exthdr();
5399 		}
5400 		for (ext = extlist; ext != NULL; ext = ext->ext_next)
5401 			meta_sp_display_ext(ext);
5402 	}
5403 
5404 	if (rval < 0) {
5405 		(void) printf(dgettext(TEXT_DOMAIN,
5406 		    "%s: On-disk structures invalid or "
5407 		    "no soft partitions found.\n"),
5408 		    compnp->cname);
5409 		return (-1);
5410 	}
5411 
5412 	assert(extlist != NULL);
5413 
5414 	/* count number of soft partitions */
5415 	for (ext = extlist;
5416 	    ext != NULL && ext->ext_type == EXTTYP_ALLOC;
5417 	    ext = ext->ext_next) {
5418 		if (ext->ext_next != NULL &&
5419 		    ext->ext_next->ext_namep != NULL &&
5420 		    strcmp(ext->ext_next->ext_namep->cname,
5421 		    ext->ext_namep->cname) == 0)
5422 				continue;
5423 		num_sps++;
5424 	}
5425 
5426 	if ((options & MDCMD_VERBOSE) != 0)
5427 		(void) printf(dgettext(TEXT_DOMAIN,
5428 		    "Found %d soft partition(s) on %s.\n"), num_sps,
5429 		    compnp->cname);
5430 
5431 	if (num_sps == 0) {
5432 		(void) printf(dgettext(TEXT_DOMAIN,
5433 		    "%s: No soft partitions.\n"), compnp->cname);
5434 		return (mdmderror(ep, MDE_RECOVER_FAILED, 0, compnp->cname));
5435 	}
5436 
5437 	/* check sequence numbers */
5438 	if ((options & MDCMD_VERBOSE) != 0)
5439 		(void) printf(dgettext(TEXT_DOMAIN,
5440 		    "Checking sequence numbers.\n"));
5441 
5442 	if (meta_sp_checkseq(extlist) != 0)
5443 		return (mdmderror(ep, MDE_RECOVER_FAILED, 0, compnp->cname));
5444 
5445 	return (0);
5446 }
5447 
5448 /*
5449  * FUNCTION:	meta_sp_validate_unit()
5450  * INPUT:	sp	- name of set we are recovering in
5451  *		compnp	- name of component we are recovering from
5452  *		options	- metarecover options
5453  * OUTPUT:	ep	- error pointer returned
5454  * RETURNS:	int	- 0 - success, -1 - error
5455  * PURPOSE:	validate and display metadb configuration.  begin by getting
5456  *		all soft partitions built on the specified component.  get
5457  *		the unit structure for each one and validate the fields within.
5458  */
5459 static int
5460 meta_sp_validate_unit(
5461 	mdsetname_t	*sp,
5462 	mdname_t	*compnp,
5463 	mdcmdopts_t	options,
5464 	md_error_t	*ep
5465 )
5466 {
5467 	md_sp_t		*msp;
5468 	mdnamelist_t	*spnlp = NULL;
5469 	mdnamelist_t	*namep = NULL;
5470 	int		count;
5471 	uint_t		extn;
5472 	sp_ext_length_t	size;
5473 
5474 	if ((options & MDCMD_VERBOSE) != 0)
5475 		(void) printf(dgettext(TEXT_DOMAIN,
5476 		    "%s: Validating soft partition metadb entries.\n"),
5477 		    compnp->cname);
5478 
5479 	if ((size = metagetsize(compnp, ep)) == MD_DISKADDR_ERROR)
5480 		return (-1);
5481 
5482 	/* get all soft partitions on component */
5483 	count = meta_sp_get_by_component(sp, compnp, &spnlp, 0, ep);
5484 
5485 	if (count == 0) {
5486 		(void) printf(dgettext(TEXT_DOMAIN,
5487 		    "%s: No soft partitions.\n"), compnp->cname);
5488 		return (mdmderror(ep, MDE_RECOVER_FAILED, 0, compnp->cname));
5489 	} else if (count < 0) {
5490 		return (-1);
5491 	}
5492 
5493 	/* Now go through the soft partitions and check each one */
5494 	for (namep = spnlp; namep != NULL; namep = namep->next) {
5495 		mdname_t	*curnp = namep->namep;
5496 		sp_ext_offset_t	curvoff;
5497 
5498 		/* get the unit structure */
5499 		if ((msp = meta_get_sp_common(sp, curnp, 0, ep)) == NULL)
5500 			return (-1);
5501 
5502 		/* verify generic unit structure parameters */
5503 		if ((options & MDCMD_VERBOSE) != 0)
5504 			(void) printf(dgettext(TEXT_DOMAIN,
5505 			    "\nVerifying device %s.\n"),
5506 			    curnp->cname);
5507 
5508 		/*
5509 		 * MD_SP_LAST is an invalid state and is always the
5510 		 * highest numbered.
5511 		 */
5512 		if (msp->status >= MD_SP_LAST) {
5513 			(void) printf(dgettext(TEXT_DOMAIN,
5514 			    "%s: status value %u is out of range.\n"),
5515 			    curnp->cname, msp->status);
5516 			return (mdmderror(ep, MDE_RECOVER_FAILED,
5517 			    0, curnp->cname));
5518 		} else if ((options & MDCMD_VERBOSE) != 0) {
5519 			uint_t	tstate = 0;
5520 
5521 			if (metaismeta(msp->compnamep)) {
5522 				if (meta_get_tstate(msp->common.namep->dev,
5523 				    &tstate, ep) != 0)
5524 					return (-1);
5525 			}
5526 			(void) printf(dgettext(TEXT_DOMAIN,
5527 			    "%s: Status \"%s\" is valid.\n"),
5528 			    curnp->cname, meta_sp_status_to_name(msp->status,
5529 			    tstate & MD_DEV_ERRORED));
5530 		}
5531 
5532 		/* Now verify each extent */
5533 		if ((options & MDCMD_VERBOSE) != 0)
5534 			(void) printf("%14s %21s %21s %21s\n",
5535 			    dgettext(TEXT_DOMAIN, "Extent Number"),
5536 			    dgettext(TEXT_DOMAIN, "Virtual Offset"),
5537 			    dgettext(TEXT_DOMAIN, "Physical Offset"),
5538 			    dgettext(TEXT_DOMAIN, "Length"));
5539 
5540 		curvoff = 0ULL;
5541 		for (extn = 0; extn < msp->ext.ext_len; extn++) {
5542 			md_sp_ext_t	*extp = &msp->ext.ext_val[extn];
5543 
5544 			if ((options & MDCMD_VERBOSE) != 0)
5545 				(void) printf("%14u %21llu %21llu %21llu\n",
5546 				    extn, extp->voff, extp->poff, extp->len);
5547 
5548 			if (extp->voff != curvoff) {
5549 				(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5550 				    "%s: virtual offset for extent %u "
5551 				    "is inconsistent, expected %llu, "
5552 				    "got %llu.\n"), curnp->cname, extn,
5553 				    curvoff, extp->voff);
5554 				return (mdmderror(ep, MDE_RECOVER_FAILED,
5555 				    0, compnp->cname));
5556 			}
5557 
5558 			/* make sure extent does not drop off the end */
5559 			if ((extp->poff + extp->len) == size) {
5560 				(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5561 				    "%s: extent %u at offset %llu, "
5562 				    "length %llu exceeds the size of the "
5563 				    "device, %llu.\n"), curnp->cname,
5564 				    extn, extp->poff, extp->len, size);
5565 				return (mdmderror(ep, MDE_RECOVER_FAILED,
5566 				    0, compnp->cname));
5567 			}
5568 
5569 			curvoff += extp->len;
5570 		}
5571 	}
5572 	if (options & MDCMD_PRINT) {
5573 		(void) printf(dgettext(TEXT_DOMAIN,
5574 		    "%s: Soft Partition metadb configuration is valid\n"),
5575 		    compnp->cname);
5576 	}
5577 	return (0);
5578 }
5579 
5580 /*
5581  * FUNCTION:	meta_sp_validate_wm_and_unit()
5582  * INPUT:	sp	- name of set we are recovering in
5583  *		compnp	- name of device we are recovering from
5584  *		options	- metarecover options
5585  * OUTPUT:	ep	- error pointer returned
5586  * RETURNS:	int	- 0 - success, -1 error
5587  * PURPOSE:	cross-validate and display watermarks and metadb records.
5588  *		get both the unit structures for the soft partitions built
5589  *		on the specified component and the watermarks found on that
5590  *		component and check to make sure they are consistent with
5591  *		each other.
5592  */
5593 static int
5594 meta_sp_validate_wm_and_unit(
5595 	mdsetname_t	*sp,
5596 	mdname_t	*np,
5597 	mdcmdopts_t	options,
5598 	md_error_t	*ep
5599 )
5600 {
5601 	sp_ext_node_t	*wmlist = NULL;
5602 	sp_ext_node_t	*unitlist = NULL;
5603 	sp_ext_node_t	*unitext;
5604 	sp_ext_node_t	*wmext;
5605 	sp_ext_offset_t	tmpunitoff;
5606 	mdnamelist_t	*spnlp = NULL;
5607 	int		count;
5608 	int		rval = 0;
5609 	int		verbose = (options & MDCMD_VERBOSE);
5610 
5611 	/* get unit structure list */
5612 	count = meta_sp_get_by_component(sp, np, &spnlp, 0, ep);
5613 	if (count <= 0)
5614 		return (-1);
5615 
5616 	meta_sp_list_insert(NULL, NULL, &unitlist,
5617 	    metagetsize(np, ep) - MD_SP_WMSIZE, MD_SP_WMSIZE,
5618 	    EXTTYP_END, 0, EXTFLG_UPDATE, meta_sp_cmp_by_offset);
5619 
5620 	if (meta_sp_extlist_from_namelist(sp, spnlp, &unitlist, ep) == -1) {
5621 		metafreenamelist(spnlp);
5622 		return (-1);
5623 	}
5624 
5625 	metafreenamelist(spnlp);
5626 
5627 	meta_sp_list_freefill(&unitlist, metagetsize(np, ep));
5628 
5629 	if (meta_sp_extlist_from_wm(sp, np, &wmlist,
5630 	    meta_sp_cmp_by_offset, ep) < 0) {
5631 		meta_sp_list_free(&unitlist);
5632 		return (-1);
5633 	}
5634 
5635 	if (getenv(META_SP_DEBUG)) {
5636 		meta_sp_debug("meta_sp_validate_wm_and_unit: unit list:\n");
5637 		meta_sp_list_dump(unitlist);
5638 		meta_sp_debug("meta_sp_validate_wm_and_unit: wm list:\n");
5639 		meta_sp_list_dump(wmlist);
5640 	}
5641 
5642 	/*
5643 	 * step through both lists and compare allocated nodes.  Free
5644 	 * nodes and end watermarks may differ between the two but
5645 	 * that's generally ok, and if they're wrong will typically
5646 	 * cause misplaced allocated extents.
5647 	 */
5648 	if (verbose)
5649 		(void) printf(dgettext(TEXT_DOMAIN, "\n%s: Verifying metadb "
5650 		    "allocations match extent headers.\n"), np->cname);
5651 
5652 	unitext = unitlist;
5653 	wmext = wmlist;
5654 	while ((wmext != NULL) && (unitext != NULL)) {
5655 		/* find next allocated extents in each list */
5656 		while (wmext != NULL && wmext->ext_type != EXTTYP_ALLOC)
5657 			wmext = wmext->ext_next;
5658 
5659 		while (unitext != NULL && unitext->ext_type != EXTTYP_ALLOC)
5660 			unitext = unitext->ext_next;
5661 
5662 		if (wmext == NULL || unitext == NULL)
5663 			break;
5664 
5665 		if (verbose) {
5666 			(void) printf(dgettext(TEXT_DOMAIN,
5667 			    "Metadb extent:\n"));
5668 			meta_sp_display_exthdr();
5669 			meta_sp_display_ext(unitext);
5670 			(void) printf(dgettext(TEXT_DOMAIN,
5671 			    "Extent header extent:\n"));
5672 			meta_sp_display_exthdr();
5673 			meta_sp_display_ext(wmext);
5674 			(void) printf("\n");
5675 		}
5676 
5677 		if (meta_sp_validate_exts(np, wmext, unitext, ep) < 0)
5678 			rval = -1;
5679 
5680 		/*
5681 		 * if the offsets aren't equal, only increment the
5682 		 * lowest one in hopes of getting the lists back in sync.
5683 		 */
5684 		tmpunitoff = unitext->ext_offset;
5685 		if (unitext->ext_offset <= wmext->ext_offset)
5686 			unitext = unitext->ext_next;
5687 		if (wmext->ext_offset <= tmpunitoff)
5688 			wmext = wmext->ext_next;
5689 	}
5690 
5691 	/*
5692 	 * if both lists aren't at the end then there are extra
5693 	 * allocated nodes in one of them.
5694 	 */
5695 	if (wmext != NULL) {
5696 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5697 		    "%s: extent headers contain allocations not in "
5698 		    "the metadb\n\n"), np->cname);
5699 		rval = -1;
5700 	}
5701 
5702 	if (unitext != NULL) {
5703 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5704 		    "%s: metadb contains allocations not in the extent "
5705 		    "headers\n\n"), np->cname);
5706 		rval = -1;
5707 	}
5708 
5709 	if (options & MDCMD_PRINT) {
5710 		if (rval == 0) {
5711 			(void) printf(dgettext(TEXT_DOMAIN,
5712 			    "%s: Soft Partition metadb matches extent "
5713 			    "header configuration\n"), np->cname);
5714 		} else {
5715 			(void) printf(dgettext(TEXT_DOMAIN,
5716 			    "%s: Soft Partition metadb does not match extent "
5717 			    "header configuration\n"), np->cname);
5718 		}
5719 	}
5720 
5721 	return (rval);
5722 }
5723 
5724 /*
5725  * FUNCTION:	meta_sp_validate_exts()
5726  * INPUT:	compnp	- name pointer for device we are recovering from
5727  *		wmext	- extent node representing watermark
5728  *		unitext	- extent node from unit structure
5729  * OUTPUT:	ep	- return error pointer
5730  * RETURNS:	int	- 0 - succes, mdmderror return code - error
5731  * PURPOSE:	Takes two extent nodes and checks them against each other.
5732  *		offset, length, sequence number, set, and name are compared.
5733  */
5734 static int
5735 meta_sp_validate_exts(
5736 	mdname_t	*compnp,
5737 	sp_ext_node_t	*wmext,
5738 	sp_ext_node_t	*unitext,
5739 	md_error_t	*ep
5740 )
5741 {
5742 	if (wmext->ext_offset != unitext->ext_offset) {
5743 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5744 		    "%s: unit structure and extent header offsets differ.\n"),
5745 		    compnp->cname);
5746 		return (mdmderror(ep, MDE_RECOVER_FAILED, 0, compnp->cname));
5747 	}
5748 
5749 	if (wmext->ext_length != unitext->ext_length) {
5750 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5751 		    "%s: unit structure and extent header lengths differ.\n"),
5752 		    compnp->cname);
5753 		return (mdmderror(ep, MDE_RECOVER_FAILED, 0, compnp->cname));
5754 	}
5755 
5756 	if (wmext->ext_seq != unitext->ext_seq) {
5757 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5758 		    "%s: unit structure and extent header sequence numbers "
5759 		    "differ.\n"), compnp->cname);
5760 		return (mdmderror(ep, MDE_RECOVER_FAILED, 0, compnp->cname));
5761 	}
5762 
5763 	if (wmext->ext_type != unitext->ext_type) {
5764 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5765 		    "%s: unit structure and extent header types differ.\n"),
5766 		    compnp->cname);
5767 		return (mdmderror(ep, MDE_RECOVER_FAILED, 0, compnp->cname));
5768 	}
5769 
5770 	/*
5771 	 * If one has a set pointer and the other doesn't, error.
5772 	 * If both extents have setnames, then make sure they match
5773 	 * If both are NULL, it's ok, they match.
5774 	 */
5775 	if ((unitext->ext_setp == NULL) ^ (wmext->ext_setp == NULL)) {
5776 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5777 		    "%s: unit structure and extent header set values "
5778 		    "differ.\n"), compnp->cname);
5779 		return (mdmderror(ep, MDE_RECOVER_FAILED, 0, compnp->cname));
5780 	}
5781 
5782 	if (unitext->ext_setp != NULL) {
5783 		if (strcmp(unitext->ext_setp->setname,
5784 		    wmext->ext_setp->setname) != 0) {
5785 			(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5786 			    "%s: unit structure and extent header set names "
5787 			    "differ.\n"), compnp->cname);
5788 			return (mdmderror(ep, MDE_RECOVER_FAILED,
5789 			    0, compnp->cname));
5790 		}
5791 	}
5792 
5793 	/*
5794 	 * If one has a name pointer and the other doesn't, error.
5795 	 * If both extents have names, then make sure they match
5796 	 * If both are NULL, it's ok, they match.
5797 	 */
5798 	if ((unitext->ext_namep == NULL) ^ (wmext->ext_namep == NULL)) {
5799 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5800 		    "%s: unit structure and extent header name values "
5801 		    "differ.\n"), compnp->cname);
5802 		return (mdmderror(ep, MDE_RECOVER_FAILED, 0, compnp->cname));
5803 	}
5804 
5805 	if (unitext->ext_namep != NULL) {
5806 		if (strcmp(wmext->ext_namep->cname,
5807 		    unitext->ext_namep->cname) != 0) {
5808 			(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5809 			    "%s: unit structure and extent header names "
5810 			    "differ.\n"), compnp->cname);
5811 			return (mdmderror(ep, MDE_RECOVER_FAILED,
5812 			    0, compnp->cname));
5813 		}
5814 	}
5815 
5816 	return (0);
5817 }
5818 
5819 /*
5820  * FUNCTION:	update_sp_status()
5821  * INPUT:	sp	- name of set we are recovering in
5822  *		minors	- pointer to an array of soft partition minor numbers
5823  *		num_sps	- number of minor numbers in array
5824  *		status	- new status to be applied to all soft parts in array
5825  *		mn_set	- set if current set is a multi-node set
5826  * OUTPUT:	ep	- return error pointer
5827  * RETURNS:	int	- 0 - success, -1 - error
5828  * PURPOSE:	update  status of soft partitions to new status. minors is an
5829  *		array of minor numbers to apply the new status to.
5830  *		If mn_set is set, a message is sent to all nodes in the
5831  *		cluster to update the status locally.
5832  */
5833 static int
5834 update_sp_status(
5835 	mdsetname_t	*sp,
5836 	minor_t		*minors,
5837 	int		num_sps,
5838 	sp_status_t	status,
5839 	bool_t		mn_set,
5840 	md_error_t	*ep
5841 )
5842 {
5843 	int	i;
5844 	int	err = 0;
5845 
5846 	if (mn_set) {
5847 		md_mn_msg_sp_setstat_t	sp_setstat_params;
5848 		int			result;
5849 		md_mn_result_t		*resp = NULL;
5850 
5851 		for (i = 0; i < num_sps; i++) {
5852 			sp_setstat_params.sp_setstat_mnum = minors[i];
5853 			sp_setstat_params.sp_setstat_status = status;
5854 
5855 			result = mdmn_send_message(sp->setno,
5856 			    MD_MN_MSG_SP_SETSTAT, MD_MSGF_DEFAULT_FLAGS, 0,
5857 			    (char *)&sp_setstat_params,
5858 			    sizeof (sp_setstat_params),
5859 			    &resp, ep);
5860 			if (resp != NULL) {
5861 				if (resp->mmr_exitval != 0)
5862 					err = -1;
5863 				free_result(resp);
5864 			}
5865 			if (result != 0) {
5866 				err = -1;
5867 			}
5868 		}
5869 	} else {
5870 		if (meta_sp_setstatus(sp, minors, num_sps, status, ep) < 0)
5871 			err = -1;
5872 	}
5873 	if (err < 0) {
5874 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5875 		    "Error updating status on recovered soft "
5876 		    "partitions.\n"));
5877 	}
5878 	return (err);
5879 }
5880 
5881 /*
5882  * FUNCTION:	meta_sp_recover_from_wm()
5883  * INPUT:	sp	- name of set we are recovering in
5884  *		compnp	- name pointer for component we are recovering from
5885  *		options	- metarecover options
5886  * OUTPUT:	ep	- return error pointer
5887  * RETURNS:	int	- 0 - success, -1 - error
5888  * PURPOSE:	update metadb records to match watermarks.  begin by getting
5889  *		an extlist representing all soft partitions on the component.
5890  *		then build a unit structure for each soft partition.
5891  *		notify user of changes, then commit each soft partition to
5892  *		the metadb one at a time in the "recovering" state.  update
5893  *		any watermarks that may need it	(to reflect possible name
5894  *		changes), and, finally, set the status of all recovered
5895  *		partitions to the "OK" state at once.
5896  */
5897 static int
5898 meta_sp_recover_from_wm(
5899 	mdsetname_t	*sp,
5900 	mdname_t	*compnp,
5901 	mdcmdopts_t	options,
5902 	md_error_t	*ep
5903 )
5904 {
5905 	sp_ext_node_t		*extlist = NULL;
5906 	sp_ext_node_t		*sp_list = NULL;
5907 	sp_ext_node_t		*update_list = NULL;
5908 	sp_ext_node_t		*ext;
5909 	sp_ext_node_t		*sp_ext;
5910 	mp_unit_t		*mp;
5911 	mp_unit_t		**un_array;
5912 	int			numexts = 0, num_sps = 0, i = 0;
5913 	int			err = 0;
5914 	int			not_recovered = 0;
5915 	int			committed = 0;
5916 	sp_ext_length_t		sp_length = 0LL;
5917 	mdnamelist_t		*keynlp = NULL;
5918 	mdname_t		*np;
5919 	mdname_t		*new_np;
5920 	int			new_name;
5921 	md_set_params_t		set_params;
5922 	minor_t			*minors = NULL;
5923 	char			yesno[255];
5924 	char			*yes;
5925 	bool_t			mn_set = 0;
5926 	md_set_desc		*sd;
5927 	mm_unit_t		*mm;
5928 	md_set_mmown_params_t	*ownpar = NULL;
5929 	int			comp_is_mirror = 0;
5930 
5931 	/*
5932 	 * if this component appears in another metadevice already, do
5933 	 * NOT recover from it.
5934 	 */
5935 	if (meta_check_inmeta(sp, compnp, options, 0, -1, ep) != 0)
5936 		return (-1);
5937 
5938 	/* set flag if dealing with a MN set */
5939 	if (!metaislocalset(sp)) {
5940 		if ((sd = metaget_setdesc(sp, ep)) == NULL) {
5941 			return (-1);
5942 		}
5943 		if (MD_MNSET_DESC(sd))
5944 			mn_set = 1;
5945 	}
5946 	/*
5947 	 * for each watermark, build an ext_node, place on list.
5948 	 */
5949 	if (meta_sp_extlist_from_wm(sp, compnp, &extlist,
5950 	    meta_sp_cmp_by_nameseq, ep) < 0)
5951 		return (mdmderror(ep, MDE_RECOVER_FAILED, 0, compnp->cname));
5952 
5953 	assert(extlist != NULL);
5954 
5955 	/* count number of soft partitions */
5956 	for (ext = extlist;
5957 	    ext != NULL && ext->ext_type == EXTTYP_ALLOC;
5958 	    ext = ext->ext_next) {
5959 		if (ext->ext_next != NULL &&
5960 		    ext->ext_next->ext_namep != NULL &&
5961 		    strcmp(ext->ext_next->ext_namep->cname,
5962 		    ext->ext_namep->cname) == 0)
5963 				continue;
5964 		num_sps++;
5965 	}
5966 
5967 	/* allocate array of unit structure pointers */
5968 	un_array = Zalloc(num_sps * sizeof (mp_unit_t *));
5969 
5970 	/*
5971 	 * build unit structures from list of ext_nodes.
5972 	 */
5973 	for (ext = extlist;
5974 	    ext != NULL && ext->ext_type == EXTTYP_ALLOC;
5975 	    ext = ext->ext_next) {
5976 		meta_sp_list_insert(ext->ext_setp, ext->ext_namep,
5977 		    &sp_list, ext->ext_offset, ext->ext_length,
5978 		    ext->ext_type, ext->ext_seq, ext->ext_flags,
5979 		    meta_sp_cmp_by_nameseq);
5980 
5981 		numexts++;
5982 		sp_length += ext->ext_length - MD_SP_WMSIZE;
5983 
5984 		if (ext->ext_next != NULL &&
5985 		    ext->ext_next->ext_namep != NULL &&
5986 		    strcmp(ext->ext_next->ext_namep->cname,
5987 		    ext->ext_namep->cname) == 0)
5988 				continue;
5989 
5990 		/*
5991 		 * if we made it here, we are at a soft partition
5992 		 * boundary in the list.
5993 		 */
5994 		if (getenv(META_SP_DEBUG)) {
5995 			meta_sp_debug("meta_recover_from_wm: dumping wm "
5996 			    "list:\n");
5997 			meta_sp_list_dump(sp_list);
5998 		}
5999 
6000 		assert(sp_list != NULL);
6001 		assert(sp_list->ext_namep != NULL);
6002 
6003 		if ((new_name = meta_sp_resolve_name_conflict(sp,
6004 		    sp_list->ext_namep, &new_np, ep)) < 0) {
6005 			err = 1;
6006 			goto out;
6007 		} else if (new_name) {
6008 			for (sp_ext = sp_list;
6009 			    sp_ext != NULL;
6010 			    sp_ext = sp_ext->ext_next) {
6011 				/*
6012 				 * insert into the update list for
6013 				 * watermark update.
6014 				 */
6015 				meta_sp_list_insert(sp_ext->ext_setp,
6016 				    new_np, &update_list, sp_ext->ext_offset,
6017 				    sp_ext->ext_length, sp_ext->ext_type,
6018 				    sp_ext->ext_seq, EXTFLG_UPDATE,
6019 				    meta_sp_cmp_by_offset);
6020 			}
6021 
6022 		}
6023 		if (options & MDCMD_DOIT) {
6024 			/* store name in namespace */
6025 			if (mn_set) {
6026 				/* send message to all nodes to return key */
6027 				md_mn_msg_addkeyname_t	*send_params;
6028 				int			result;
6029 				md_mn_result_t		*resp = NULL;
6030 				int			message_size;
6031 
6032 				message_size =  sizeof (*send_params) +
6033 				    strlen(compnp->cname) + 1;
6034 				send_params = Zalloc(message_size);
6035 				send_params->addkeyname_setno = sp->setno;
6036 				(void) strcpy(&send_params->addkeyname_name[0],
6037 				    compnp->cname);
6038 				result = mdmn_send_message(sp->setno,
6039 				    MD_MN_MSG_ADDKEYNAME, MD_MSGF_DEFAULT_FLAGS,
6040 				    0, (char *)send_params, message_size, &resp,
6041 				    ep);
6042 				Free(send_params);
6043 				if (resp != NULL) {
6044 					if (resp->mmr_exitval >= 0) {
6045 						compnp->key =
6046 						    (mdkey_t)resp->mmr_exitval;
6047 					} else {
6048 						err = 1;
6049 						free_result(resp);
6050 						goto out;
6051 					}
6052 					free_result(resp);
6053 				}
6054 				if (result != 0) {
6055 					err = 1;
6056 					goto out;
6057 				}
6058 				(void) metanamelist_append(&keynlp, compnp);
6059 			} else {
6060 				if (add_key_name(sp, compnp, &keynlp,
6061 				    ep) != 0) {
6062 					err = 1;
6063 					goto out;
6064 				}
6065 			}
6066 		}
6067 
6068 		/* create the unit structure */
6069 		if ((mp = meta_sp_createunit(
6070 		    (new_name) ? new_np : sp_list->ext_namep, compnp,
6071 		    sp_list, numexts, sp_length, MD_SP_RECOVER, ep)) == NULL) {
6072 			err = 1;
6073 			goto out;
6074 		}
6075 
6076 		if (getenv(META_SP_DEBUG)) {
6077 			meta_sp_debug("meta_sp_recover_from_wm: "
6078 			    "printing newly created unit structure");
6079 			meta_sp_printunit(mp);
6080 		}
6081 
6082 		/* place in unit structure array */
6083 		un_array[i++] = mp;
6084 
6085 		/* free sp_list */
6086 		meta_sp_list_free(&sp_list);
6087 		sp_list = NULL;
6088 		numexts = 0;
6089 		sp_length = 0LL;
6090 	}
6091 
6092 	/* display configuration updates */
6093 	(void) printf(dgettext(TEXT_DOMAIN,
6094 	    "The following soft partitions were found and will be added to\n"
6095 	    "your metadevice configuration.\n"));
6096 	(void) printf("%5s %15s %18s\n",
6097 	    dgettext(TEXT_DOMAIN, "Name"),
6098 	    dgettext(TEXT_DOMAIN, "Size"),
6099 	    dgettext(TEXT_DOMAIN, "No. of Extents"));
6100 	for (i = 0; i < num_sps; i++) {
6101 		(void) printf("%5s%lu %15llu %9d\n", "d",
6102 		    MD_MIN2UNIT(MD_SID(un_array[i])),
6103 		    un_array[i]->un_length, un_array[i]->un_numexts);
6104 	}
6105 
6106 	if (!(options & MDCMD_DOIT)) {
6107 		not_recovered = 1;
6108 		goto out;
6109 	}
6110 
6111 	/* ask user for confirmation */
6112 	(void) printf(dgettext(TEXT_DOMAIN,
6113 	    "WARNING: You are about to add one or more soft partition\n"
6114 	    "metadevices to your metadevice configuration.  If there\n"
6115 	    "appears to be an error in the soft partition(s) displayed\n"
6116 	    "above, do NOT proceed with this recovery operation.\n"));
6117 	(void) printf(dgettext(TEXT_DOMAIN,
6118 	    "Are you sure you want to do this (yes/no)? "));
6119 
6120 	(void) fflush(stdout);
6121 	if ((fgets(yesno, sizeof (yesno), stdin) == NULL) ||
6122 	    (strlen(yesno) == 1))
6123 		(void) snprintf(yesno, sizeof (yesno), "%s\n",
6124 		    dgettext(TEXT_DOMAIN, "no"));
6125 	yes = dgettext(TEXT_DOMAIN, "yes");
6126 	if (strncasecmp(yesno, yes, strlen(yesno) - 1) != 0) {
6127 		not_recovered = 1;
6128 		goto out;
6129 	}
6130 
6131 	/* commit records one at a time */
6132 	for (i = 0; i < num_sps; i++) {
6133 		(void) memset(&set_params, 0, sizeof (set_params));
6134 		set_params.mnum = MD_SID(un_array[i]);
6135 		set_params.size = (un_array[i])->c.un_size;
6136 		set_params.mdp = (uintptr_t)(un_array[i]);
6137 		set_params.options =
6138 		    meta_check_devicesize(un_array[i]->un_length);
6139 		if (set_params.options == MD_CRO_64BIT) {
6140 			un_array[i]->c.un_revision |= MD_64BIT_META_DEV;
6141 		} else {
6142 			un_array[i]->c.un_revision &= ~MD_64BIT_META_DEV;
6143 		}
6144 		MD_SETDRIVERNAME(&set_params, MD_SP,
6145 		    MD_MIN2SET(set_params.mnum));
6146 
6147 		np = metamnumname(&sp, MD_SID(un_array[i]), 0, ep);
6148 
6149 		/*
6150 		 * If this is an MN set, send the MD_IOCSET ioctl to all nodes
6151 		 */
6152 		if (mn_set) {
6153 			md_mn_msg_iocset_t	send_params;
6154 			int			result;
6155 			md_mn_result_t		*resp = NULL;
6156 			int			mess_size;
6157 
6158 			/*
6159 			 * Calculate message size. md_mn_msg_iocset_t only
6160 			 * contains one extent, so increment the size to
6161 			 * include all extents
6162 			 */
6163 			mess_size = sizeof (send_params) -
6164 			    sizeof (mp_ext_t) +
6165 			    (un_array[i]->un_numexts * sizeof (mp_ext_t));
6166 
6167 			send_params.iocset_params = set_params;
6168 			(void) memcpy(&send_params.unit, un_array[i],
6169 			    sizeof (*un_array[i]) - sizeof (mp_ext_t) +
6170 			    (un_array[i]->un_numexts * sizeof (mp_ext_t)));
6171 			result = mdmn_send_message(sp->setno,
6172 			    MD_MN_MSG_IOCSET, MD_MSGF_DEFAULT_FLAGS, 0,
6173 			    (char *)&send_params, mess_size, &resp,
6174 			    ep);
6175 			if (resp != NULL) {
6176 				if (resp->mmr_exitval != 0)
6177 					err = 1;
6178 				free_result(resp);
6179 			}
6180 			if (result != 0) {
6181 				err = 1;
6182 			}
6183 		} else {
6184 			if (metaioctl(MD_IOCSET, &set_params, &set_params.mde,
6185 			    np->cname) != 0) {
6186 				err = 1;
6187 			}
6188 		}
6189 
6190 		if (err == 1) {
6191 			(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
6192 			    "%s: Error committing record to metadb.\n"),
6193 			    np->cname);
6194 			goto out;
6195 		}
6196 
6197 		/* note that we've committed a record */
6198 		if (!committed)
6199 			committed = 1;
6200 
6201 		/* update any watermarks that need it */
6202 		if (update_list != NULL) {
6203 			md_sp_t *msp;
6204 
6205 			/*
6206 			 * Check to see if we're trying to create a partition
6207 			 * on a mirror. If so we may have to enforce an
6208 			 * ownership change before writing the watermark out.
6209 			 */
6210 			if (metaismeta(compnp)) {
6211 				char *miscname;
6212 
6213 				miscname = metagetmiscname(compnp, ep);
6214 				if (miscname != NULL)
6215 					comp_is_mirror = (strcmp(miscname,
6216 					    MD_MIRROR) == 0);
6217 				else
6218 					comp_is_mirror = 0;
6219 			}
6220 			/*
6221 			 * If this is a MN set and the component is a mirror,
6222 			 * change ownership to this node in order to write the
6223 			 * watermarks
6224 			 */
6225 			if (mn_set && comp_is_mirror) {
6226 				mm = (mm_unit_t *)meta_get_unit(sp, compnp, ep);
6227 				if (mm == NULL) {
6228 					err = 1;
6229 					goto out;
6230 				} else {
6231 					err = meta_mn_change_owner(&ownpar,
6232 					    sp->setno,
6233 					    meta_getminor(compnp->dev),
6234 					    sd->sd_mn_mynode->nd_nodeid,
6235 					    MD_MN_MM_PREVENT_CHANGE |
6236 					    MD_MN_MM_SPAWN_THREAD);
6237 					if (err != 0)
6238 						goto out;
6239 				}
6240 			}
6241 
6242 			if ((msp = meta_get_sp(sp, np, ep)) == NULL) {
6243 				err = 1;
6244 				(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
6245 				    "%s: Error updating extent headers.\n"),
6246 				    np->cname);
6247 				goto out;
6248 			}
6249 			if (meta_sp_update_wm(sp, msp, update_list, ep) < 0) {
6250 				err = 1;
6251 				(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
6252 				    "%s: Error updating extent headers "
6253 				    "on disk.\n"), np->cname);
6254 				goto out;
6255 			}
6256 		}
6257 		/*
6258 		 * If we have changed ownership earlier and prevented any
6259 		 * ownership changes, we can now allow ownership changes
6260 		 * again.
6261 		 */
6262 		if (ownpar) {
6263 			(void) meta_mn_change_owner(&ownpar, sp->setno,
6264 			    ownpar->d.mnum,
6265 			    ownpar->d.owner,
6266 			    MD_MN_MM_ALLOW_CHANGE | MD_MN_MM_SPAWN_THREAD);
6267 		}
6268 	}
6269 
6270 	/* update status of all soft partitions to OK */
6271 	minors = Zalloc(num_sps * sizeof (minor_t));
6272 	for (i = 0; i < num_sps; i++)
6273 		minors[i] = MD_SID(un_array[i]);
6274 
6275 	err = update_sp_status(sp, minors, num_sps, MD_SP_OK, mn_set, ep);
6276 	if (err != 0)
6277 		goto out;
6278 
6279 	if (options & MDCMD_PRINT)
6280 		(void) printf(dgettext(TEXT_DOMAIN, "%s: "
6281 		    "Soft Partitions recovered from device.\n"),
6282 		    compnp->cname);
6283 out:
6284 	/* free memory */
6285 	if (extlist != NULL)
6286 		meta_sp_list_free(&extlist);
6287 	if (sp_list != NULL)
6288 		meta_sp_list_free(&sp_list);
6289 	if (update_list != NULL)
6290 		meta_sp_list_free(&update_list);
6291 	if (un_array != NULL)	{
6292 		for (i = 0; i < num_sps; i++)
6293 			Free(un_array[i]);
6294 		Free(un_array);
6295 	}
6296 	if (minors != NULL)
6297 		Free(minors);
6298 	if (ownpar != NULL)
6299 		Free(ownpar);
6300 	(void) fflush(stdout);
6301 
6302 	if ((keynlp != NULL) && (committed != 1)) {
6303 		/*
6304 		 * if we haven't committed any softparts, either because of an
6305 		 * error or because the user decided not to proceed, delete
6306 		 * namelist key for the component
6307 		 */
6308 		if (mn_set) {
6309 			mdnamelist_t	*p;
6310 
6311 			for (p = keynlp; (p != NULL); p = p->next) {
6312 				mdname_t		*np = p->namep;
6313 				md_mn_msg_delkeyname_t	send_params;
6314 				md_mn_result_t		*resp = NULL;
6315 
6316 				send_params.delkeyname_dev = np->dev;
6317 				send_params.delkeyname_setno = sp->setno;
6318 				send_params.delkeyname_key = np->key;
6319 				(void) mdmn_send_message(sp->setno,
6320 				    MD_MN_MSG_DELKEYNAME, MD_MSGF_DEFAULT_FLAGS,
6321 				    0, (char *)&send_params,
6322 				    sizeof (send_params),
6323 				    &resp, ep);
6324 				if (resp != NULL) {
6325 					free_result(resp);
6326 				}
6327 			}
6328 		} else {
6329 			(void) del_key_names(sp, keynlp, NULL);
6330 		}
6331 	}
6332 
6333 	metafreenamelist(keynlp);
6334 
6335 	if (err)
6336 		return (mdmderror(ep, MDE_RECOVER_FAILED, 0, compnp->cname));
6337 
6338 	if (not_recovered)
6339 		if (options & MDCMD_PRINT)
6340 			(void) printf(dgettext(TEXT_DOMAIN, "%s: "
6341 			    "Soft Partitions NOT recovered from device.\n"),
6342 			    compnp->cname);
6343 	return (0);
6344 }
6345 
6346 /*
6347  * FUNCTION:	meta_sp_recover_from_unit()
6348  * INPUT:	sp	- name of set we are recovering in
6349  *		compnp	- name of component we are recovering from
6350  *		options	- metarecover options
6351  * OUTPUT:	ep	- return error pointer
6352  * RETURNS:	int	- 0 - success, -1 - error
6353  * PURPOSE:	update watermarks to match metadb records.  begin by getting
6354  *		a namelist representing all soft partitions on the specified
6355  *		component.  then, build an extlist representing the soft
6356  *		partitions, filling in the freespace extents.  notify user
6357  *		of changes, place all soft partitions into the "recovering"
6358  *		state and update the watermarks.  finally, return all soft
6359  *		partitions to the "OK" state.
6360  */
6361 static int
6362 meta_sp_recover_from_unit(
6363 	mdsetname_t	*sp,
6364 	mdname_t	*compnp,
6365 	mdcmdopts_t	options,
6366 	md_error_t	*ep
6367 )
6368 {
6369 	mdnamelist_t	*spnlp = NULL;
6370 	mdnamelist_t	*nlp = NULL;
6371 	sp_ext_node_t	*ext = NULL;
6372 	sp_ext_node_t	*extlist = NULL;
6373 	int		count;
6374 	char		yesno[255];
6375 	char		*yes;
6376 	int		rval = 0;
6377 	minor_t		*minors = NULL;
6378 	int		i;
6379 	md_sp_t		*msp;
6380 	md_set_desc	*sd;
6381 	bool_t		mn_set = 0;
6382 	daddr_t		start_block;
6383 
6384 	count = meta_sp_get_by_component(sp, compnp, &spnlp, 0, ep);
6385 	if (count <= 0)
6386 		return (-1);
6387 
6388 	/* set flag if dealing with a MN set */
6389 	if (!metaislocalset(sp)) {
6390 		if ((sd = metaget_setdesc(sp, ep)) == NULL) {
6391 			return (-1);
6392 		}
6393 		if (MD_MNSET_DESC(sd))
6394 			mn_set = 1;
6395 	}
6396 	/*
6397 	 * Save the XDR unit structure for one of the soft partitions;
6398 	 * we'll use this later to provide metadevice context to
6399 	 * update the watermarks so the device can be resolved by
6400 	 * devid instead of dev_t.
6401 	 */
6402 	if ((msp = meta_get_sp(sp, spnlp->namep, ep)) == NULL) {
6403 		metafreenamelist(spnlp);
6404 		return (-1);
6405 	}
6406 
6407 	if ((start_block = meta_sp_get_start(sp, compnp, ep)) ==
6408 	    MD_DISKADDR_ERROR) {
6409 		return (-1);
6410 	}
6411 
6412 	meta_sp_list_insert(NULL, NULL, &extlist, 0ULL, start_block,
6413 	    EXTTYP_RESERVED, 0, 0, meta_sp_cmp_by_offset);
6414 	meta_sp_list_insert(NULL, NULL, &extlist,
6415 	    metagetsize(compnp, ep) - MD_SP_WMSIZE, MD_SP_WMSIZE,
6416 	    EXTTYP_END, 0, EXTFLG_UPDATE, meta_sp_cmp_by_offset);
6417 
6418 	if (meta_sp_extlist_from_namelist(sp, spnlp, &extlist, ep) == -1) {
6419 		metafreenamelist(spnlp);
6420 		return (-1);
6421 	}
6422 
6423 	assert(extlist != NULL);
6424 	if ((options & MDCMD_VERBOSE) != 0) {
6425 		(void) printf(dgettext(TEXT_DOMAIN,
6426 		    "Updating extent headers on device %s from metadb.\n\n"),
6427 		    compnp->cname);
6428 		(void) printf(dgettext(TEXT_DOMAIN,
6429 		    "The following extent headers will be written:\n"));
6430 		meta_sp_display_exthdr();
6431 	}
6432 
6433 	meta_sp_list_freefill(&extlist, metagetsize(compnp, ep));
6434 
6435 	for (ext = extlist; ext != NULL; ext = ext->ext_next) {
6436 
6437 		/* mark every node for updating except the reserved space */
6438 		if (ext->ext_type != EXTTYP_RESERVED) {
6439 			ext->ext_flags |= EXTFLG_UPDATE;
6440 
6441 			/* print extent information */
6442 			if ((options & MDCMD_VERBOSE) != 0)
6443 				meta_sp_display_ext(ext);
6444 		}
6445 	}
6446 
6447 	/* request verification and then update all watermarks */
6448 	if ((options & MDCMD_DOIT) != 0) {
6449 
6450 		(void) printf(dgettext(TEXT_DOMAIN,
6451 		    "\nWARNING: You are about to overwrite portions of %s\n"
6452 		    "with soft partition metadata. The extent headers will be\n"
6453 		    "written to match the existing metadb configuration.  If\n"
6454 		    "the device was not previously setup with this\n"
6455 		    "configuration, data loss may result.\n\n"),
6456 		    compnp->cname);
6457 		(void) printf(dgettext(TEXT_DOMAIN,
6458 		    "Are you sure you want to do this (yes/no)? "));
6459 
6460 		(void) fflush(stdout);
6461 		if ((fgets(yesno, sizeof (yesno), stdin) == NULL) ||
6462 		    (strlen(yesno) == 1))
6463 			(void) snprintf(yesno, sizeof (yesno),
6464 			    "%s\n", dgettext(TEXT_DOMAIN, "no"));
6465 		yes = dgettext(TEXT_DOMAIN, "yes");
6466 		if (strncasecmp(yesno, yes, strlen(yesno) - 1) == 0) {
6467 			/* place soft partitions into recovering state */
6468 			minors = Zalloc(count * sizeof (minor_t));
6469 			for (nlp = spnlp, i = 0;
6470 			    nlp != NULL && i < count;
6471 			    nlp = nlp->next, i++) {
6472 				assert(nlp->namep != NULL);
6473 				minors[i] = meta_getminor(nlp->namep->dev);
6474 			}
6475 			if (update_sp_status(sp, minors, count,
6476 			    MD_SP_RECOVER, mn_set, ep) != 0) {
6477 				rval = -1;
6478 				goto out;
6479 			}
6480 
6481 			/* update the watermarks */
6482 			if (meta_sp_update_wm(sp, msp, extlist, ep) < 0) {
6483 				rval = -1;
6484 				goto out;
6485 			}
6486 
6487 			if (options & MDCMD_PRINT) {
6488 				(void) printf(dgettext(TEXT_DOMAIN, "%s: "
6489 				    "Soft Partitions recovered from metadb\n"),
6490 				    compnp->cname);
6491 			}
6492 
6493 			/* return soft partitions to the OK state */
6494 			if (update_sp_status(sp, minors, count,
6495 			    MD_SP_OK, mn_set, ep) != 0) {
6496 				rval = -1;
6497 				goto out;
6498 			}
6499 
6500 			rval = 0;
6501 			goto out;
6502 		}
6503 	}
6504 
6505 	if (options & MDCMD_PRINT) {
6506 		(void) printf(dgettext(TEXT_DOMAIN,
6507 		    "%s: Soft Partitions NOT recovered from metadb\n"),
6508 		    compnp->cname);
6509 	}
6510 
6511 out:
6512 	if (minors != NULL)
6513 		Free(minors);
6514 	metafreenamelist(spnlp);
6515 	meta_sp_list_free(&extlist);
6516 	(void) fflush(stdout);
6517 	return (rval);
6518 }
6519 
6520 
6521 /*
6522  * FUNCTION:	meta_sp_update_abr()
6523  * INPUT:	sp	- name of set we are recovering in
6524  * OUTPUT:	ep	- return error pointer
6525  * RETURNS:	int	- 0 - success, -1 - error
6526  * PURPOSE:	update the ABR state for all soft partitions in the set. This
6527  *		is called when joining a set. It sends a message to the master
6528  *		node for each soft partition to get the value of tstate and
6529  *		then sets ABR ,if required, by opening the sp, setting ABR
6530  *		and then closing the sp. This approach is taken rather that
6531  *		just issuing the MD_MN_SET_CAP ioctl, in order to deal with
6532  *		the case when we have another node simultaneously unsetting ABR.
6533  */
6534 int
6535 meta_sp_update_abr(
6536 	mdsetname_t	*sp,
6537 	md_error_t	*ep
6538 )
6539 {
6540 	mdnamelist_t	*devnlp = NULL;
6541 	mdnamelist_t	*p;
6542 	mdname_t	*devnp = NULL;
6543 	md_unit_t	*un;
6544 	char		fname[MAXPATHLEN];
6545 	int		mnum, fd;
6546 	volcap_t	vc;
6547 	uint_t		tstate;
6548 
6549 
6550 	if (meta_get_sp_names(sp, &devnlp, 0, ep) < 0) {
6551 		return (-1);
6552 	}
6553 
6554 	/* Exit if no soft partitions in this set */
6555 	if (devnlp == NULL)
6556 		return (0);
6557 
6558 	/* For each soft partition */
6559 	for (p = devnlp; (p != NULL); p = p->next) {
6560 		devnp = p->namep;
6561 
6562 		/* check if this is a top level metadevice */
6563 		if ((un = meta_get_mdunit(sp, devnp, ep)) == NULL)
6564 			goto out;
6565 		if (MD_HAS_PARENT(MD_PARENT(un))) {
6566 			Free(un);
6567 			continue;
6568 		}
6569 		Free(un);
6570 
6571 		/* Get tstate from Master */
6572 		if (meta_mn_send_get_tstate(devnp->dev, &tstate, ep) != 0) {
6573 			mdname_t	*np;
6574 			np = metamnumname(&sp, meta_getminor(devnp->dev), 0,
6575 			    ep);
6576 			if (np) {
6577 				md_perror(dgettext(TEXT_DOMAIN,
6578 				    "Unable to get tstate for %s"), np->cname);
6579 			}
6580 			continue;
6581 		}
6582 		/* If not set on the master, nothing to do */
6583 		if (!(tstate & MD_ABR_CAP))
6584 			continue;
6585 
6586 		mnum = meta_getminor(devnp->dev);
6587 		(void) snprintf(fname, MAXPATHLEN, "/dev/md/%s/rdsk/d%u",
6588 		    sp->setname, (unsigned)MD_MIN2UNIT(mnum));
6589 		if ((fd = open(fname, O_RDWR, 0)) < 0) {
6590 			md_perror(dgettext(TEXT_DOMAIN,
6591 			    "Could not open device %s"), fname);
6592 			continue;
6593 		}
6594 
6595 		/* Set ABR state */
6596 		vc.vc_info = 0;
6597 		vc.vc_set = 0;
6598 		if (ioctl(fd, DKIOCGETVOLCAP, &vc) < 0) {
6599 			(void) close(fd);
6600 			continue;
6601 		}
6602 
6603 		vc.vc_set = DKV_ABR_CAP;
6604 		if (ioctl(fd, DKIOCSETVOLCAP, &vc) < 0) {
6605 			(void) close(fd);
6606 			goto out;
6607 		}
6608 
6609 		(void) close(fd);
6610 	}
6611 	metafreenamelist(devnlp);
6612 	return (0);
6613 out:
6614 	metafreenamelist(devnlp);
6615 	return (-1);
6616 }
6617 
6618 /*
6619  * FUNCTION:	meta_mn_sp_update_abr()
6620  * INPUT:	arg	- Given set.
6621  * PURPOSE:	update the ABR state for all soft partitions in the set by
6622  *		forking a process to call meta_sp_update_abr()
6623  *		This function is only called via rpc.metad when adding a node
6624  *		to a set, ie this node is beong joined to the set by another
6625  *		node.
6626  */
6627 void *
6628 meta_mn_sp_update_abr(void *arg)
6629 {
6630 	set_t		setno = *((set_t *)arg);
6631 	mdsetname_t	*sp;
6632 	md_error_t	mde = mdnullerror;
6633 	int		fval;
6634 
6635 	/* should have a set */
6636 	assert(setno != NULL);
6637 
6638 	if ((sp = metasetnosetname(setno, &mde)) == NULL) {
6639 		mde_perror(&mde, "");
6640 		return (NULL);
6641 	}
6642 
6643 	if (!(meta_is_mn_set(sp, &mde))) {
6644 		mde_perror(&mde, "");
6645 		return (NULL);
6646 	}
6647 
6648 	/* fork a process */
6649 	if ((fval = md_daemonize(sp, &mde)) != 0) {
6650 		/*
6651 		 * md_daemonize will fork off a process.  The is the
6652 		 * parent or error.
6653 		 */
6654 		if (fval > 0) {
6655 			return (NULL);
6656 		}
6657 		mde_perror(&mde, "");
6658 		return (NULL);
6659 	}
6660 	/*
6661 	 * Child process should never return back to rpc.metad, but
6662 	 * should exit.
6663 	 * Flush all internally cached data inherited from parent process
6664 	 * since cached data will be cleared when parent process RPC request
6665 	 * has completed (which is possibly before this child process
6666 	 * can complete).
6667 	 * Child process can retrieve and cache its own copy of data from
6668 	 * rpc.metad that won't be changed by the parent process.
6669 	 *
6670 	 * Reset md_in_daemon since this child will be a client of rpc.metad
6671 	 * not part of the rpc.metad daemon itself.
6672 	 * md_in_daemon is used by rpc.metad so that libmeta can tell if
6673 	 * this thread is rpc.metad or any other thread.  (If this thread
6674 	 * was rpc.metad it could use some short circuit code to get data
6675 	 * directly from rpc.metad instead of doing an RPC call to rpc.metad).
6676 	 */
6677 	md_in_daemon = 0;
6678 	metaflushsetname(sp);
6679 	sr_cache_flush_setno(setno);
6680 	if ((sp = metasetnosetname(setno, &mde)) == NULL) {
6681 		mde_perror(&mde, "");
6682 		md_exit(sp, 1);
6683 	}
6684 
6685 
6686 	/*
6687 	 * Closing stdin/out/err here.
6688 	 */
6689 	(void) close(0);
6690 	(void) close(1);
6691 	(void) close(2);
6692 	assert(fval == 0);
6693 
6694 	(void) meta_sp_update_abr(sp, &mde);
6695 
6696 	md_exit(sp, 0);
6697 	/*NOTREACHED*/
6698 	return (NULL);
6699 }
6700 
6701 int
6702 meta_sp_check_component(
6703 	mdsetname_t	*sp,
6704 	mdname_t	*np,
6705 	md_error_t	*ep
6706 )
6707 {
6708 	md_sp_t	*msp;
6709 	minor_t	mnum = 0;
6710 	md_dev64_t	dev = 0;
6711 	mdnm_params_t	nm;
6712 	md_getdevs_params_t	mgd;
6713 	side_t	sideno;
6714 	char	*miscname;
6715 	md_dev64_t	*mydev = NULL;
6716 	char	*pname = NULL, *t;
6717 	char	*ctd_name = NULL;
6718 	char	*devname = NULL;
6719 	int	len;
6720 	int	rval = -1;
6721 
6722 	(void) memset(&nm, '\0', sizeof (nm));
6723 	if ((msp = meta_get_sp_common(sp, np, 0, ep)) == NULL)
6724 		return (-1);
6725 
6726 	if ((miscname = metagetmiscname(np, ep)) == NULL)
6727 		return (-1);
6728 
6729 	sideno = getmyside(sp, ep);
6730 
6731 	meta_sp_debug("meta_sp_check_component: %s is on %s key: %d"
6732 	    " dev: %llu\n",
6733 	    np->cname, msp->compnamep->cname, msp->compnamep->key,
6734 	    msp->compnamep->dev);
6735 
6736 	/*
6737 	 * Now get the data from the unit structure. The compnamep stuff
6738 	 * contains the data from the namespace and we need the un_dev
6739 	 * from the unit structure.
6740 	 */
6741 	(void) memset(&mgd, '\0', sizeof (mgd));
6742 	MD_SETDRIVERNAME(&mgd, miscname, sp->setno);
6743 	mgd.cnt = 1;		    /* sp's only have one subdevice */
6744 	mgd.mnum = meta_getminor(np->dev);
6745 
6746 	mydev = Zalloc(sizeof (*mydev));
6747 	mgd.devs = (uintptr_t)mydev;
6748 
6749 	if (metaioctl(MD_IOCGET_DEVS, &mgd, &mgd.mde, np->cname) != 0) {
6750 		meta_sp_debug("meta_sp_check_component: ioctl failed\n");
6751 		(void) mdstealerror(ep, &mgd.mde);
6752 		rval = 0;
6753 		goto out;
6754 	} else if (mgd.cnt <= 0) {
6755 		assert(mgd.cnt >= 0);
6756 		rval = 0;
6757 		goto out;
6758 	}
6759 
6760 	/* Get the devname from the name space. */
6761 	if ((devname = meta_getnmentbykey(sp->setno, sideno,
6762 	    msp->compnamep->key, NULL, &mnum, &dev, ep)) == NULL) {
6763 		meta_sp_debug("meta_sp_check_component: key %d not"
6764 		    "found\n", msp->compnamep->key);
6765 		goto out;
6766 	}
6767 
6768 	meta_sp_debug("dev %s from component: (%lu, %lu)\n",
6769 	    devname,
6770 	    meta_getmajor(*mydev),
6771 	    meta_getminor(*mydev));
6772 	meta_sp_debug("minor from the namespace: %lu\n", mnum);
6773 
6774 	if (mnum != meta_getminor(*mydev)) {
6775 		/*
6776 		 * The minor numbers are different. Update the namespace
6777 		 * with the information from the component.
6778 		 */
6779 
6780 		t = strrchr(devname, '/');
6781 		t++;
6782 		ctd_name = Strdup(t);
6783 
6784 		meta_sp_debug("meta_sp_check_component: ctd_name: %s\n",
6785 		    ctd_name);
6786 
6787 		len = strlen(devname);
6788 		t = strrchr(devname, '/');
6789 		t++;
6790 		pname = Zalloc((len - strlen(t)) + 1);
6791 		(void) strncpy(pname, devname, (len - strlen(t)));
6792 		meta_sp_debug("pathname: %s\n", pname);
6793 
6794 		meta_sp_debug("updating the minor number to %lu\n", nm.mnum);
6795 
6796 		if (meta_update_namespace(sp->setno, sideno,
6797 		    ctd_name, *mydev, msp->compnamep->key, pname,
6798 		    ep) != 0) {
6799 			goto out;
6800 		}
6801 	}
6802 out:
6803 	if (pname != NULL)
6804 		Free(pname);
6805 	if (ctd_name != NULL)
6806 		Free(ctd_name);
6807 	if (devname != NULL)
6808 		Free(devname);
6809 	if (mydev != NULL)
6810 		Free(mydev);
6811 	return (rval);
6812 }
6813