xref: /titanic_41/usr/src/lib/lvm/libmeta/common/meta_sp.c (revision 9ec394dbf343c1f23c6e13c39df427f238e5a369)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 /*
29  * Just in case we're not in a build environment, make sure that
30  * TEXT_DOMAIN gets set to something.
31  */
32 #if !defined(TEXT_DOMAIN)
33 #define	TEXT_DOMAIN "SYS_TEST"
34 #endif
35 
36 /*
37  * soft partition operations
38  *
39  * Soft Partitions provide a virtual disk mechanism which is used to
40  * divide a large volume into many small pieces, each appearing as a
41  * separate device.  A soft partition consists of a series of extents,
42  * each having an offset and a length.  The extents are logically
43  * contiguous, so where the first extent leaves off the second extent
44  * picks up.  Which extent a given "virtual offset" belongs to is
45  * dependent on the size of all the previous extents in the soft
46  * partition.
47  *
48  * Soft partitions are represented in memory by an extent node
49  * (sp_ext_node_t) which contains all of the information necessary to
50  * create a unit structure and update the on-disk format, called
51  * "watermarks".  These extent nodes are typically kept in a doubly
52  * linked list and are manipulated by list manipulation routines.  A
53  * list of extents may represent all of the soft partitions on a volume,
54  * a single soft partition, or perhaps just a set of extents that need
55  * to be updated.  Extent lists may be sorted by offset or by name/seq#,
56  * depending on which compare function is used.  Most of the routines
57  * require the list be sorted by offset to work, and that's the typical
58  * configuration.
59  *
60  * In order to do an allocation, knowledge of all soft partitions on the
61  * volume is required.  Then free space is determined from the space
62  * that is not allocated, and new allocations can be made from the free
63  * space.  Once the new allocations are made, a unit structure is created
64  * and the watermarks are updated.  The status is then changed to "okay"
65  * on the unit structure to commit the transaction.  If updating the
66  * watermarks fails, the unit structure is in an intermediate state and
67  * the driver will not allow access to the device.
68  *
69  * A typical sequence of events is:
70  *     1. Fetch the list of names for all soft partitions on a volume
71  *         meta_sp_get_by_component()
72  *     2. Construct an extent list from the name list
73  *         meta_sp_extlist_from_namelist()
74  *     3. Fill the gaps in the extent list with free extents
75  *         meta_sp_list_freefill()
76  *     4. Allocate from the free extents
77  *         meta_sp_alloc_by_len()
78  *         meta_sp_alloc_by_list()
79  *     5. Create the unit structure from the extent list
80  *         meta_sp_createunit()
81  *         meta_sp_updateunit()
82  *     6. Write out the watermarks
83  *         meta_sp_update_wm()
84  *     7. Set the status to "Okay"
85  *         meta_sp_setstatus()
86  *
87  */
88 
89 #include <stdio.h>
90 #include <meta.h>
91 #include "meta_repartition.h"
92 #include <sys/lvm/md_sp.h>
93 #include <sys/lvm/md_crc.h>
94 #include <strings.h>
95 #include <sys/lvm/md_mirror.h>
96 #include <sys/bitmap.h>
97 
98 extern int	md_in_daemon;
99 
100 typedef struct sp_ext_node {
101 	struct sp_ext_node	*ext_next;	/* next element */
102 	struct sp_ext_node	*ext_prev;	/* previous element */
103 	sp_ext_type_t		ext_type;	/* type of extent */
104 	sp_ext_offset_t		ext_offset;	/* starting offset */
105 	sp_ext_length_t		ext_length;	/* length of this node */
106 	uint_t			ext_flags;	/* extent flags */
107 	uint32_t		ext_seq;	/* watermark seq no */
108 	mdname_t		*ext_namep;	/* name pointer */
109 	mdsetname_t		*ext_setp;	/* set pointer */
110 } sp_ext_node_t;
111 
112 /* extent flags */
113 #define	EXTFLG_UPDATE	(1)
114 
115 /* Extent node compare function for list sorting */
116 typedef int (*ext_cmpfunc_t)(sp_ext_node_t *, sp_ext_node_t *);
117 
118 
119 /* Function Prototypes */
120 
121 /* Debugging Functions */
122 static void meta_sp_debug(char *format, ...);
123 static void meta_sp_printunit(mp_unit_t *mp);
124 
125 /* Misc Support Functions */
126 int meta_sp_parsesize(char *s, sp_ext_length_t *szp);
127 static int meta_sp_parsesizestring(char *s, sp_ext_length_t *szp);
128 static int meta_sp_setgeom(mdname_t *np, mdname_t *compnp, mp_unit_t *mp,
129 	md_error_t *ep);
130 static int meta_sp_get_by_component(mdsetname_t *sp, mdname_t *compnp,
131     mdnamelist_t **nlpp, int force, md_error_t *ep);
132 static sp_ext_length_t meta_sp_get_default_alignment(mdsetname_t *sp,
133     mdname_t *compnp, md_error_t *ep);
134 
135 /* Extent List Manipulation Functions */
136 static int meta_sp_cmp_by_nameseq(sp_ext_node_t *e1, sp_ext_node_t *e2);
137 static int meta_sp_cmp_by_offset(sp_ext_node_t *e1, sp_ext_node_t *e2);
138 static void meta_sp_list_insert(mdsetname_t *sp, mdname_t *np,
139     sp_ext_node_t **head, sp_ext_offset_t offset, sp_ext_length_t length,
140     sp_ext_type_t type, uint_t seq, uint_t flags, ext_cmpfunc_t compare);
141 static void meta_sp_list_free(sp_ext_node_t **head);
142 static void meta_sp_list_remove(sp_ext_node_t **head, sp_ext_node_t *ext);
143 static sp_ext_length_t meta_sp_list_size(sp_ext_node_t *head,
144     sp_ext_type_t exttype, int exclude_wm);
145 static sp_ext_node_t *meta_sp_list_find(sp_ext_node_t *head,
146     sp_ext_offset_t offset);
147 static void meta_sp_list_freefill(sp_ext_node_t **extlist,
148     sp_ext_length_t size);
149 static void meta_sp_list_dump(sp_ext_node_t *head);
150 static int meta_sp_list_overlaps(sp_ext_node_t *head);
151 
152 /* Extent List Query Functions */
153 static boolean_t meta_sp_enough_space(int desired_number_of_sps,
154 	blkcnt_t desired_sp_size, sp_ext_node_t **extent_listpp,
155 	sp_ext_length_t alignment);
156 static boolean_t meta_sp_get_extent_list(mdsetname_t *mdsetnamep,
157 	mdname_t *device_mdnamep, sp_ext_node_t **extent_listpp,
158 	md_error_t *ep);
159 static boolean_t meta_sp_get_extent_list_for_drive(mdsetname_t *mdsetnamep,
160 	mddrivename_t *mddrivenamep, sp_ext_node_t **extent_listpp);
161 
162 
163 /* Extent Allocation Functions */
164 static void meta_sp_alloc_by_ext(mdsetname_t *sp, mdname_t *np,
165     sp_ext_node_t **extlist, sp_ext_node_t *free_ext,
166     sp_ext_offset_t alloc_offset, sp_ext_length_t alloc_length, uint_t seq);
167 static int meta_sp_alloc_by_len(mdsetname_t *sp, mdname_t *np,
168     sp_ext_node_t **extlist, sp_ext_length_t *lp,
169     sp_ext_offset_t last_off, sp_ext_length_t alignment);
170 static int meta_sp_alloc_by_list(mdsetname_t *sp, mdname_t *np,
171     sp_ext_node_t **extlist, sp_ext_node_t *oblist);
172 
173 /* Extent List Population Functions */
174 static int meta_sp_extlist_from_namelist(mdsetname_t *sp, mdnamelist_t *spnlp,
175     sp_ext_node_t **extlist, md_error_t *ep);
176 static int meta_sp_extlist_from_wm(mdsetname_t *sp, mdname_t *compnp,
177     sp_ext_node_t **extlist, ext_cmpfunc_t compare, md_error_t *ep);
178 
179 /* Print (metastat) Functions */
180 static int meta_sp_short_print(md_sp_t *msp, char *fname, FILE *fp,
181     mdprtopts_t options, md_error_t *ep);
182 static char *meta_sp_status_to_name(xsp_status_t xsp_status, uint_t tstate);
183 static int meta_sp_report(mdsetname_t *sp, md_sp_t *msp, mdnamelist_t **nlpp,
184     char *fname, FILE *fp, mdprtopts_t options, md_error_t *ep);
185 
186 /* Watermark Manipulation Functions */
187 static int meta_sp_update_wm(mdsetname_t *sp, md_sp_t *msp,
188     sp_ext_node_t *extlist, md_error_t *ep);
189 static int meta_sp_clear_wm(mdsetname_t *sp, md_sp_t *msp, md_error_t *ep);
190 static int meta_sp_read_wm(mdsetname_t *sp, mdname_t *compnp,
191     mp_watermark_t *wm, sp_ext_offset_t offset,  md_error_t *ep);
192 static diskaddr_t meta_sp_get_start(mdsetname_t *sp, mdname_t *compnp,
193     md_error_t *ep);
194 
195 /* Unit Structure Manipulation Functions */
196 static void meta_sp_fillextarray(mp_unit_t *mp, sp_ext_node_t *extlist);
197 static mp_unit_t *meta_sp_createunit(mdname_t *np, mdname_t *compnp,
198     sp_ext_node_t *extlist, int numexts, sp_ext_length_t len,
199     sp_status_t status, md_error_t *ep);
200 static mp_unit_t *meta_sp_updateunit(mdname_t *np,  mp_unit_t *old_un,
201     sp_ext_node_t *extlist, sp_ext_length_t grow_len, int numexts,
202     md_error_t *ep);
203 static int meta_create_sp(mdsetname_t *sp, md_sp_t *msp, sp_ext_node_t *oblist,
204     mdcmdopts_t options, sp_ext_length_t alignment, md_error_t *ep);
205 static int meta_check_sp(mdsetname_t *sp, md_sp_t *msp, mdcmdopts_t options,
206     int *repart_options, md_error_t *ep);
207 
208 /* Reset (metaclear) Functions */
209 static int meta_sp_reset_common(mdsetname_t *sp, mdname_t *np, md_sp_t *msp,
210     md_sp_reset_t reset_params, mdcmdopts_t options, md_error_t *ep);
211 
212 /* Recovery (metarecover) Functions */
213 static void meta_sp_display_exthdr(void);
214 static void meta_sp_display_ext(sp_ext_node_t *ext);
215 static int meta_sp_checkseq(sp_ext_node_t *extlist);
216 static int meta_sp_resolve_name_conflict(mdsetname_t *, mdname_t *,
217     mdname_t **, md_error_t *);
218 static int meta_sp_validate_wm(mdsetname_t *sp, mdname_t *np,
219     mdcmdopts_t options, md_error_t *ep);
220 static int meta_sp_validate_unit(mdsetname_t *sp, mdname_t *compnp,
221     mdcmdopts_t options, md_error_t *ep);
222 static int meta_sp_validate_wm_and_unit(mdsetname_t *sp, mdname_t *np,
223     mdcmdopts_t options, md_error_t *ep);
224 static int meta_sp_validate_exts(mdname_t *np, sp_ext_node_t *wmext,
225     sp_ext_node_t *unitext, md_error_t *ep);
226 static int meta_sp_recover_from_wm(mdsetname_t *sp, mdname_t *compnp,
227     mdcmdopts_t options, md_error_t *ep);
228 static int meta_sp_recover_from_unit(mdsetname_t *sp, mdname_t *np,
229     mdcmdopts_t options, md_error_t *ep);
230 
231 /*
232  * Private Constants
233  */
234 
235 static const int FORCE_RELOAD_CACHE = 1;
236 static const uint_t NO_FLAGS = 0;
237 static const sp_ext_offset_t NO_OFFSET = 0ULL;
238 static const uint_t NO_SEQUENCE_NUMBER = 0;
239 static const int ONE_SOFT_PARTITION = 1;
240 
241 static unsigned long sp_parent_printed[BT_BITOUL(MD_MAXUNITS)];
242 
243 #define	TEST_SOFT_PARTITION_NAMEP NULL
244 #define	TEST_SETNAMEP NULL
245 
246 #define	EXCLUDE_WM	(1)
247 #define	INCLUDE_WM	(0)
248 
249 #define	SP_UNALIGNED	(0LL)
250 
251 /*
252  * **************************************************************************
253  *                          Debugging Functions                             *
254  * **************************************************************************
255  */
256 
257 /*PRINTFLIKE1*/
258 static void
259 meta_sp_debug(char *format, ...)
260 {
261 	static int debug;
262 	static int debug_set = 0;
263 	va_list ap;
264 
265 	if (!debug_set) {
266 		debug = getenv(META_SP_DEBUG) ? 1 : 0;
267 		debug_set = 1;
268 	}
269 
270 	if (debug) {
271 		va_start(ap, format);
272 		(void) vfprintf(stderr, format, ap);
273 		va_end(ap);
274 	}
275 }
276 
277 static void
278 meta_sp_printunit(mp_unit_t *mp)
279 {
280 	int i;
281 
282 	if (mp == NULL)
283 		return;
284 
285 	/* print the common fields we know about */
286 	(void) fprintf(stderr, "\tmp->c.un_type: %d\n", mp->c.un_type);
287 	(void) fprintf(stderr, "\tmp->c.un_size: %u\n", mp->c.un_size);
288 	(void) fprintf(stderr, "\tmp->c.un_self_id: %lu\n", MD_SID(mp));
289 
290 	/* sp-specific fields */
291 	(void) fprintf(stderr, "\tmp->un_status: %u\n", mp->un_status);
292 	(void) fprintf(stderr, "\tmp->un_numexts: %u\n", mp->un_numexts);
293 	(void) fprintf(stderr, "\tmp->un_length: %llu\n", mp->un_length);
294 	(void) fprintf(stderr, "\tmp->un_dev(32): 0x%llx\n", mp->un_dev);
295 	(void) fprintf(stderr, "\tmp->un_dev(64): 0x%llx\n", mp->un_dev);
296 	(void) fprintf(stderr, "\tmp->un_key: %d\n", mp->un_key);
297 
298 	/* print extent information */
299 	(void) fprintf(stderr, "\tExt#\tvoff\t\tpoff\t\tLen\n");
300 	for (i = 0; i < mp->un_numexts; i++) {
301 		(void) fprintf(stderr, "\t%d\t%llu\t\t%llu\t\t%llu\n", i,
302 		    mp->un_ext[i].un_voff, mp->un_ext[i].un_poff,
303 		    mp->un_ext[i].un_len);
304 	}
305 }
306 
307 /*
308  * FUNCTION:    meta_sp_parsesize()
309  * INPUT:       s       - the string to parse
310  * OUTPUT:      *szp    - disk block count (0 for "all")
311  * RETURNS:     -1 for error, 0 for success
312  * PURPOSE:     parses the command line parameter that specifies the
313  *              requested size of a soft partition.  The input string
314  *              is either the literal "all" or a numeric value
315  *              followed by a single character, b for disk blocks, k
316  *              for kilobytes, m for megabytes, g for gigabytes, or t
317  *              for terabytes.  p for petabytes and e for exabytes
318  *              have been added as undocumented features for future
319  *              expansion.  For example, 100m is 100 megabytes, while
320  *              50g is 50 gigabytes.  All values are rounded up to the
321  *              nearest block size.
322  */
323 int
324 meta_sp_parsesize(char *s, sp_ext_length_t *szp)
325 {
326 	if (s == NULL || szp == NULL) {
327 		return (-1);
328 	}
329 
330 	/* Check for literal "all" */
331 	if (strcasecmp(s, "all") == 0) {
332 		*szp = 0;
333 		return (0);
334 	}
335 
336 	return (meta_sp_parsesizestring(s, szp));
337 }
338 
339 /*
340  * FUNCTION:	meta_sp_parsesizestring()
341  * INPUT:	s	- the string to parse
342  * OUTPUT:	*szp	- disk block count
343  * RETURNS:	-1 for error, 0 for success
344  * PURPOSE:	parses a string that specifies size. The input string is a
345  *		numeric value followed by a single character, b for disk blocks,
346  *		k for kilobytes, m for megabytes, g for gigabytes, or t for
347  *		terabytes.  p for petabytes and e for exabytes have been added
348  *		as undocumented features for future expansion.  For example,
349  *		100m is 100 megabytes, while 50g is 50 gigabytes.  All values
350  *		are rounded up to the nearest block size.
351  */
352 static int
353 meta_sp_parsesizestring(char *s, sp_ext_length_t *szp)
354 {
355 	sp_ext_length_t	len = 0;
356 	char		len_type[2];
357 
358 	if (s == NULL || szp == NULL) {
359 		return (-1);
360 	}
361 
362 	/*
363 	 * make sure block offset does not overflow 2^64 bytes.
364 	 */
365 	if ((sscanf(s, "%llu%1[BbKkMmGgTt]", &len, len_type) != 2) ||
366 	    (len == 0LL) ||
367 	    (len > (1LL << (64 - DEV_BSHIFT))))
368 		return (-1);
369 
370 	switch (len_type[0]) {
371 	case 'B':
372 	case 'b':
373 		len = lbtodb(roundup(len * DEV_BSIZE, DEV_BSIZE));
374 		break;
375 	case 'K':
376 	case 'k':
377 		len = lbtodb(roundup(len * 1024ULL, DEV_BSIZE));
378 		break;
379 	case 'M':
380 	case 'm':
381 		len = lbtodb(roundup(len * 1024ULL*1024ULL, DEV_BSIZE));
382 		break;
383 	case 'g':
384 	case 'G':
385 		len = lbtodb(roundup(len * 1024ULL*1024ULL*1024ULL, DEV_BSIZE));
386 		break;
387 	case 't':
388 	case 'T':
389 		len = lbtodb(roundup(len * 1024ULL*1024ULL*1024ULL*1024ULL,
390 		    DEV_BSIZE));
391 		break;
392 	case 'p':
393 	case 'P':
394 		len = lbtodb(roundup(
395 		    len * 1024ULL*1024ULL*1024ULL*1024ULL*1024ULL,
396 		    DEV_BSIZE));
397 		break;
398 	case 'e':
399 	case 'E':
400 		len = lbtodb(roundup(
401 		    len * 1024ULL*1024ULL*1024ULL*1024ULL*1024ULL*1024ULL,
402 		    DEV_BSIZE));
403 		break;
404 	default:
405 		/* error */
406 		return (-1);
407 	}
408 
409 	*szp = len;
410 	return (0);
411 }
412 
413 /*
414  * FUNCTION:	meta_sp_setgeom()
415  * INPUT:	np      - the underlying device to setup geometry for
416  *		compnp	- the underlying device to setup geometry for
417  *		mp	- the unit structure to set the geometry for
418  * OUTPUT:	ep	- return error pointer
419  * RETURNS:	int	- -1 if error, 0 otherwise
420  * PURPOSE:	establishes geometry information for a device
421  */
422 static int
423 meta_sp_setgeom(
424 	mdname_t	*np,
425 	mdname_t	*compnp,
426 	mp_unit_t	*mp,
427 	md_error_t	*ep
428 )
429 {
430 	mdgeom_t	*geomp;
431 	uint_t		round_cyl = 0;
432 
433 	if ((geomp = metagetgeom(compnp, ep)) == NULL)
434 		return (-1);
435 	if (meta_setup_geom((md_unit_t *)mp, np, geomp, geomp->write_reinstruct,
436 	    geomp->read_reinstruct, round_cyl, ep) != 0)
437 		return (-1);
438 
439 	return (0);
440 }
441 
442 /*
443  * FUNCTION:	meta_sp_setstatus()
444  * INPUT:	sp	- the set name for the devices to set the status on
445  *		minors	- an array of minor numbers of devices to set status on
446  *		num_units - number of entries in the array
447  *		status	- status value to set all units to
448  * OUTPUT:	ep	- return error pointer
449  * RETURNS:	int	- -1 if error, 0 success
450  * PURPOSE:	sets the status of one or more soft partitions to the
451  *		requested value
452  */
453 int
454 meta_sp_setstatus(
455 	mdsetname_t	*sp,
456 	minor_t		*minors,
457 	int		num_units,
458 	sp_status_t	status,
459 	md_error_t	*ep
460 )
461 {
462 	md_sp_statusset_t	status_params;
463 
464 	assert(minors != NULL);
465 
466 	/* update status of all soft partitions to the status passed in */
467 	(void) memset(&status_params, 0, sizeof (status_params));
468 	status_params.num_units = num_units;
469 	status_params.new_status = status;
470 	status_params.size = num_units * sizeof (minor_t);
471 	status_params.minors = (uintptr_t)minors;
472 	MD_SETDRIVERNAME(&status_params, MD_SP, sp->setno);
473 	if (metaioctl(MD_IOC_SPSTATUS, &status_params, &status_params.mde,
474 	    NULL) != 0) {
475 		(void) mdstealerror(ep, &status_params.mde);
476 		return (-1);
477 	}
478 	return (0);
479 }
480 
481 /*
482  * FUNCTION:	meta_get_sp_names()
483  * INPUT:	sp	- the set name to get soft partitions from
484  *		options	- options from the command line
485  * OUTPUT:	nlpp	- list of all soft partition names
486  *		ep	- return error pointer
487  * RETURNS:	int	- -1 if error, 0 success
488  * PURPOSE:	returns a list of all soft partitions in the metadb
489  *		for all devices in the specified set
490  */
491 int
492 meta_get_sp_names(
493 	mdsetname_t	*sp,
494 	mdnamelist_t	**nlpp,
495 	int		options,
496 	md_error_t	*ep
497 )
498 {
499 	return (meta_get_names(MD_SP, sp, nlpp, options, ep));
500 }
501 
502 /*
503  * FUNCTION:	meta_get_by_component()
504  * INPUT:	sp	- the set name to get soft partitions from
505  *		compnp	- the name of the device containing the soft
506  *			  partitions that will be returned
507  *		force	- 0 - reads cached namelist if available,
508  *			  1 - reloads cached namelist, frees old namelist
509  * OUTPUT:	nlpp	- list of all soft partition names
510  *		ep	- return error pointer
511  * RETURNS:	int	- -1 error, otherwise the number of soft partitions
512  *			  found on the component (0 = none found).
513  * PURPOSE:	returns a list of all soft partitions on a given device
514  *		from the metadb information
515  */
516 static int
517 meta_sp_get_by_component(
518 	mdsetname_t	*sp,
519 	mdname_t	*compnp,
520 	mdnamelist_t	**nlpp,
521 	int		force,
522 	md_error_t	*ep
523 )
524 {
525 	static mdnamelist_t	*cached_list = NULL;	/* cached namelist */
526 	static int		cached_count = 0;	/* cached count */
527 	mdnamelist_t		*spnlp = NULL;		/* all sp names */
528 	mdnamelist_t		*namep;			/* list iterator */
529 	mdnamelist_t		**tailpp = nlpp;	/* namelist tail */
530 	mdnamelist_t		**cachetailpp;		/* cache tail */
531 	md_sp_t			*msp;			/* unit structure */
532 	int			count = 0;		/* count of sp's */
533 	int			err;
534 	mdname_t		*curnp;
535 
536 	if ((cached_list != NULL) && (!force)) {
537 		/* return a copy of the cached list */
538 		for (namep = cached_list; namep != NULL; namep = namep->next)
539 			tailpp = meta_namelist_append_wrapper(tailpp,
540 			    namep->namep);
541 		return (cached_count);
542 	}
543 
544 	/* free the cache and reset values to zeros to prepare for a new list */
545 	metafreenamelist(cached_list);
546 	cached_count = 0;
547 	cached_list = NULL;
548 	cachetailpp = &cached_list;
549 	*nlpp = NULL;
550 
551 	/* get all the softpartitions first of all */
552 	if (meta_get_sp_names(sp, &spnlp, 0, ep) < 0)
553 		return (-1);
554 
555 	/*
556 	 * Now for each sp, see if it resides on the component we
557 	 * are interested in, if so then add it to our list
558 	 */
559 	for (namep = spnlp; namep != NULL; namep = namep->next) {
560 		curnp = namep->namep;
561 
562 		/* get the unit structure */
563 		if ((msp = meta_get_sp_common(sp, curnp, 0, ep)) == NULL)
564 			continue;
565 
566 		/*
567 		 * If the current soft partition is not on the same
568 		 * component, continue the search.  If it is on the same
569 		 * component, add it to our namelist.
570 		 */
571 		err = meta_check_samedrive(compnp, msp->compnamep, ep);
572 		if (err <= 0) {
573 			/* not on the same device, check the next one */
574 			continue;
575 		}
576 
577 		/* it's on the same drive */
578 
579 		/*
580 		 * Check for overlapping partitions if the component is not
581 		 * a metadevice.
582 		 */
583 		if (!metaismeta(msp->compnamep)) {
584 			/*
585 			 * if they're on the same drive, neither
586 			 * should be a metadevice if one isn't
587 			 */
588 			assert(!metaismeta(compnp));
589 
590 			if (meta_check_overlap(msp->compnamep->cname,
591 			    compnp, 0, -1, msp->compnamep, 0, -1, ep) == 0)
592 				continue;
593 
594 			/* in this case it's not an error for them to overlap */
595 			mdclrerror(ep);
596 		}
597 
598 		/* Component is on the same device, add to the used list */
599 		tailpp = meta_namelist_append_wrapper(tailpp, curnp);
600 		cachetailpp = meta_namelist_append_wrapper(cachetailpp,
601 		    curnp);
602 
603 		++count;
604 		++cached_count;
605 	}
606 
607 	assert(count == cached_count);
608 	return (count);
609 
610 out:
611 	metafreenamelist(*nlpp);
612 	*nlpp = NULL;
613 	return (-1);
614 }
615 
616 /*
617  * FUNCTION:    meta_sp_get_default_alignment()
618  * INPUT:       sp      - the pertinent set name
619  *              compnp  - the name of the underlying component
620  * OUTPUT:      ep      - return error pointer
621  * RETURNS:     sp_ext_length_t =0: no default alignment
622  *                              >0: default alignment
623  * PURPOSE:     returns the default alignment for soft partitions to
624  *              be built on top of the specified component or
625  *              metadevice
626  */
627 static sp_ext_length_t
628 meta_sp_get_default_alignment(
629 	mdsetname_t	*sp,
630 	mdname_t	*compnp,
631 	md_error_t	*ep
632 )
633 {
634 	sp_ext_length_t	a = SP_UNALIGNED;
635 	char		*mname;
636 
637 	assert(compnp != NULL);
638 
639 	/*
640 	 * We treat raw devices as opaque, and assume nothing about
641 	 * their alignment requirements.
642 	 */
643 	if (!metaismeta(compnp))
644 		return (SP_UNALIGNED);
645 
646 	/*
647 	 * We already know it's a metadevice from the previous test;
648 	 * metagetmiscname() will tell us which metadevice type we
649 	 * have
650 	 */
651 	mname = metagetmiscname(compnp, ep);
652 	if (mname == NULL)
653 		goto out;
654 
655 	/*
656 	 * For a mirror, we want to deal with the stripe that is the
657 	 * primary side.  If it happens to be asymmetrically
658 	 * configured, there is no simple way to fake a universal
659 	 * alignment.  There's a chance that the least common
660 	 * denominator of the set of interlaces from all stripes of
661 	 * all submirrors would do it, but nobody that really cared
662 	 * that much about this issue would create an asymmetric
663 	 * config to start with.
664 	 *
665 	 * If the component underlying the soft partition is a mirror,
666 	 * then at the exit of this loop, compnp will have been
667 	 * updated to describe the first active submirror.
668 	 */
669 	if (strcmp(mname, MD_MIRROR) == 0) {
670 		md_mirror_t	*mp;
671 		int		smi;
672 		md_submirror_t	*smp;
673 
674 		mp = meta_get_mirror(sp, compnp, ep);
675 		if (mp == NULL)
676 			goto out;
677 
678 		for (smi = 0; smi < NMIRROR; smi++) {
679 
680 			smp = &mp->submirrors[smi];
681 			if (smp->state == SMS_UNUSED)
682 				continue;
683 
684 			compnp = smp->submirnamep;
685 			assert(compnp != NULL);
686 
687 			mname = metagetmiscname(compnp, ep);
688 			if (mname == NULL)
689 				goto out;
690 
691 			break;
692 		}
693 
694 		if (smi == NMIRROR)
695 			goto out;
696 	}
697 
698 	/*
699 	 * Handle stripes and submirrors identically; just return the
700 	 * interlace of the first row.
701 	 */
702 	if (strcmp(mname, MD_STRIPE) == 0) {
703 		md_stripe_t	*stp;
704 
705 		stp = meta_get_stripe(sp, compnp, ep);
706 		if (stp == NULL)
707 			goto out;
708 
709 		a = stp->rows.rows_val[0].interlace;
710 		goto out;
711 	}
712 
713 	/*
714 	 * Raid is even more straightforward; the interlace applies to
715 	 * the entire device.
716 	 */
717 	if (strcmp(mname, MD_RAID) == 0) {
718 		md_raid_t	*rp;
719 
720 		rp = meta_get_raid(sp, compnp, ep);
721 		if (rp == NULL)
722 			goto out;
723 
724 		a = rp->interlace;
725 		goto out;
726 	}
727 
728 	/*
729 	 * If we have arrived here with the alignment still not set,
730 	 * then we expect the error to have been set by one of the
731 	 * routines we called.  If neither is the case, something has
732 	 * really gone wrong above.  (Probably the submirror walk
733 	 * failed to produce a valid submirror, but that would be
734 	 * really bad...)
735 	 */
736 out:
737 	meta_sp_debug("meta_sp_get_default_alignment: miscname %s, "
738 	    "alignment %lld\n", (mname == NULL) ? "NULL" : mname, a);
739 
740 	if (getenv(META_SP_DEBUG) && !mdisok(ep)) {
741 		mde_perror(ep, NULL);
742 	}
743 
744 	assert((a > 0) || (!mdisok(ep)));
745 
746 	return (a);
747 }
748 
749 
750 
751 /*
752  * FUNCTION:	meta_check_insp()
753  * INPUT:	sp	- the set name for the device to check
754  *		np	- the name of the device to check
755  *		slblk	- the starting offset of the device to check
756  *		nblks	- the number of blocks in the device to check
757  * OUTPUT:	ep	- return error pointer
758  * RETURNS:	int	-  0 - device contains soft partitions
759  *			  -1 - device does not contain soft partitions
760  * PURPOSE:	determines whether a device contains any soft partitions
761  */
762 /* ARGSUSED */
763 int
764 meta_check_insp(
765 	mdsetname_t	*sp,
766 	mdname_t	*np,
767 	diskaddr_t	slblk,
768 	diskaddr_t	nblks,
769 	md_error_t	*ep
770 )
771 {
772 	mdnamelist_t	*spnlp = NULL;	/* soft partition name list */
773 	int		count;
774 	int		rval;
775 
776 	/* check set pointer */
777 	assert(sp != NULL);
778 
779 	/*
780 	 * Get a list of the soft partitions that currently reside on
781 	 * the component.  We should ALWAYS force reload the cache,
782 	 * because if we're using the md.tab, we must rebuild
783 	 * the list because it won't contain the previous (if any)
784 	 * soft partition.
785 	 */
786 	/* find all soft partitions on the component */
787 	count = meta_sp_get_by_component(sp, np, &spnlp, 1, ep);
788 
789 	if (count == -1) {
790 		rval = -1;
791 	} else if (count > 0) {
792 		rval = mduseerror(ep, MDE_ALREADY, np->dev,
793 		    spnlp->namep->cname, np->cname);
794 	} else {
795 		rval = 0;
796 	}
797 
798 	metafreenamelist(spnlp);
799 	return (rval);
800 }
801 
802 /*
803  * **************************************************************************
804  *                    Extent List Manipulation Functions                    *
805  * **************************************************************************
806  */
807 
808 /*
809  * FUNCTION:	meta_sp_cmp_by_nameseq()
810  * INPUT:	e1	- first node to compare
811  *		e2	- second node to compare
812  * OUTPUT:	none
813  * RETURNS:	int	- =0 - nodes are equal
814  *			  <0 - e1 should go before e2
815  *			  >0 - e1 should go after e2
816  * PURPOSE:	used for sorted list inserts to build a list sorted by
817  *		name first and sequence number second.
818  */
819 static int
820 meta_sp_cmp_by_nameseq(sp_ext_node_t *e1, sp_ext_node_t *e2)
821 {
822 	int rval;
823 
824 	if (e1->ext_namep == NULL)
825 		return (1);
826 	if (e2->ext_namep == NULL)
827 		return (-1);
828 	if ((rval = strcmp(e1->ext_namep->cname, e2->ext_namep->cname)) != 0)
829 		return (rval);
830 
831 	/* the names are equal, compare sequence numbers */
832 	if (e1->ext_seq > e2->ext_seq)
833 		return (1);
834 	if (e1->ext_seq < e2->ext_seq)
835 		return (-1);
836 	/* sequence numbers are also equal */
837 	return (0);
838 }
839 
840 /*
841  * FUNCTION:	meta_sp_cmp_by_offset()
842  * INPUT:	e1	- first node to compare
843  *		e2	- second node to compare
844  * OUTPUT:	none
845  * RETURNS:	int	- =0 - nodes are equal
846  *			  <0 - e1 should go before e2
847  *			  >0 - e1 should go after e2
848  * PURPOSE:	used for sorted list inserts to build a list sorted by offset
849  */
850 static int
851 meta_sp_cmp_by_offset(sp_ext_node_t *e1, sp_ext_node_t *e2)
852 {
853 	if (e1->ext_offset > e2->ext_offset)
854 		return (1);
855 	if (e1->ext_offset < e2->ext_offset)
856 		return (-1);
857 	/* offsets are equal */
858 	return (0);
859 }
860 
861 /*
862  * FUNCTION:	meta_sp_list_insert()
863  * INPUT:	sp	- the set name for the device the node belongs to
864  *		np	- the name of the device the node belongs to
865  *		head	- the head of the list, must be NULL for empty list
866  *		offset	- the physical offset of this extent in sectors
867  *		length	- the length of this extent in sectors
868  *		type	- the type of the extent being inserted
869  *		seq	- the sequence number of the extent being inserted
870  *		flags	- extent flags (eg. whether it needs to be updated)
871  *		compare	- the compare function to use
872  * OUTPUT:	head	- points to the new head if a node was inserted
873  *			  at the beginning
874  * RETURNS:	void
875  * PURPOSE:	inserts an extent node into a sorted doubly linked list.
876  *		The sort order is determined by the compare function.
877  *		Memory is allocated for the node in this function and it
878  *		is up to the caller to free it, possibly using
879  *		meta_sp_list_free().  If a node is inserted at the
880  *		beginning of the list, the head pointer is updated to
881  *		point to the new first node.
882  */
883 static void
884 meta_sp_list_insert(
885 	mdsetname_t	*sp,
886 	mdname_t	*np,
887 	sp_ext_node_t	**head,
888 	sp_ext_offset_t	offset,
889 	sp_ext_length_t	length,
890 	sp_ext_type_t	type,
891 	uint_t		seq,
892 	uint_t		flags,
893 	ext_cmpfunc_t	compare
894 )
895 {
896 	sp_ext_node_t	*newext;
897 	sp_ext_node_t	*curext;
898 
899 	assert(head != NULL);
900 
901 	/* Don't bother adding zero length nodes */
902 	if (length == 0ULL)
903 		return;
904 
905 	/* allocate and fill in new ext_node */
906 	newext = Zalloc(sizeof (sp_ext_node_t));
907 
908 	newext->ext_offset = offset;
909 	newext->ext_length = length;
910 	newext->ext_flags = flags;
911 	newext->ext_type = type;
912 	newext->ext_seq = seq;
913 	newext->ext_setp = sp;
914 	newext->ext_namep = np;
915 
916 	/* first node in the list */
917 	if (*head == NULL) {
918 		newext->ext_next = newext->ext_prev = NULL;
919 		*head = newext;
920 	} else if ((*compare)(*head, newext) >= 0) {
921 		/* the first node has a bigger offset, so insert before it */
922 		assert((*head)->ext_prev == NULL);
923 
924 		newext->ext_prev = NULL;
925 		newext->ext_next = *head;
926 		(*head)->ext_prev = newext;
927 		*head = newext;
928 	} else {
929 		/*
930 		 * find the next node whose offset is greater than
931 		 * the one we want to insert, or the end of the list.
932 		 */
933 		for (curext = *head;
934 		    (curext->ext_next != NULL) &&
935 		    ((*compare)(curext->ext_next, newext) < 0);
936 		    (curext = curext->ext_next))
937 			;
938 
939 		/* link the new node in after the current node */
940 		newext->ext_next = curext->ext_next;
941 		newext->ext_prev = curext;
942 
943 		if (curext->ext_next != NULL)
944 			curext->ext_next->ext_prev = newext;
945 
946 		curext->ext_next = newext;
947 	}
948 }
949 
950 /*
951  * FUNCTION:	meta_sp_list_free()
952  * INPUT:	head	- the head of the list, must be NULL for empty list
953  * OUTPUT:	head	- points to NULL on return
954  * RETURNS:	void
955  * PURPOSE:	walks a double linked extent list and frees each node
956  */
957 static void
958 meta_sp_list_free(sp_ext_node_t **head)
959 {
960 	sp_ext_node_t	*ext;
961 	sp_ext_node_t	*next;
962 
963 	assert(head != NULL);
964 
965 	ext = *head;
966 	while (ext) {
967 		next = ext->ext_next;
968 		Free(ext);
969 		ext = next;
970 	}
971 	*head = NULL;
972 }
973 
974 /*
975  * FUNCTION:	meta_sp_list_remove()
976  * INPUT:	head	- the head of the list, must be NULL for empty list
977  *		ext	- the extent to remove, must be a member of the list
978  * OUTPUT:	head	- points to the new head of the list
979  * RETURNS:	void
980  * PURPOSE:	unlinks the node specified by ext from the list and
981  *		frees it, possibly moving the head pointer forward if
982  *		the head is the node being removed.
983  */
984 static void
985 meta_sp_list_remove(sp_ext_node_t **head, sp_ext_node_t *ext)
986 {
987 	assert(head != NULL);
988 	assert(*head != NULL);
989 
990 	if (*head == ext)
991 		*head = ext->ext_next;
992 
993 	if (ext->ext_prev != NULL)
994 		ext->ext_prev->ext_next = ext->ext_next;
995 	if (ext->ext_next != NULL)
996 		ext->ext_next->ext_prev = ext->ext_prev;
997 	Free(ext);
998 }
999 
1000 /*
1001  * FUNCTION:	meta_sp_list_size()
1002  * INPUT:	head	- the head of the list, must be NULL for empty list
1003  *		exttype	- the type of the extents to sum
1004  *		exclude_wm - subtract space for extent headers from total
1005  * OUTPUT:	none
1006  * RETURNS:	sp_ext_length_t	- the sum of all of the lengths
1007  * PURPOSE:	sums the lengths of all extents in the list matching the
1008  *		specified type.  This could be used for computing the
1009  *		amount of free or used space, for example.
1010  */
1011 static sp_ext_length_t
1012 meta_sp_list_size(sp_ext_node_t *head, sp_ext_type_t exttype, int exclude_wm)
1013 {
1014 	sp_ext_node_t	*ext;
1015 	sp_ext_length_t	size = 0LL;
1016 
1017 	for (ext = head; ext != NULL; ext = ext->ext_next)
1018 		if (ext->ext_type == exttype)
1019 			size += ext->ext_length -
1020 			    ((exclude_wm) ? MD_SP_WMSIZE : 0);
1021 
1022 	return (size);
1023 }
1024 
1025 /*
1026  * FUNCTION:	meta_sp_list_find()
1027  * INPUT:	head	- the head of the list, must be NULL for empty list
1028  *		offset	- the offset contained by the node to find
1029  * OUTPUT:	none
1030  * RETURNS:	sp_ext_node_t *	- the node containing the requested offset
1031  *				  or NULL if no such nodes were found.
1032  * PURPOSE:	finds a node in a list containing the requested offset
1033  *		(inclusive).  If multiple nodes contain this offset then
1034  *		only the first will be returned, though typically these
1035  *		lists are managed with non-overlapping nodes.
1036  *
1037  *		*The list MUST be sorted by offset for this function to work.*
1038  */
1039 static sp_ext_node_t *
1040 meta_sp_list_find(
1041 	sp_ext_node_t	*head,
1042 	sp_ext_offset_t	offset
1043 )
1044 {
1045 	sp_ext_node_t	*ext;
1046 
1047 	for (ext = head; ext != NULL; ext = ext->ext_next) {
1048 		/* check if the offset lies within this extent */
1049 		if ((offset >= ext->ext_offset) &&
1050 		    (offset < ext->ext_offset + ext->ext_length)) {
1051 			/*
1052 			 * the requested extent should always be a
1053 			 * subset of an extent in the list.
1054 			 */
1055 			return (ext);
1056 		}
1057 	}
1058 	return (NULL);
1059 }
1060 
1061 /*
1062  * FUNCTION:	meta_sp_list_freefill()
1063  * INPUT:	head	- the head of the list, must be NULL for empty list
1064  *		size	- the size of the volume this extent list is
1065  *			  representing
1066  * OUTPUT:	head	- the new head of the list
1067  * RETURNS:	void
1068  * PURPOSE:	finds gaps in the extent list and fills them with a free
1069  *		node.  If there is a gap at the beginning the head
1070  *		pointer will be changed to point to the new free node.
1071  *		If there is free space at the end, the last free extent
1072  *		will extend all the way out to the size specified.
1073  *
1074  *		*The list MUST be sorted by offset for this function to work.*
1075  */
1076 static void
1077 meta_sp_list_freefill(
1078 	sp_ext_node_t	**head,
1079 	sp_ext_length_t	size
1080 )
1081 {
1082 	sp_ext_node_t	*ext;
1083 	sp_ext_offset_t	curoff = 0LL;
1084 
1085 	for (ext = *head; ext != NULL; ext = ext->ext_next) {
1086 		if (curoff < ext->ext_offset)
1087 			meta_sp_list_insert(NULL, NULL, head,
1088 			    curoff, ext->ext_offset - curoff,
1089 			    EXTTYP_FREE, 0, 0, meta_sp_cmp_by_offset);
1090 		curoff = ext->ext_offset + ext->ext_length;
1091 	}
1092 
1093 	/* pad inverse list out to the end */
1094 	if (curoff < size)
1095 		meta_sp_list_insert(NULL, NULL, head, curoff, size - curoff,
1096 		    EXTTYP_FREE, 0, 0, meta_sp_cmp_by_offset);
1097 
1098 	if (getenv(META_SP_DEBUG)) {
1099 		meta_sp_debug("meta_sp_list_freefill: Extent list with "
1100 		    "holes freefilled:\n");
1101 		meta_sp_list_dump(*head);
1102 	}
1103 }
1104 
1105 /*
1106  * FUNCTION:	meta_sp_list_dump()
1107  * INPUT:	head	- the head of the list, must be NULL for empty list
1108  * OUTPUT:	none
1109  * RETURNS:	void
1110  * PURPOSE:	dumps the entire extent list to stdout for easy debugging
1111  */
1112 static void
1113 meta_sp_list_dump(sp_ext_node_t *head)
1114 {
1115 	sp_ext_node_t	*ext;
1116 
1117 	meta_sp_debug("meta_sp_list_dump: dumping extent list:\n");
1118 	meta_sp_debug("%5s %10s %5s %7s %10s %10s %5s %10s %10s\n", "Name",
1119 	    "Addr", "Seq#", "Type", "Offset", "Length", "Flags", "Prev",
1120 	    "Next");
1121 	for (ext = head; ext != NULL; ext = ext->ext_next) {
1122 		if (ext->ext_namep != NULL)
1123 			meta_sp_debug("%5s", ext->ext_namep->cname);
1124 		else
1125 			meta_sp_debug("%5s", "NONE");
1126 
1127 		meta_sp_debug("%10p %5u ", (void *) ext, ext->ext_seq);
1128 		switch (ext->ext_type) {
1129 		case EXTTYP_ALLOC:
1130 			meta_sp_debug("%7s ", "ALLOC");
1131 			break;
1132 		case EXTTYP_FREE:
1133 			meta_sp_debug("%7s ", "FREE");
1134 			break;
1135 		case EXTTYP_END:
1136 			meta_sp_debug("%7s ", "END");
1137 			break;
1138 		case EXTTYP_RESERVED:
1139 			meta_sp_debug("%7s ", "RESV");
1140 			break;
1141 		default:
1142 			meta_sp_debug("%7s ", "INVLD");
1143 			break;
1144 		}
1145 
1146 		meta_sp_debug("%10llu %10llu %5u %10p %10p\n",
1147 		    ext->ext_offset, ext->ext_length,
1148 		    ext->ext_flags, (void *) ext->ext_prev,
1149 		    (void *) ext->ext_next);
1150 	}
1151 	meta_sp_debug("\n");
1152 }
1153 
1154 /*
1155  * FUNCTION:	meta_sp_list_overlaps()
1156  * INPUT:	head	- the head of the list, must be NULL for empty list
1157  * OUTPUT:	none
1158  * RETURNS:	int	- 1 if extents overlap, 0 if ok
1159  * PURPOSE:	checks a list for overlaps.  The list MUST be sorted by
1160  *		offset for this function to work properly.
1161  */
1162 static int
1163 meta_sp_list_overlaps(sp_ext_node_t *head)
1164 {
1165 	sp_ext_node_t	*ext;
1166 
1167 	for (ext = head; ext->ext_next != NULL; ext = ext->ext_next) {
1168 		if (ext->ext_offset + ext->ext_length >
1169 		    ext->ext_next->ext_offset)
1170 			return (1);
1171 	}
1172 	return (0);
1173 }
1174 
1175 /*
1176  * **************************************************************************
1177  *                        Extent Allocation Functions                       *
1178  * **************************************************************************
1179  */
1180 
1181 /*
1182  * FUNCTION:	meta_sp_alloc_by_ext()
1183  * INPUT:	sp	- the set name for the device the node belongs to
1184  *		np	- the name of the device the node belongs to
1185  *		head	- the head of the list, must be NULL for empty list
1186  *		free_ext	- the free extent being allocated from
1187  *		alloc_offset	- the offset of the allocation
1188  *		alloc_len	- the length of the allocation
1189  *		seq		- the sequence number of the allocation
1190  * OUTPUT:	head	- the new head pointer
1191  * RETURNS:	void
1192  * PURPOSE:	allocates a portion of the free extent free_ext.  The
1193  *		allocated portion starts at alloc_offset and is
1194  *		alloc_length long.  Both (alloc_offset) and (alloc_offset +
1195  *		alloc_length) must be contained within the free extent.
1196  *
1197  *		The free extent is split into as many as 3 pieces - a
1198  *		free extent containing [ free_offset .. alloc_offset ), an
1199  *		allocated extent containing the range [ alloc_offset ..
1200  *		alloc_end ], and another free extent containing the
1201  *		range ( alloc_end .. free_end ].  If either of the two
1202  *		new free extents would be zero length, they are not created.
1203  *
1204  *		Finally, the original free extent is removed.  All newly
1205  *		created extents have the EXTFLG_UPDATE flag set.
1206  */
1207 static void
1208 meta_sp_alloc_by_ext(
1209 	mdsetname_t	*sp,
1210 	mdname_t	*np,
1211 	sp_ext_node_t	**head,
1212 	sp_ext_node_t	*free_ext,
1213 	sp_ext_offset_t	alloc_offset,
1214 	sp_ext_length_t	alloc_length,
1215 	uint_t		seq
1216 )
1217 {
1218 	sp_ext_offset_t	free_offset = free_ext->ext_offset;
1219 	sp_ext_length_t	free_length = free_ext->ext_length;
1220 
1221 	sp_ext_offset_t	alloc_end = alloc_offset + alloc_length;
1222 	sp_ext_offset_t	free_end  = free_offset  + free_length;
1223 
1224 	/* allocated extent must be a subset of the free extent */
1225 	assert(free_offset <= alloc_offset);
1226 	assert(free_end >= alloc_end);
1227 
1228 	meta_sp_list_remove(head, free_ext);
1229 
1230 	if (free_offset < alloc_offset) {
1231 		meta_sp_list_insert(NULL, NULL, head, free_offset,
1232 		    (alloc_offset - free_offset), EXTTYP_FREE, 0,
1233 		    EXTFLG_UPDATE, meta_sp_cmp_by_offset);
1234 	}
1235 
1236 	if (free_end > alloc_end) {
1237 		meta_sp_list_insert(NULL, NULL, head, alloc_end,
1238 		    (free_end - alloc_end), EXTTYP_FREE, 0, EXTFLG_UPDATE,
1239 		    meta_sp_cmp_by_offset);
1240 	}
1241 
1242 	meta_sp_list_insert(sp, np, head, alloc_offset, alloc_length,
1243 	    EXTTYP_ALLOC, seq, EXTFLG_UPDATE, meta_sp_cmp_by_offset);
1244 
1245 	if (getenv(META_SP_DEBUG)) {
1246 		meta_sp_debug("meta_sp_alloc_by_ext: extent list:\n");
1247 		meta_sp_list_dump(*head);
1248 	}
1249 }
1250 
1251 /*
1252  * FUNCTION:	meta_sp_alloc_by_len()
1253  * INPUT:	sp	- the set name for the device the node belongs to
1254  *		np	- the name of the device the node belongs to
1255  *		head	- the head of the list, must be NULL for empty list
1256  *		*lp	- the requested length to allocate
1257  *		last_off	- the last offset already allocated.
 *		alignment	- the desired extent alignment
1259  * OUTPUT:	head	- the new head pointer
1260  *		*lp	- the length allocated
1261  * RETURNS:	int	- -1 if error, the number of new extents on success
1262  * PURPOSE:	allocates extents from free space to satisfy the requested
1263  *		length.  If requested length is zero, allocates all
1264  *		remaining free space.  This function provides the meat
1265  *		of the extent allocation algorithm.  Allocation is a
1266  *		three tier process:
1267  *
1268  *		1. If last_off is nonzero and there is free space following
1269  *		   that node, then it is extended to allocate as much of that
1270  *		   free space as possible.  This is useful for metattach.
1271  *		2. If a free extent can be found to satisfy the remaining
1272  *		   requested space, then satisfy the rest of the request
1273  *		   from that extent.
1274  *		3. Start allocating space from any remaining free extents until
 *		   the remainder of the request is satisfied.
1276  *
1277  *              If alignment is non-zero, then every extent modified
1278  *              or newly allocated will be aligned modulo alignment,
1279  *              with a length that is an integer multiple of
1280  *              alignment.
1281  *
1282  *		The EXTFLG_UPDATE flag is set for all nodes (free and
1283  *		allocated) that require updated watermarks.
1284  *
1285  *		This algorithm may have a negative impact on fragmentation
1286  *		in pathological cases and may be improved if it turns out
1287  *		to be a problem.  This may be exacerbated by particularly
1288  *		large alignments.
1289  *
1290  * NOTE:	It's confusing, so it demands an explanation:
1291  *		- len is used to represent requested data space; it
1292  *		  does not include room for a watermark.  On each full
1293  *		  or partial allocation, len will be decremented by
1294  *		  alloc_len (see next paragraph) until it reaches
1295  *		  zero.
1296  *		- alloc_len is used to represent data space allocated
1297  *		  from a particular extent; it does not include space
1298  *		  for a watermark.  In the rare event that a_length
1299  *		  (see next paragraph) is equal to MD_SP_WMSIZE,
1300  *		  alloc_len will be zero and the resulting MD_SP_WMSIZE
1301  *		  fragment of space will be utterly unusable.
1302  *		- a_length is used to represent all space to be
1303  *		  allocated from a particular extent; it DOES include
1304  *		  space for a watermark.
1305  */
1306 static int
1307 meta_sp_alloc_by_len(
1308 	mdsetname_t	*sp,
1309 	mdname_t	*np,
1310 	sp_ext_node_t	**head,
1311 	sp_ext_length_t	*lp,
1312 	sp_ext_offset_t	last_off,
1313 	sp_ext_offset_t	alignment
1314 )
1315 {
1316 	sp_ext_node_t	*free_ext;
1317 	sp_ext_node_t	*alloc_ext;
1318 	uint_t		last_seq = 0;
1319 	uint_t		numexts = 0;
1320 	sp_ext_length_t	freespace;
1321 	sp_ext_length_t	alloc_len;
1322 	sp_ext_length_t	len;
1323 
1324 	/* We're DOA if we can't read *lp */
1325 	assert(lp != NULL);
1326 	len = *lp;
1327 
1328 	/*
1329 	 * Process the nominal case first: we've been given an actual
1330 	 * size argument, rather than the literal "all"
1331 	 */
1332 
1333 	if (len != 0) {
1334 
1335 		/*
1336 		 * Short circuit the check for free space.  This may
1337 		 * tell us we have enough space when we really don't
1338 		 * because each extent loses space to a watermark, but
1339 		 * it will always tell us there isn't enough space
1340 		 * correctly.  Worst case we do some extra work.
1341 		 */
1342 		freespace = meta_sp_list_size(*head, EXTTYP_FREE,
1343 		    INCLUDE_WM);
1344 
1345 		if (freespace < len)
1346 			return (-1);
1347 
1348 		/*
1349 		 * First see if we can extend the last extent for an
1350 		 * attach.
1351 		 */
1352 		if (last_off != 0LL) {
1353 			int align = 0;
1354 
1355 			alloc_ext =
1356 			    meta_sp_list_find(*head, last_off);
1357 			assert(alloc_ext != NULL);
1358 
1359 			/*
1360 			 * The offset test reflects the
1361 			 * inclusion of the watermark in the extent
1362 			 */
1363 			align = (alignment > 0) &&
1364 			    (((alloc_ext->ext_offset + MD_SP_WMSIZE) %
1365 			    alignment) == 0);
1366 
1367 			/*
1368 			 * If we decided not to align here, we should
1369 			 * also reset "alignment" so we don't bother
1370 			 * later, either.
1371 			 */
1372 			if (!align) {
1373 				alignment = 0;
1374 			}
1375 
1376 			last_seq = alloc_ext->ext_seq;
1377 
1378 			free_ext = meta_sp_list_find(*head,
1379 			    alloc_ext->ext_offset +
1380 			    alloc_ext->ext_length);
1381 
1382 			/*
1383 			 * If a free extent follows our last allocated
1384 			 * extent, then remove the last allocated
1385 			 * extent and increase the size of the free
1386 			 * extent to overlap it, then allocate the
1387 			 * total space from the new free extent.
1388 			 */
1389 			if (free_ext != NULL &&
1390 			    free_ext->ext_type == EXTTYP_FREE) {
1391 				assert(free_ext->ext_offset ==
1392 				    alloc_ext->ext_offset +
1393 				    alloc_ext->ext_length);
1394 
1395 				alloc_len =
1396 				    MIN(len, free_ext->ext_length);
1397 
1398 				if (align && (alloc_len < len)) {
1399 					/* No watermark space needed */
1400 					alloc_len -= alloc_len % alignment;
1401 				}
1402 
1403 				if (alloc_len > 0) {
1404 					free_ext->ext_offset -=
1405 					    alloc_ext->ext_length;
1406 					free_ext->ext_length +=
1407 					    alloc_ext->ext_length;
1408 
1409 					meta_sp_alloc_by_ext(sp, np, head,
1410 					    free_ext, free_ext->ext_offset,
1411 					    alloc_ext->ext_length + alloc_len,
1412 					    last_seq);
1413 
1414 					/*
1415 					 * now remove the original allocated
1416 					 * node.  We may have overlapping
1417 					 * extents for a short time before
1418 					 * this node is removed.
1419 					 */
1420 					meta_sp_list_remove(head, alloc_ext);
1421 					len -= alloc_len;
1422 				}
1423 			}
1424 			last_seq++;
1425 		}
1426 
1427 		if (len == 0LL)
1428 			goto out;
1429 
1430 		/*
1431 		 * Next, see if we can find a single allocation for
1432 		 * the remainder.  This may make fragmentation worse
1433 		 * in some cases, but there's no good way to allocate
1434 		 * that doesn't have a highly fragmented corner case.
1435 		 */
1436 		for (free_ext = *head; free_ext != NULL;
1437 		    free_ext = free_ext->ext_next) {
1438 			sp_ext_offset_t	a_offset;
1439 			sp_ext_offset_t	a_length;
1440 
1441 			if (free_ext->ext_type != EXTTYP_FREE)
1442 				continue;
1443 
1444 			/*
1445 			 * The length test should include space for
1446 			 * the watermark
1447 			 */
1448 
1449 			a_offset = free_ext->ext_offset;
1450 			a_length = free_ext->ext_length;
1451 
1452 			if (alignment > 0) {
1453 
1454 				/*
1455 				 * Shortcut for extents that have been
1456 				 * previously added to pad out the
1457 				 * data space
1458 				 */
1459 				if (a_length < alignment) {
1460 					continue;
1461 				}
1462 
1463 				/*
1464 				 * Round up so the data space begins
1465 				 * on a properly aligned boundary.
1466 				 */
1467 				a_offset += alignment -
1468 				    (a_offset % alignment) - MD_SP_WMSIZE;
1469 
1470 				/*
1471 				 * This is only necessary in case the
1472 				 * watermark size is ever greater than
1473 				 * one.  It'll never happen, of
1474 				 * course; we'll get rid of watermarks
1475 				 * before we make 'em bigger.
1476 				 */
1477 				if (a_offset < free_ext->ext_offset) {
1478 					a_offset += alignment;
1479 				}
1480 
1481 				/*
1482 				 * Adjust the length to account for
1483 				 * the space lost above (if any)
1484 				 */
1485 				a_length -=
1486 				    (a_offset - free_ext->ext_offset);
1487 			}
1488 
1489 			if (a_length >= len + MD_SP_WMSIZE) {
1490 				meta_sp_alloc_by_ext(sp, np, head,
1491 				    free_ext, a_offset,
1492 				    len + MD_SP_WMSIZE, last_seq);
1493 
1494 				len = 0LL;
1495 				numexts++;
1496 				break;
1497 			}
1498 		}
1499 
1500 		if (len == 0LL)
1501 			goto out;
1502 
1503 
1504 		/*
1505 		 * If the request could not be satisfied by extending
1506 		 * the last extent or by a single extent, then put
1507 		 * multiple smaller extents together until the request
1508 		 * is satisfied.
1509 		 */
1510 		for (free_ext = *head; (free_ext != NULL) && (len > 0);
1511 		    free_ext = free_ext->ext_next) {
1512 			sp_ext_offset_t a_offset;
1513 			sp_ext_length_t a_length;
1514 
1515 			if (free_ext->ext_type != EXTTYP_FREE)
1516 				continue;
1517 
1518 			a_offset = free_ext->ext_offset;
1519 			a_length = free_ext->ext_length;
1520 
1521 			if (alignment > 0) {
1522 
1523 				/*
1524 				 * Shortcut for extents that have been
1525 				 * previously added to pad out the
1526 				 * data space
1527 				 */
1528 				if (a_length < alignment) {
1529 					continue;
1530 				}
1531 
1532 				/*
1533 				 * Round up so the data space begins
1534 				 * on a properly aligned boundary.
1535 				 */
1536 				a_offset += alignment -
1537 				    (a_offset % alignment) - MD_SP_WMSIZE;
1538 
1539 				/*
1540 				 * This is only necessary in case the
1541 				 * watermark size is ever greater than
1542 				 * one.  It'll never happen, of
1543 				 * course; we'll get rid of watermarks
1544 				 * before we make 'em bigger.
1545 				 */
1546 				if (a_offset < free_ext->ext_offset) {
1547 					a_offset += alignment;
1548 				}
1549 
1550 				/*
1551 				 * Adjust the length to account for
1552 				 * the space lost above (if any)
1553 				 */
1554 				a_length -=
1555 				    (a_offset - free_ext->ext_offset);
1556 
1557 				/*
1558 				 * Adjust the length to be properly
1559 				 * aligned if it is NOT to be the
1560 				 * last extent in the soft partition.
1561 				 */
1562 				if ((a_length - MD_SP_WMSIZE) < len)
1563 					a_length -=
1564 					    (a_length - MD_SP_WMSIZE)
1565 					    % alignment;
1566 			}
1567 
1568 			alloc_len = MIN(len, a_length - MD_SP_WMSIZE);
1569 			if (alloc_len == 0)
1570 				continue;
1571 
1572 			/*
1573 			 * meta_sp_alloc_by_ext() expects the
1574 			 * allocation length to include the watermark
1575 			 * size, which is why we don't simply pass in
1576 			 * alloc_len here.
1577 			 */
1578 			meta_sp_alloc_by_ext(sp, np, head, free_ext,
1579 			    a_offset, MIN(len + MD_SP_WMSIZE, a_length),
1580 			    last_seq);
1581 
1582 			len -= alloc_len;
1583 			numexts++;
1584 			last_seq++;
1585 		}
1586 
1587 
1588 		/*
1589 		 * If there was not enough space we can throw it all
1590 		 * away since no real work has been done yet.
1591 		 */
1592 		if (len != 0) {
1593 			meta_sp_list_free(head);
1594 			return (-1);
1595 		}
1596 	}
1597 
1598 	/*
1599 	 * Otherwise, the literal "all" was specified: allocate all
1600 	 * available free space.  Don't bother with alignment.
1601 	 */
1602 	else {
1603 		/* First, extend the last extent if this is a grow */
1604 		if (last_off != 0LL) {
1605 			alloc_ext =
1606 			    meta_sp_list_find(*head, last_off);
1607 			assert(alloc_ext != NULL);
1608 
1609 			last_seq = alloc_ext->ext_seq;
1610 
1611 			free_ext = meta_sp_list_find(*head,
1612 			    alloc_ext->ext_offset +
1613 			    alloc_ext->ext_length);
1614 
1615 			/*
1616 			 * If a free extent follows our last allocated
1617 			 * extent, then remove the last allocated
1618 			 * extent and increase the size of the free
1619 			 * extent to overlap it, then allocate the
1620 			 * total space from the new free extent.
1621 			 */
1622 			if (free_ext != NULL &&
1623 			    free_ext->ext_type == EXTTYP_FREE) {
1624 				assert(free_ext->ext_offset ==
1625 				    alloc_ext->ext_offset +
1626 				    alloc_ext->ext_length);
1627 
1628 				len = alloc_len =
1629 				    free_ext->ext_length;
1630 
1631 				free_ext->ext_offset -=
1632 				    alloc_ext->ext_length;
1633 				free_ext->ext_length +=
1634 				    alloc_ext->ext_length;
1635 
1636 				meta_sp_alloc_by_ext(sp, np, head,
1637 				    free_ext, free_ext->ext_offset,
1638 				    alloc_ext->ext_length + alloc_len,
1639 				    last_seq);
1640 
1641 				/*
1642 				 * now remove the original allocated
1643 				 * node.  We may have overlapping
1644 				 * extents for a short time before
1645 				 * this node is removed.
1646 				 */
1647 				meta_sp_list_remove(head, alloc_ext);
1648 			}
1649 
1650 			last_seq++;
1651 		}
1652 
1653 		/* Next, grab all remaining free space */
1654 		for (free_ext = *head; free_ext != NULL;
1655 		    free_ext = free_ext->ext_next) {
1656 
1657 			if (free_ext->ext_type == EXTTYP_FREE) {
1658 				alloc_len =
1659 				    free_ext->ext_length - MD_SP_WMSIZE;
1660 				if (alloc_len == 0)
1661 					continue;
1662 
1663 				/*
1664 				 * meta_sp_alloc_by_ext() expects the
1665 				 * allocation length to include the
1666 				 * watermark size, which is why we
1667 				 * don't simply pass in alloc_len
1668 				 * here.
1669 				 */
1670 				meta_sp_alloc_by_ext(sp, np, head,
1671 				    free_ext, free_ext->ext_offset,
1672 				    free_ext->ext_length,
1673 				    last_seq);
1674 
1675 				len += alloc_len;
1676 				numexts++;
1677 				last_seq++;
1678 			}
1679 		}
1680 	}
1681 
1682 out:
1683 	if (getenv(META_SP_DEBUG)) {
1684 		meta_sp_debug("meta_sp_alloc_by_len: Extent list after "
1685 		    "allocation:\n");
1686 		meta_sp_list_dump(*head);
1687 	}
1688 
1689 	if (*lp == 0) {
1690 		*lp = len;
1691 
1692 		/*
1693 		 * Make sure the callers hit a no space error if we
1694 		 * didn't actually find anything.
1695 		 */
1696 		if (len == 0) {
1697 			return (-1);
1698 		}
1699 	}
1700 
1701 	return (numexts);
1702 }
1703 
1704 /*
1705  * FUNCTION:	meta_sp_alloc_by_list()
1706  * INPUT:	sp	- the set name for the device the node belongs to
1707  *		np	- the name of the device the node belongs to
1708  *		head	- the head of the list, must be NULL for empty list
1709  *		oblist	- an extent list containing requested nodes to allocate
1710  * OUTPUT:	head	- the new head pointer
1711  * RETURNS:	int	- -1 if error, the number of new extents on success
1712  * PURPOSE:	allocates extents from free space to satisfy the requested
1713  *		extent list.  This is primarily used for the -o/-b options
1714  *		where the user may specifically request extents to allocate.
1715  *		Each extent in the oblist must be a subset (inclusive) of a
1716  *		free extent and may not overlap each other.  This
1717  *		function sets the EXTFLG_UPDATE flag for each node that
1718  *		requires a watermark update after allocating.
1719  */
1720 static int
1721 meta_sp_alloc_by_list(
1722 	mdsetname_t	*sp,
1723 	mdname_t	*np,
1724 	sp_ext_node_t	**head,
1725 	sp_ext_node_t	*oblist
1726 )
1727 {
1728 	sp_ext_node_t	*ext;
1729 	sp_ext_node_t	*free_ext;
1730 	uint_t		numexts = 0;
1731 
1732 	for (ext = oblist; ext != NULL; ext = ext->ext_next) {
1733 
1734 		free_ext = meta_sp_list_find(*head,
1735 		    ext->ext_offset - MD_SP_WMSIZE);
1736 
1737 		/* Make sure the allocation is within the free extent */
1738 		if ((free_ext == NULL) ||
1739 		    (ext->ext_offset + ext->ext_length >
1740 		    free_ext->ext_offset + free_ext->ext_length) ||
1741 		    (free_ext->ext_type != EXTTYP_FREE))
1742 			return (-1);
1743 
1744 		meta_sp_alloc_by_ext(sp, np, head, free_ext,
1745 		    ext->ext_offset - MD_SP_WMSIZE,
1746 		    ext->ext_length + MD_SP_WMSIZE, ext->ext_seq);
1747 
1748 		numexts++;
1749 	}
1750 
1751 	assert(meta_sp_list_overlaps(*head) == 0);
1752 
1753 	if (getenv(META_SP_DEBUG)) {
1754 		meta_sp_debug("meta_sp_alloc_by_list: Extent list after "
1755 		    "allocation:\n");
1756 		meta_sp_list_dump(*head);
1757 	}
1758 
1759 	return (numexts);
1760 }
1761 
1762 /*
1763  * **************************************************************************
1764  *                     Extent List Population Functions                     *
1765  * **************************************************************************
1766  */
1767 
1768 /*
1769  * FUNCTION:	meta_sp_extlist_from_namelist()
1770  * INPUT:	sp	- the set name for the device the node belongs to
1771  *		spnplp	- the namelist of soft partitions to build a list from
1772  * OUTPUT:	extlist	- the extent list built from the SPs in the namelist
1773  *		ep	- return error pointer
1774  * RETURNS:	int	- -1 if error, 0 on success
1775  * PURPOSE:	builds an extent list representing the soft partitions
1776  *		specified in the namelist.  Each extent in each soft
1777  *		partition is added to the list with the type EXTTYP_ALLOC.
1778  *		The EXTFLG_UPDATE flag is not set on any nodes.  Each
1779  *		extent in the list includes the space occupied by the
1780  *		watermark, which is not included in the unit structures.
1781  */
1782 static int
1783 meta_sp_extlist_from_namelist(
1784 	mdsetname_t	*sp,
1785 	mdnamelist_t	*spnlp,
1786 	sp_ext_node_t	**extlist,
1787 	md_error_t	*ep
1788 )
1789 {
1790 	int		extn;
1791 	md_sp_t		*msp;		/* unit structure of the sp's */
1792 	mdnamelist_t	*namep;
1793 
1794 	assert(sp != NULL);
1795 
1796 	/*
1797 	 * Now go through the soft partitions and add a node to the used
1798 	 * list for each allocated extent.
1799 	 */
1800 	for (namep = spnlp; namep != NULL; namep = namep->next) {
1801 		mdname_t	*curnp = namep->namep;
1802 
1803 		/* get the unit structure */
1804 		if ((msp = meta_get_sp_common(sp, curnp, 0, ep)) == NULL)
1805 			return (-1);
1806 
1807 		for (extn = 0; (extn < msp->ext.ext_len); extn++) {
1808 			md_sp_ext_t	*extp = &msp->ext.ext_val[extn];
1809 
1810 			/*
1811 			 * subtract from offset and add to the length
1812 			 * to account for the watermark, which is not
1813 			 * contained in the extents in the unit structure.
1814 			 */
1815 			meta_sp_list_insert(sp, curnp, extlist,
1816 			    extp->poff - MD_SP_WMSIZE, extp->len + MD_SP_WMSIZE,
1817 			    EXTTYP_ALLOC, extn, 0, meta_sp_cmp_by_offset);
1818 		}
1819 	}
1820 	return (0);
1821 }
1822 
1823 /*
1824  * FUNCTION:	meta_sp_extlist_from_wm()
1825  * INPUT:	sp	- the set name for the device the node belongs to
1826  *		compnp	- the name of the device to scan watermarks on
1827  * OUTPUT:	extlist	- the extent list built from the SPs in the namelist
1828  *		ep	- return error pointer
1829  * RETURNS:	int	- -1 if error, 0 on success
1830  * PURPOSE:	builds an extent list representing the soft partitions
1831  *		specified in the namelist.  Each extent in each soft
1832  *		partition is added to the list with the type EXTTYP_ALLOC.
1833  *		The EXTFLG_UPDATE flag is not set on any nodes.  Each
1834  *		extent in the list includes the space occupied by the
1835  *		watermark, which is not included in the unit structures.
1836  */
1837 static int
1838 meta_sp_extlist_from_wm(
1839 	mdsetname_t	*sp,
1840 	mdname_t	*compnp,
1841 	sp_ext_node_t	**extlist,
1842 	ext_cmpfunc_t	compare,
1843 	md_error_t	*ep
1844 )
1845 {
1846 	mp_watermark_t	wm;
1847 	mdname_t	*np = NULL;
1848 	mdsetname_t	*spsetp = NULL;
1849 	sp_ext_offset_t	cur_off;
1850 	md_set_desc	*sd;
1851 	int		init = 0;
1852 	mdkey_t		key;
1853 	minor_t		mnum;
1854 
1855 	if (!metaislocalset(sp)) {
1856 		if ((sd = metaget_setdesc(sp, ep)) == NULL)
1857 			return (-1);
1858 	}
1859 
1860 	if ((cur_off = meta_sp_get_start(sp, compnp, ep)) == MD_DISKADDR_ERROR)
1861 		return (-1);
1862 
1863 	for (;;) {
1864 		if (meta_sp_read_wm(sp, compnp, &wm, cur_off, ep) != 0) {
1865 			return (-1);
1866 		}
1867 
1868 		/* get the set and name pointers */
1869 		if (strcmp(wm.wm_setname, MD_SP_LOCALSETNAME) != 0) {
1870 			if ((spsetp = metasetname(wm.wm_setname, ep)) == NULL) {
1871 				return (-1);
1872 			}
1873 		}
1874 
1875 		/*
1876 		 * For the MN set, meta_init_make_device needs to
1877 		 * be run on all the nodes so the entries for the
1878 		 * softpart device name and its comp can be created
1879 		 * in the same order in the replica namespace.  If
1880 		 * we have it run on mdmn_do_iocset then the mddbs
1881 		 * will be out of sync between master node and slave
1882 		 * nodes.
1883 		 */
1884 		if (strcmp(wm.wm_mdname, MD_SP_FREEWMNAME) != 0) {
1885 
1886 			if (!metaislocalset(sp) && MD_MNSET_DESC(sd)) {
1887 				md_mn_msg_addmdname_t	*send_params;
1888 				int			result;
1889 				md_mn_result_t		*resp = NULL;
1890 				int			message_size;
1891 
1892 				message_size =  sizeof (*send_params) +
1893 				    strlen(wm.wm_mdname) + 1;
1894 				send_params = Zalloc(message_size);
1895 				send_params->addmdname_setno = sp->setno;
1896 				(void) strcpy(&send_params->addmdname_name[0],
1897 				    wm.wm_mdname);
1898 				result = mdmn_send_message(sp->setno,
1899 				    MD_MN_MSG_ADDMDNAME,
1900 				    MD_MSGF_PANIC_WHEN_INCONSISTENT,
1901 				    (char *)send_params, message_size, &resp,
1902 				    ep);
1903 				Free(send_params);
1904 				if (resp != NULL) {
1905 					if (resp->mmr_exitval != 0) {
1906 						free_result(resp);
1907 						return (-1);
1908 					}
1909 					free_result(resp);
1910 				}
1911 				if (result != 0)
1912 					return (-1);
1913 			} else {
1914 
1915 				if (!is_existing_meta_hsp(sp, wm.wm_mdname)) {
1916 					if ((key = meta_init_make_device(&sp,
1917 					    wm.wm_mdname, ep)) <= 0) {
1918 						return (-1);
1919 					}
1920 					init = 1;
1921 				}
1922 			}
1923 
1924 			np = metaname(&spsetp, wm.wm_mdname, META_DEVICE, ep);
1925 			if (np == NULL) {
1926 				if (init) {
1927 					if (meta_getnmentbykey(sp->setno,
1928 					    MD_SIDEWILD, key, NULL, &mnum,
1929 					    NULL, ep) != NULL) {
1930 						(void) metaioctl(MD_IOCREM_DEV,
1931 						    &mnum, ep, NULL);
1932 					}
1933 					(void) del_self_name(sp, key, ep);
1934 				}
1935 				return (-1);
1936 			}
1937 		}
1938 
1939 		/* insert watermark into extent list */
1940 		meta_sp_list_insert(spsetp, np, extlist, cur_off,
1941 		    wm.wm_length + MD_SP_WMSIZE, wm.wm_type, wm.wm_seq,
1942 		    EXTFLG_UPDATE, compare);
1943 
1944 		/* if we see the end watermark, we're done */
1945 		if (wm.wm_type == EXTTYP_END)
1946 			break;
1947 
1948 		cur_off += wm.wm_length + 1;
1949 
1950 		/* clear out set and name pointers for next iteration */
1951 		np = NULL;
1952 		spsetp = NULL;
1953 	}
1954 
1955 	return (0);
1956 }
1957 
1958 /*
1959  * **************************************************************************
1960  *                        Print (metastat) Functions                        *
1961  * **************************************************************************
1962  */
1963 
1964 /*
1965  * FUNCTION:	meta_sp_short_print()
1966  * INPUT:	msp	- the unit structure to display
1967  *		fp	- the file pointer to send output to
1968  *		options	- print options from the command line processor
1969  * OUTPUT:	ep	- return error pointer
1970  * RETURNS:	int	- -1 if error, 0 on success
1971  * PURPOSE:	display a short report of the soft partition in md.tab
1972  *		form, primarily used for metastat -p.
1973  */
1974 static int
1975 meta_sp_short_print(
1976 	md_sp_t		*msp,
1977 	char		*fname,
1978 	FILE		*fp,
1979 	mdprtopts_t	options,
1980 	md_error_t	*ep
1981 )
1982 {
1983 	int	extn;
1984 
1985 	if (options & PRINT_LARGEDEVICES) {
1986 		if ((msp->common.revision & MD_64BIT_META_DEV) == 0)
1987 			return (0);
1988 	}
1989 
1990 	if (options & PRINT_FN) {
1991 		if ((msp->common.revision & MD_FN_META_DEV) == 0)
1992 			return (0);
1993 	}
1994 
1995 	/* print name and -p */
1996 	if (fprintf(fp, "%s -p", msp->common.namep->cname) == EOF)
1997 		return (mdsyserror(ep, errno, fname));
1998 
1999 	/* print the component */
2000 	/*
2001 	 * Always print the full path name
2002 	 */
2003 	if (fprintf(fp, " %s", msp->compnamep->rname) == EOF)
2004 		return (mdsyserror(ep, errno, fname));
2005 
2006 	/* print out each extent */
2007 	for (extn = 0; (extn < msp->ext.ext_len); extn++) {
2008 		md_sp_ext_t	*extp = &msp->ext.ext_val[extn];
2009 		if (fprintf(fp, " -o %llu -b %llu ", extp->poff,
2010 		    extp->len) == EOF)
2011 			return (mdsyserror(ep, errno, fname));
2012 	}
2013 
2014 	if (fprintf(fp, "\n") == EOF)
2015 		return (mdsyserror(ep, errno, fname));
2016 
2017 	/* success */
2018 	return (0);
2019 }
2020 
2021 /*
2022  * FUNCTION:	meta_sp_status_to_name()
2023  * INPUT:	xsp_status	- the status value to convert to a string
2024  *		tstate		- transient errored device state. If set the
2025  *				  device is Unavailable
2026  * OUTPUT:	none
2027  * RETURNS:	char *	- a pointer to the string representing the status value
2028  * PURPOSE:	return an internationalized string representing the
2029  *		status value for a soft partition.  The strings are
2030  *		strdup'd and must be freed by the caller.
2031  */
2032 static char *
2033 meta_sp_status_to_name(
2034 	xsp_status_t	xsp_status,
2035 	uint_t		tstate
2036 )
2037 {
2038 	char *rval = NULL;
2039 
2040 	/*
2041 	 * Check to see if we have MD_INACCESSIBLE set. This is the only valid
2042 	 * value for an 'Unavailable' return. tstate can be set because of
2043 	 * other multi-node reasons (e.g. ABR being set)
2044 	 */
2045 	if (tstate & MD_INACCESSIBLE) {
2046 		return (Strdup(dgettext(TEXT_DOMAIN, "Unavailable")));
2047 	}
2048 
2049 	switch (xsp_status) {
2050 	case MD_SP_CREATEPEND:
2051 		rval = Strdup(dgettext(TEXT_DOMAIN, "Creating"));
2052 		break;
2053 	case MD_SP_GROWPEND:
2054 		rval = Strdup(dgettext(TEXT_DOMAIN, "Growing"));
2055 		break;
2056 	case MD_SP_DELPEND:
2057 		rval = Strdup(dgettext(TEXT_DOMAIN, "Deleting"));
2058 		break;
2059 	case MD_SP_OK:
2060 		rval = Strdup(dgettext(TEXT_DOMAIN, "Okay"));
2061 		break;
2062 	case MD_SP_ERR:
2063 		rval = Strdup(dgettext(TEXT_DOMAIN, "Errored"));
2064 		break;
2065 	case MD_SP_RECOVER:
2066 		rval = Strdup(dgettext(TEXT_DOMAIN, "Recovering"));
2067 		break;
2068 	}
2069 
2070 	if (rval == NULL)
2071 		rval = Strdup(dgettext(TEXT_DOMAIN, "Invalid"));
2072 
2073 	return (rval);
2074 }
2075 
2076 /*
2077  * FUNCTION:	meta_sp_report()
2078  * INPUT:	sp	- the set name for the unit being displayed
2079  *		msp	- the unit structure to display
2080  *		nlpp	- pass back the large devs
2081  *		fp	- the file pointer to send output to
2082  *		options	- print options from the command line processor
2083  * OUTPUT:	ep	- return error pointer
2084  * RETURNS:	int	- -1 if error, 0 on success
2085  * PURPOSE:	print a full report of the device specified
2086  */
2087 static int
2088 meta_sp_report(
2089 	mdsetname_t	*sp,
2090 	md_sp_t		*msp,
2091 	mdnamelist_t	**nlpp,
2092 	char		*fname,
2093 	FILE		*fp,
2094 	mdprtopts_t	options,
2095 	md_error_t	*ep
2096 )
2097 {
2098 	uint_t		extn;
2099 	char		*status;
2100 	char		*devid = "";
2101 	mdname_t	*didnp = NULL;
2102 	ddi_devid_t	dtp;
2103 	int		len;
2104 	uint_t		tstate = 0;
2105 
2106 	if (options & PRINT_LARGEDEVICES) {
2107 		if ((msp->common.revision & MD_64BIT_META_DEV) == 0) {
2108 			return (0);
2109 		} else {
2110 			if (meta_getdevs(sp, msp->common.namep, nlpp, ep) != 0)
2111 				return (-1);
2112 		}
2113 	}
2114 
2115 	if (options & PRINT_FN) {
2116 		if ((msp->common.revision & MD_FN_META_DEV) == 0) {
2117 			return (0);
2118 		} else {
2119 			if (meta_getdevs(sp, msp->common.namep, nlpp, ep) != 0)
2120 				return (-1);
2121 		}
2122 	}
2123 
2124 	if (options & PRINT_HEADER) {
2125 		if (fprintf(fp, dgettext(TEXT_DOMAIN, "%s: Soft Partition\n"),
2126 		    msp->common.namep->cname) == EOF)
2127 			return (mdsyserror(ep, errno, fname));
2128 	}
2129 
2130 	if (fprintf(fp, dgettext(TEXT_DOMAIN, "    Device: %s\n"),
2131 	    msp->compnamep->cname) == EOF)
2132 		return (mdsyserror(ep, errno, fname));
2133 
2134 	/* Determine if device is available before displaying status */
2135 	if (metaismeta(msp->common.namep)) {
2136 		if (meta_get_tstate(msp->common.namep->dev, &tstate, ep) != 0)
2137 			return (-1);
2138 	}
2139 	status = meta_sp_status_to_name(msp->status, tstate & MD_DEV_ERRORED);
2140 
2141 	/* print out "State" to be consistent with other metadevices */
2142 	if (tstate & MD_ABR_CAP) {
2143 		if (fprintf(fp, dgettext(TEXT_DOMAIN,
2144 		    "    State: %s - Application Based Recovery (ABR)\n"),
2145 		    status) == EOF) {
2146 			Free(status);
2147 			return (mdsyserror(ep, errno, fname));
2148 		}
2149 	} else {
2150 		if (fprintf(fp, dgettext(TEXT_DOMAIN,
2151 		    "    State: %s\n"), status) == EOF) {
2152 			Free(status);
2153 			return (mdsyserror(ep, errno, fname));
2154 		}
2155 	}
2156 	free(status);
2157 
2158 	if (fprintf(fp, dgettext(TEXT_DOMAIN, "    Size: %llu blocks (%s)\n"),
2159 	    msp->common.size,
2160 	    meta_number_to_string(msp->common.size, DEV_BSIZE)) == EOF)
2161 		return (mdsyserror(ep, errno, fname));
2162 
2163 	/* print component details */
2164 	if (! metaismeta(msp->compnamep)) {
2165 		diskaddr_t	start_blk;
2166 		int		has_mddb;
2167 		char		*has_mddb_str;
2168 
2169 		/* print header */
2170 		/*
2171 		 * Building a format string on the fly that will
2172 		 * be used in (f)printf. This allows the length
2173 		 * of the ctd to vary from small to large without
2174 		 * looking horrible.
2175 		 */
2176 		len = strlen(msp->compnamep->cname);
2177 		len = max(len, strlen(dgettext(TEXT_DOMAIN, "Device")));
2178 		len += 2;
2179 		if (fprintf(fp,
2180 		    "\t%-*.*s %-12.12s %-5.5s %s\n",
2181 		    len, len,
2182 		    dgettext(TEXT_DOMAIN, "Device"),
2183 		    dgettext(TEXT_DOMAIN, "Start Block"),
2184 		    dgettext(TEXT_DOMAIN, "Dbase"),
2185 		    dgettext(TEXT_DOMAIN, "Reloc")) == EOF) {
2186 			return (mdsyserror(ep, errno, fname));
2187 		}
2188 
2189 
2190 		/* get info */
2191 		if ((start_blk = meta_sp_get_start(sp, msp->compnamep, ep)) ==
2192 		    MD_DISKADDR_ERROR)
2193 			return (-1);
2194 
2195 		if ((has_mddb = metahasmddb(sp, msp->compnamep, ep)) < 0)
2196 			return (-1);
2197 
2198 		if (has_mddb)
2199 			has_mddb_str = dgettext(TEXT_DOMAIN, "Yes");
2200 		else
2201 			has_mddb_str = dgettext(TEXT_DOMAIN, "No");
2202 
2203 		/* populate the key in the name_p structure */
2204 		didnp = metadevname(&sp, msp->compnamep->dev, ep);
2205 		if (didnp == NULL) {
2206 			return (-1);
2207 		}
2208 
2209 		/* determine if devid does NOT exist */
2210 		if (options & PRINT_DEVID) {
2211 			if ((dtp = meta_getdidbykey(sp->setno,
2212 			    getmyside(sp, ep), didnp->key, ep)) == NULL)
2213 				devid = dgettext(TEXT_DOMAIN, "No ");
2214 			else {
2215 				devid = dgettext(TEXT_DOMAIN, "Yes");
2216 				free(dtp);
2217 			}
2218 		}
2219 
2220 		/* print info */
2221 		/*
2222 		 * This allows the length
2223 		 * of the ctd to vary from small to large without
2224 		 * looking horrible.
2225 		 */
2226 		if (fprintf(fp, "\t%-*s %8lld     %-5.5s %s\n",
2227 		    len, msp->compnamep->cname,
2228 		    start_blk, has_mddb_str, devid) == EOF) {
2229 			return (mdsyserror(ep, errno, fname));
2230 		}
2231 		(void) fprintf(fp, "\n");
2232 	}
2233 
2234 
2235 	/* print the headers */
2236 	if (fprintf(fp, "\t%6.6s %24.24s %24.24s\n",
2237 	    dgettext(TEXT_DOMAIN, "Extent"),
2238 	    dgettext(TEXT_DOMAIN, "Start Block"),
2239 	    dgettext(TEXT_DOMAIN, "Block count")) == EOF)
2240 		return (mdsyserror(ep, errno, fname));
2241 
2242 	/* print out each extent */
2243 	for (extn = 0; (extn < msp->ext.ext_len); extn++) {
2244 		md_sp_ext_t	*extp = &msp->ext.ext_val[extn];
2245 
2246 		/* If PRINT_TIMES option is ever supported, add output here */
2247 		if (fprintf(fp, "\t%6u %24llu %24llu\n",
2248 		    extn, extp->poff, extp->len) == EOF)
2249 			return (mdsyserror(ep, errno, fname));
2250 	}
2251 
2252 	/* separate records with a newline */
2253 	(void) fprintf(fp, "\n");
2254 	return (0);
2255 }
2256 
2257 /*
2258  * FUNCTION:	meta_sp_print()
2259  * INPUT:	sp	- the set name for the unit being displayed
2260  *		np	- the name of the device to print
2261  *		fname	- ??? not used
2262  *		fp	- the file pointer to send output to
2263  *		options	- print options from the command line processor
2264  * OUTPUT:	ep	- return error pointer
2265  * RETURNS:	int	- -1 if error, 0 on success
2266  * PURPOSE:	print a full report of the device specified by metastat.
2267  *		This is the main entry point for printing.
2268  */
2269 int
2270 meta_sp_print(
2271 	mdsetname_t	*sp,
2272 	mdname_t	*np,
2273 	mdnamelist_t	**nlpp,
2274 	char		*fname,
2275 	FILE		*fp,
2276 	mdprtopts_t	options,
2277 	md_error_t	*ep
2278 )
2279 {
2280 	md_sp_t		*msp;
2281 	md_unit_t	*mdp;
2282 	int		rval = 0;
2283 
2284 	/* should always have the same set */
2285 	assert(sp != NULL);
2286 
2287 	/* print all the soft partitions */
2288 	if (np == NULL) {
2289 		mdnamelist_t	*nlp = NULL;
2290 		mdnamelist_t	*p;
2291 		int		cnt;
2292 
2293 		if ((cnt = meta_get_sp_names(sp, &nlp, options, ep)) < 0)
2294 			return (-1);
2295 		else if (cnt == 0)
2296 			return (0);
2297 
2298 		/* recusively print them out */
2299 		for (p = nlp; (p != NULL); p = p->next) {
2300 			mdname_t	*curnp = p->namep;
2301 
2302 			/*
2303 			 * one problem with the rval of -1 here is that
2304 			 * the error gets "lost" when the next device is
2305 			 * printed, but we want to print them all anyway.
2306 			 */
2307 			rval = meta_sp_print(sp, curnp, nlpp, fname, fp,
2308 			    options, ep);
2309 		}
2310 
2311 		/* clean up, return success */
2312 		metafreenamelist(nlp);
2313 		return (rval);
2314 	}
2315 
2316 	/* get the unit structure */
2317 	if ((msp = meta_get_sp_common(sp, np,
2318 	    ((options & PRINT_FAST) ? 1 : 0), ep)) == NULL)
2319 		return (-1);
2320 
2321 	/* check for parented */
2322 	if ((! (options & PRINT_SUBDEVS)) &&
2323 	    (MD_HAS_PARENT(msp->common.parent))) {
2324 		return (0);
2325 	}
2326 
2327 	/* print appropriate detail */
2328 	if (options & PRINT_SHORT) {
2329 		if (meta_sp_short_print(msp, fname, fp, options, ep) != 0)
2330 			return (-1);
2331 	} else {
2332 		if (meta_sp_report(sp, msp, nlpp, fname, fp, options, ep) != 0)
2333 			return (-1);
2334 	}
2335 
2336 	/*
2337 	 * Print underlying metadevices if they are parented to us and
2338 	 * if the info for the underlying metadevice has not been printed.
2339 	 */
2340 	if (metaismeta(msp->compnamep)) {
2341 		/* get the unit structure for the subdevice */
2342 		if ((mdp = meta_get_mdunit(sp, msp->compnamep, ep)) == NULL)
2343 			return (-1);
2344 
2345 		/* If info not already printed, recurse */
2346 		if (!BT_TEST(sp_parent_printed, MD_MIN2UNIT(MD_SID(mdp)))) {
2347 			if (meta_print_name(sp, msp->compnamep, nlpp, fname, fp,
2348 			    (options | PRINT_HEADER | PRINT_SUBDEVS),
2349 			    NULL, ep) != 0) {
2350 				return (-1);
2351 			}
2352 			BT_SET(sp_parent_printed, MD_MIN2UNIT(MD_SID(mdp)));
2353 		}
2354 	}
2355 	return (0);
2356 }
2357 
2358 /*
2359  * **************************************************************************
2360  *                     Watermark Manipulation Functions                     *
2361  * **************************************************************************
2362  */
2363 
2364 /*
2365  * FUNCTION:	meta_sp_get_start()
2366  * INPUT:	sp	- the operating set
2367  *		np 	- device upon which the sp is being built
2368  * OUTPUT:	ep	- return error pointer
2369  * RETURNS:	daddr_t	- -1 if error, otherwise the start block
2370  * PURPOSE:	Encapsulate the determination of the start block of the
2371  *		device upon which the sp is built or being built.
2372  */
2373 static diskaddr_t
2374 meta_sp_get_start(
2375 	mdsetname_t	*sp,
2376 	mdname_t	*np,
2377 	md_error_t	*ep
2378 )
2379 {
2380 	daddr_t		start_block;
2381 
2382 	if ((start_block = metagetstart(sp, np, ep)) != MD_DISKADDR_ERROR)
2383 		start_block += MD_SP_START;
2384 
2385 	return (start_block);
2386 }
2387 
2388 /*
2389  * FUNCTION:	meta_sp_update_wm()
2390  * INPUT:	sp	- the operating set
2391  *		msp	- a pointer to the XDR unit structure
2392  *		extlist	- the extent list specifying watermarks to update
2393  * OUTPUT:	ep	- return error pointer
2394  * RETURNS:	int	- -1 if error, 0 on success
2395  * PURPOSE:	steps backwards through the extent list updating
2396  *		watermarks for all extents with the EXTFLG_UPDATE flag
2397  *		set.  Writing the watermarks guarantees consistency when
2398  *		extents must be broken into pieces since the original
2399  *		watermark will be the last to be updated, and will be
2400  *		changed to point to a new watermark that is already
2401  *		known to be consistent.  If one of the writes fails, the
2402  *		original watermark stays intact and none of the changes
2403  *		are realized.
2404  */
2405 static int
2406 meta_sp_update_wm(
2407 	mdsetname_t	*sp,
2408 	md_sp_t		*msp,
2409 	sp_ext_node_t	*extlist,
2410 	md_error_t	*ep
2411 )
2412 {
2413 	sp_ext_node_t	*ext;
2414 	sp_ext_node_t	*tail;
2415 	mp_watermark_t	*wmp, *watermarks;
2416 	xsp_offset_t	*osp, *offsets;
2417 	int		update_count = 0;
2418 	int		rval = 0;
2419 	md_unit_t	*mdp;
2420 	md_sp_update_wm_t	update_params;
2421 
2422 	if (getenv(META_SP_DEBUG)) {
2423 		meta_sp_debug("meta_sp_update_wm: Updating watermarks:\n");
2424 		meta_sp_list_dump(extlist);
2425 	}
2426 
2427 	/*
2428 	 * find the last node so we can write the watermarks backwards
2429 	 * and count watermarks to update so we can allocate space
2430 	 */
2431 	for (ext = extlist; ext != NULL; ext = ext->ext_next) {
2432 		if ((ext->ext_flags & EXTFLG_UPDATE) != 0) {
2433 			update_count++;
2434 		}
2435 
2436 		if (ext->ext_next == NULL) {
2437 			tail = ext;
2438 		}
2439 	}
2440 	ext = tail;
2441 
2442 	wmp = watermarks =
2443 	    Zalloc(update_count * sizeof (mp_watermark_t));
2444 	osp = offsets =
2445 	    Zalloc(update_count * sizeof (sp_ext_offset_t));
2446 
2447 	while (ext != NULL) {
2448 		if ((ext->ext_flags & EXTFLG_UPDATE) != 0) {
2449 			/* update watermark */
2450 			wmp->wm_magic = MD_SP_MAGIC;
2451 			wmp->wm_version = MD_SP_VERSION;
2452 			wmp->wm_type = ext->ext_type;
2453 			wmp->wm_seq = ext->ext_seq;
2454 			wmp->wm_length = ext->ext_length - MD_SP_WMSIZE;
2455 
2456 			/* fill in the volume name and set name */
2457 			if (ext->ext_namep != NULL)
2458 				(void) strcpy(wmp->wm_mdname,
2459 				    ext->ext_namep->cname);
2460 			else
2461 				(void) strcpy(wmp->wm_mdname, MD_SP_FREEWMNAME);
2462 			if (ext->ext_setp != NULL &&
2463 			    ext->ext_setp->setno != MD_LOCAL_SET)
2464 				(void) strcpy(wmp->wm_setname,
2465 				    ext->ext_setp->setname);
2466 			else
2467 				(void) strcpy(wmp->wm_setname,
2468 				    MD_SP_LOCALSETNAME);
2469 
2470 			/* Generate the checksum */
2471 			wmp->wm_checksum = 0;
2472 			crcgen((uchar_t *)wmp, (uint_t *)&wmp->wm_checksum,
2473 			    sizeof (*wmp), NULL);
2474 
2475 			/* record the extent offset */
2476 			*osp = ext->ext_offset;
2477 
2478 			/* Advance the placeholders */
2479 			osp++; wmp++;
2480 		}
2481 		ext = ext->ext_prev;
2482 	}
2483 
2484 	mdp = meta_get_mdunit(sp, msp->common.namep, ep);
2485 	if (mdp == NULL) {
2486 		rval = -1;
2487 		goto out;
2488 	}
2489 
2490 	(void) memset(&update_params, 0, sizeof (update_params));
2491 	update_params.mnum = MD_SID(mdp);
2492 	update_params.count = update_count;
2493 	update_params.wmp = (uintptr_t)watermarks;
2494 	update_params.osp = (uintptr_t)offsets;
2495 	MD_SETDRIVERNAME(&update_params, MD_SP,
2496 	    MD_MIN2SET(update_params.mnum));
2497 
2498 	if (metaioctl(MD_IOC_SPUPDATEWM, &update_params,
2499 	    &update_params.mde, msp->common.namep->cname) != 0) {
2500 		(void) mdstealerror(ep, &update_params.mde);
2501 		rval = -1;
2502 		goto out;
2503 	}
2504 
2505 out:
2506 	Free(watermarks);
2507 	Free(offsets);
2508 
2509 	return (rval);
2510 }
2511 
2512 /*
2513  * FUNCTION:	meta_sp_clear_wm()
2514  * INPUT:	sp	- the operating set
2515  *		msp	- the unit structure for the soft partition to clear
2516  * OUTPUT:	ep	- return error pointer
2517  * RETURNS:	int	- -1 if error, 0 on success
2518  * PURPOSE:	steps through the extents for a soft partition unit and
2519  *		creates an extent list designed to mark all of the
2520  *		watermarks for those extents as free.  The extent list
2521  *		is then passed to meta_sp_update_wm() to actually write
2522  *		the watermarks out.
2523  */
2524 static int
2525 meta_sp_clear_wm(
2526 	mdsetname_t	*sp,
2527 	md_sp_t		*msp,
2528 	md_error_t	*ep
2529 )
2530 {
2531 	sp_ext_node_t	*extlist = NULL;
2532 	int		numexts = msp->ext.ext_len;
2533 	uint_t		i;
2534 	int		rval = 0;
2535 
2536 	/* for each watermark must set the flag to SP_FREE */
2537 	for (i = 0; i < numexts; i++) {
2538 		md_sp_ext_t	*extp = &msp->ext.ext_val[i];
2539 
2540 		meta_sp_list_insert(NULL, NULL, &extlist,
2541 		    extp->poff - MD_SP_WMSIZE, extp->len + MD_SP_WMSIZE,
2542 		    EXTTYP_FREE, 0, EXTFLG_UPDATE, meta_sp_cmp_by_offset);
2543 	}
2544 
2545 	/* update watermarks */
2546 	rval = meta_sp_update_wm(sp, msp, extlist, ep);
2547 
2548 	meta_sp_list_free(&extlist);
2549 	return (rval);
2550 }
2551 
2552 /*
2553  * FUNCTION:	meta_sp_read_wm()
2554  * INPUT:	sp	- setname for component
2555  *		compnp	- mdname_t for component
2556  *		offset	- the offset of the watermark to read (sectors)
2557  * OUTPUT:	wm	- the watermark structure to read into
2558  *		ep	- return error pointer
2559  * RETURNS:	int	- -1 if error, 0 on success
2560  * PURPOSE:	seeks out to the requested offset and reads a watermark.
2561  *		It then verifies that the magic number is correct and
2562  *		that the checksum is valid, returning an error if either
2563  *		is wrong.
2564  */
2565 static int
2566 meta_sp_read_wm(
2567 	mdsetname_t	*sp,
2568 	mdname_t	*compnp,
2569 	mp_watermark_t	*wm,
2570 	sp_ext_offset_t	offset,
2571 	md_error_t	*ep
2572 )
2573 {
2574 	md_sp_read_wm_t	read_params;
2575 
2576 	/*
2577 	 * make sure block offset does not overflow 2^64 bytes and it's a
2578 	 * multiple of the block size.
2579 	 */
2580 	assert(offset <= (1LL << (64 - DEV_BSHIFT)));
2581 	/* LINTED */
2582 	assert((sizeof (*wm) % DEV_BSIZE) == 0);
2583 
2584 	(void) memset(wm, 0, sizeof (*wm));
2585 
2586 	(void) memset(&read_params, 0, sizeof (read_params));
2587 	read_params.rdev = compnp->dev;
2588 	read_params.wmp = (uintptr_t)wm;
2589 	read_params.offset = offset;
2590 	MD_SETDRIVERNAME(&read_params, MD_SP, sp->setno);
2591 
2592 	if (metaioctl(MD_IOC_SPREADWM, &read_params,
2593 	    &read_params.mde, compnp->cname) != 0) {
2594 
2595 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
2596 		    "Extent header read failed, block %llu.\n"), offset);
2597 		return (mdstealerror(ep, &read_params.mde));
2598 	}
2599 
2600 	/* make sure magic number is correct */
2601 	if (wm->wm_magic != MD_SP_MAGIC) {
2602 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
2603 		    "found incorrect magic number %x, expected %x.\n"),
2604 		    wm->wm_magic, MD_SP_MAGIC);
2605 		/*
2606 		 * Pass NULL for the device name as we don't have
2607 		 * valid watermark contents.
2608 		 */
2609 		return (mdmderror(ep, MDE_SP_BADWMMAGIC, 0, NULL));
2610 	}
2611 
2612 	if (crcchk((uchar_t *)wm, (uint_t *)&wm->wm_checksum,
2613 	    sizeof (*wm), NULL)) {
2614 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
2615 		    "found incorrect checksum %x.\n"),
2616 		    wm->wm_checksum);
2617 		return (mdmderror(ep, MDE_SP_BADWMCRC, 0, wm->wm_mdname));
2618 	}
2619 
2620 	return (0);
2621 }
2622 
2623 /*
2624  * **************************************************************************
2625  *                  Query Functions
2626  * **************************************************************************
2627  */
2628 
2629 /*
2630  * IMPORTANT NOTE: This is a static function that assumes that
2631  *		   its input parameters have been checked and
2632  *		   have valid values that lie within acceptable
2633  *		   ranges.
2634  *
2635  * FUNCTION:	meta_sp_enough_space()
2636  * INPUT:	desired_number_of_sps - the number of soft partitions desired;
2637  *					must be > 0
2638  *		desired_sp_size - the desired soft partition size in blocks;
2639  *				  must be > 0
2640  *		extent_listpp - a reference to a reference to an extent
2641  *				list that lists the extents on a device;
2642  *				must be a reference to a reference to a
2643  *				valid extent list
2644  *		alignment - the desired data space alignment for the sp's
2645  * OUTPUT:	boolean_t return value
2646  * RETURNS:	boolean_t - B_TRUE if there's enough space in the extent
2647  *			    list to create the desired soft partitions,
2648  *			    B_FALSE if there's not enough space
2649  * PURPOSE:	determines whether there's enough free space in an extent
2650  *		list to allow creation of a set of soft partitions
2651  */
2652 static boolean_t
2653 meta_sp_enough_space(
2654 	int		desired_number_of_sps,
2655 	blkcnt_t	desired_sp_size,
2656 	sp_ext_node_t	**extent_listpp,
2657 	sp_ext_length_t	alignment
2658 )
2659 {
2660 	boolean_t		enough_space;
2661 	int			number_of_sps;
2662 	int			number_of_extents_used;
2663 	sp_ext_length_t		desired_ext_length = desired_sp_size;
2664 
2665 	enough_space = B_TRUE;
2666 	number_of_sps = 0;
2667 	while ((enough_space == B_TRUE) &&
2668 	    (number_of_sps < desired_number_of_sps)) {
2669 		/*
2670 		 * Use the extent allocation algorithm implemented by
2671 		 * meta_sp_alloc_by_len() to test whether the free
2672 		 * extents in the extent list referenced by *extent_listpp
2673 		 * contain enough space to accomodate a soft partition
2674 		 * of size desired_ext_length.
2675 		 *
2676 		 * Repeat the test <desired_number_of_sps> times
2677 		 * or until it fails, whichever comes first,
2678 		 * each time allocating the extents required to
2679 		 * create the soft partition without actually
2680 		 * creating the soft partition.
2681 		 */
2682 		number_of_extents_used = meta_sp_alloc_by_len(
2683 		    TEST_SETNAMEP, TEST_SOFT_PARTITION_NAMEP,
2684 		    extent_listpp, &desired_ext_length,
2685 		    NO_OFFSET, alignment);
2686 		if (number_of_extents_used == -1) {
2687 			enough_space = B_FALSE;
2688 		} else {
2689 			number_of_sps++;
2690 		}
2691 	}
2692 	return (enough_space);
2693 }
2694 
2695 /*
2696  * IMPORTANT NOTE: This is a static function that calls other functions
2697  *		   that check its mdsetnamep and device_mdnamep
2698  *		   input parameters, but expects extent_listpp to
2699  *		   be a initialized to a valid address to which
2700  *		   it can write a reference to the extent list that
2701  *		   it creates.
2702  *
2703  * FUNCTION:	meta_sp_get_extent_list()
2704  * INPUT:	mdsetnamep - a reference to the mdsetname_t structure
2705  *			     for the set containing the device for
2706  *			     which the extents are to be listed
2707  *		device_mdnamep - a reference to the mdname_t structure
2708  *				 for the device for which the extents
2709  *				 are to be listed
2710  * OUTPUT:	*extent_listpp - a reference to the extent list for
2711  *				 the device; NULL if the function fails
2712  *		*ep - the libmeta error encountered, if any
2713  * RETURNS:	boolean_t - B_TRUE if the function call was successful,
2714  *			    B_FALSE if not
2715  * PURPOSE:	gets the extent list for a device
2716  */
2717 static boolean_t
2718 meta_sp_get_extent_list(
2719 	mdsetname_t	*mdsetnamep,
2720 	mdname_t	*device_mdnamep,
2721 	sp_ext_node_t	**extent_listpp,
2722 	md_error_t	*ep
2723 )
2724 {
2725 	diskaddr_t		device_size_in_blocks;
2726 	mdnamelist_t		*sp_name_listp;
2727 	diskaddr_t		start_block_address_in_blocks;
2728 
2729 	*extent_listpp = NULL;
2730 	sp_name_listp = NULL;
2731 
2732 	start_block_address_in_blocks = meta_sp_get_start(mdsetnamep,
2733 	    device_mdnamep, ep);
2734 	if (start_block_address_in_blocks == MD_DISKADDR_ERROR) {
2735 		if (getenv(META_SP_DEBUG)) {
2736 			mde_perror(ep,
2737 			    "meta_sp_get_extent_list:meta_sp_get_start");
2738 		}
2739 		return (B_FALSE);
2740 	}
2741 
2742 	device_size_in_blocks = metagetsize(device_mdnamep, ep);
2743 	if (device_size_in_blocks == MD_DISKADDR_ERROR) {
2744 		if (getenv(META_SP_DEBUG)) {
2745 			mde_perror(ep,
2746 			    "meta_sp_get_extent_list:metagetsize");
2747 		}
2748 		return (B_FALSE);
2749 	}
2750 
2751 	/*
2752 	 * Sanity check: the start block will have skipped an integer
2753 	 * number of cylinders, C.  C will usually be zero.  If (C > 0),
2754 	 * and the disk slice happens to only be C cylinders in total
2755 	 * size, we'll fail this check.
2756 	 */
2757 	if (device_size_in_blocks <=
2758 	    (start_block_address_in_blocks + MD_SP_WMSIZE)) {
2759 		(void) mdmderror(ep, MDE_SP_NOSPACE, 0, device_mdnamep->cname);
2760 		return (B_FALSE);
2761 	}
2762 
2763 	/*
2764 	 * After this point, we will have allocated resources, so any
2765 	 * failure returns must be through the supplied "fail" label
2766 	 * to properly deallocate things.
2767 	 */
2768 
2769 	/*
2770 	 * Create an empty extent list that starts one watermark past
2771 	 * the start block of the device and ends one watermark before
2772 	 * the end of the device.
2773 	 */
2774 	meta_sp_list_insert(TEST_SETNAMEP, TEST_SOFT_PARTITION_NAMEP,
2775 	    extent_listpp, NO_OFFSET,
2776 	    (sp_ext_length_t)start_block_address_in_blocks,
2777 	    EXTTYP_RESERVED, NO_SEQUENCE_NUMBER, NO_FLAGS,
2778 	    meta_sp_cmp_by_offset);
2779 	meta_sp_list_insert(TEST_SETNAMEP, TEST_SOFT_PARTITION_NAMEP,
2780 	    extent_listpp, (sp_ext_offset_t)(device_size_in_blocks -
2781 	    MD_SP_WMSIZE), MD_SP_WMSIZE, EXTTYP_END, NO_SEQUENCE_NUMBER,
2782 	    NO_FLAGS, meta_sp_cmp_by_offset);
2783 
2784 	/*
2785 	 * Get the list of soft partitions that are already on the
2786 	 * device.
2787 	 */
2788 	if (meta_sp_get_by_component(mdsetnamep, device_mdnamep,
2789 	    &sp_name_listp, FORCE_RELOAD_CACHE, ep) < 1) {
2790 		if (getenv(META_SP_DEBUG)) {
2791 			mde_perror(ep,
2792 			    "meta_sp_get_extent_list:meta_sp_get_by_component");
2793 		}
2794 		goto fail;
2795 	}
2796 
2797 	if (sp_name_listp != NULL) {
2798 		/*
2799 		 * If there are soft partitions on the device, add the
2800 		 * extents used in them to the extent list.
2801 		 */
2802 		if (meta_sp_extlist_from_namelist(mdsetnamep, sp_name_listp,
2803 		    extent_listpp, ep) == -1) {
2804 			if (getenv(META_SP_DEBUG)) {
2805 				mde_perror(ep, "meta_sp_get_extent_list:"
2806 				    "meta_sp_extlist_from_namelist");
2807 			}
2808 			goto fail;
2809 		}
2810 		metafreenamelist(sp_name_listp);
2811 	}
2812 
2813 	/*
2814 	 * Add free extents to the extent list to represent
2815 	 * the remaining regions of free space on the
2816 	 * device.
2817 	 */
2818 	meta_sp_list_freefill(extent_listpp, device_size_in_blocks);
2819 	return (B_TRUE);
2820 
2821 fail:
2822 	if (sp_name_listp != NULL) {
2823 		metafreenamelist(sp_name_listp);
2824 	}
2825 
2826 	if (*extent_listpp != NULL) {
2827 		/*
2828 		 * meta_sp_list_free sets *extent_listpp to NULL.
2829 		 */
2830 		meta_sp_list_free(extent_listpp);
2831 	}
2832 	return (B_FALSE);
2833 }
2834 
2835 /*
2836  * IMPORTANT NOTE: This is a static function that calls other functions
2837  *		   that check its mdsetnamep and mddrivenamep
2838  *		   input parameters, but expects extent_listpp to
2839  *		   be a initialized to a valid address to which
2840  *		   it can write a reference to the extent list that
2841  *		   it creates.
2842  *
2843  * FUNCTION:	meta_sp_get_extent_list_for_drive()
2844  * INPUT:	mdsetnamep - a reference to the mdsetname_t structure
2845  *			     for the set containing the drive for
2846  *			     which the extents are to be listed
2847  *		mddrivenamep   - a reference to the mddrivename_t structure
2848  *				 for the drive for which the extents
2849  *				 are to be listed
2850  * OUTPUT:	*extent_listpp - a reference to the extent list for
2851  *				 the drive; NULL if the function fails
2852  * RETURNS:	boolean_t - B_TRUE if the function call was successful,
2853  *			    B_FALSE if not
2854  * PURPOSE:	gets the extent list for a drive when the entire drive
2855  *		is to be soft partitioned
2856  */
2857 static boolean_t
2858 meta_sp_get_extent_list_for_drive(
2859 	mdsetname_t	*mdsetnamep,
2860 	mddrivename_t	*mddrivenamep,
2861 	sp_ext_node_t	**extent_listpp
2862 )
2863 {
2864 	boolean_t		can_use;
2865 	diskaddr_t		free_space;
2866 	md_error_t		mderror;
2867 	mdvtoc_t		proposed_vtoc;
2868 	int			repartition_options;
2869 	int			return_value;
2870 	md_sp_t			test_sp_struct;
2871 
2872 	can_use = B_TRUE;
2873 	*extent_listpp = NULL;
2874 	mderror = mdnullerror;
2875 	test_sp_struct.compnamep = metaslicename(mddrivenamep, MD_SLICE0,
2876 	    &mderror);
2877 	if (test_sp_struct.compnamep == NULL) {
2878 		can_use = B_FALSE;
2879 	}
2880 
2881 	if (can_use == B_TRUE) {
2882 		mderror = mdnullerror;
2883 		repartition_options = 0;
2884 		return_value = meta_check_sp(mdsetnamep, &test_sp_struct,
2885 		    MDCMD_USE_WHOLE_DISK, &repartition_options, &mderror);
2886 		if (return_value != 0) {
2887 			can_use = B_FALSE;
2888 		}
2889 	}
2890 
2891 	if (can_use == B_TRUE) {
2892 		mderror = mdnullerror;
2893 		repartition_options = repartition_options |
2894 		    (MD_REPART_FORCE | MD_REPART_DONT_LABEL);
2895 		return_value = meta_repartition_drive(mdsetnamep, mddrivenamep,
2896 		    repartition_options, &proposed_vtoc, &mderror);
2897 		if (return_value != 0) {
2898 			can_use = B_FALSE;
2899 		}
2900 	}
2901 
2902 	if (can_use == B_TRUE) {
2903 		free_space = proposed_vtoc.parts[MD_SLICE0].size;
2904 		if (free_space <= (MD_SP_START + MD_SP_WMSIZE)) {
2905 			can_use = B_FALSE;
2906 		}
2907 	}
2908 
2909 	if (can_use == B_TRUE) {
2910 		/*
2911 		 * Create an extent list that starts with
2912 		 * a reserved extent that ends at the start
2913 		 * of the usable space on slice zero of the
2914 		 * proposed VTOC, ends with an extent that
2915 		 * reserves space for a watermark at the end
2916 		 * of slice zero, and contains a single free
2917 		 * extent that occupies the rest of the space
2918 		 * on the slice.
2919 		 *
2920 		 * NOTE:
2921 		 *
2922 		 * Don't use metagetstart() or metagetsize() to
2923 		 * find the usable space.  They query the mdname_t
2924 		 * structure that represents an actual device to
2925 		 * determine the amount of space on the device that
2926 		 * contains metadata and the total amount of space
2927 		 * on the device.  Since this function creates a
2928 		 * proposed extent list that doesn't reflect the
2929 		 * state of an actual device, there's no mdname_t
2930 		 * structure to be queried.
2931 		 *
2932 		 * When a drive is reformatted to prepare for
2933 		 * soft partitioning, all of slice seven is
2934 		 * reserved for metadata, all of slice zero is
2935 		 * available for soft partitioning, and all other
2936 		 * slices on the drive are empty.  The proposed
2937 		 * extent list for the drive therefore contains
2938 		 * only three extents: a reserved extent that ends
2939 		 * at the start of the usable space on slice zero,
2940 		 * a single free extent that occupies all the usable
2941 		 * space on slice zero, and an ending extent that
2942 		 * reserves space for a watermark at the end of
2943 		 * slice zero.
2944 		 */
2945 		meta_sp_list_insert(TEST_SETNAMEP, TEST_SOFT_PARTITION_NAMEP,
2946 		    extent_listpp, NO_OFFSET, (sp_ext_length_t)(MD_SP_START),
2947 		    EXTTYP_RESERVED, NO_SEQUENCE_NUMBER, NO_FLAGS,
2948 		    meta_sp_cmp_by_offset);
2949 		meta_sp_list_insert(TEST_SETNAMEP, TEST_SOFT_PARTITION_NAMEP,
2950 		    extent_listpp, (sp_ext_offset_t)(free_space - MD_SP_WMSIZE),
2951 		    MD_SP_WMSIZE, EXTTYP_END, NO_SEQUENCE_NUMBER, NO_FLAGS,
2952 		    meta_sp_cmp_by_offset);
2953 		meta_sp_list_freefill(extent_listpp, free_space);
2954 	}
2955 	return (can_use);
2956 }
2957 
2958 /*
2959  * FUNCTION:	meta_sp_can_create_sps()
2960  * INPUT:	mdsetnamep - a reference to the mdsetname_t structure
2961  *			     for the set containing the device for
2962  *			     which the extents are to be listed
2963  *		mdnamep - a reference to the mdname_t of the device
2964  *			  on which the soft parititions are to be created
2965  *		number_of_sps - the desired number of soft partitions
2966  *		sp_size - the desired soft partition size
2967  * OUTPUT:	boolean_t return value
2968  * RETURNS:	boolean_t - B_TRUE if the soft partitionns can be created,
2969  *			    B_FALSE if not
2970  * PURPOSE:	determines whether a set of soft partitions can be created
2971  *		on a device
2972  */
2973 boolean_t
2974 meta_sp_can_create_sps(
2975 	mdsetname_t	*mdsetnamep,
2976 	mdname_t	*mdnamep,
2977 	int		number_of_sps,
2978 	blkcnt_t	sp_size
2979 )
2980 {
2981 	sp_ext_node_t	*extent_listp;
2982 	boolean_t	succeeded;
2983 	md_error_t	mde;
2984 
2985 	if ((number_of_sps > 0) && (sp_size > 0)) {
2986 		succeeded = meta_sp_get_extent_list(mdsetnamep, mdnamep,
2987 		    &extent_listp, &mde);
2988 	} else {
2989 		succeeded = B_FALSE;
2990 	}
2991 
2992 	/*
2993 	 * We don't really care about an error return from the
2994 	 * alignment call; that will just result in passing zero,
2995 	 * which will be interpreted as no alignment.
2996 	 */
2997 
2998 	if (succeeded == B_TRUE) {
2999 		succeeded = meta_sp_enough_space(number_of_sps,
3000 		    sp_size, &extent_listp,
3001 		    meta_sp_get_default_alignment(mdsetnamep, mdnamep, &mde));
3002 		meta_sp_list_free(&extent_listp);
3003 	}
3004 	return (succeeded);
3005 }
3006 
3007 /*
3008  * FUNCTION:	meta_sp_can_create_sps_on_drive()
3009  * INPUT:	mdsetnamep - a reference to the mdsetname_t structure
3010  *			     for the set containing the drive for
3011  *			     which the extents are to be listed
3012  *		mddrivenamep - a reference to the mddrivename_t of the drive
3013  *			       on which the soft parititions are to be created
3014  *		number_of_sps - the desired number of soft partitions
3015  *		sp_size - the desired soft partition size
3016  * OUTPUT:	boolean_t return value
3017  * RETURNS:	boolean_t - B_TRUE if the soft partitionns can be created,
3018  *			    B_FALSE if not
3019  * PURPOSE:	determines whether a set of soft partitions can be created
3020  *		on a drive if the entire drive is soft partitioned
3021  */
3022 boolean_t
3023 meta_sp_can_create_sps_on_drive(
3024 	mdsetname_t	*mdsetnamep,
3025 	mddrivename_t	*mddrivenamep,
3026 	int		number_of_sps,
3027 	blkcnt_t	sp_size
3028 )
3029 {
3030 	sp_ext_node_t	*extent_listp;
3031 	boolean_t	succeeded;
3032 
3033 	if ((number_of_sps > 0) && (sp_size > 0)) {
3034 		succeeded = meta_sp_get_extent_list_for_drive(mdsetnamep,
3035 		    mddrivenamep, &extent_listp);
3036 	} else {
3037 		succeeded = B_FALSE;
3038 	}
3039 
3040 	/*
3041 	 * We don't care about alignment on the space call because
3042 	 * we're specifically dealing with a drive, which will have no
3043 	 * inherent alignment.
3044 	 */
3045 
3046 	if (succeeded == B_TRUE) {
3047 		succeeded = meta_sp_enough_space(number_of_sps, sp_size,
3048 		    &extent_listp, SP_UNALIGNED);
3049 		meta_sp_list_free(&extent_listp);
3050 	}
3051 	return (succeeded);
3052 }
3053 
3054 /*
3055  * FUNCTION:	meta_sp_get_free_space()
3056  * INPUT:	mdsetnamep - a reference to the mdsetname_t structure
3057  *			     for the set containing the device for
3058  *			     which the free space is to be returned
3059  *		mdnamep - a reference to the mdname_t of the device
3060  *			  for which the free space is to be returned
3061  * OUTPUT:	blkcnt_t return value
3062  * RETURNS:	blkcnt_t - the number of blocks of free space on the device
3063  * PURPOSE:	returns the number of blocks of free space on a device
3064  */
3065 blkcnt_t
3066 meta_sp_get_free_space(
3067 	mdsetname_t	*mdsetnamep,
3068 	mdname_t	*mdnamep
3069 )
3070 {
3071 	sp_ext_node_t		*extent_listp;
3072 	sp_ext_length_t		free_blocks;
3073 	boolean_t		succeeded;
3074 	md_error_t		mde;
3075 
3076 	extent_listp = NULL;
3077 	free_blocks = 0;
3078 	succeeded = meta_sp_get_extent_list(mdsetnamep, mdnamep,
3079 	    &extent_listp, &mde);
3080 	if (succeeded == B_TRUE) {
3081 		free_blocks = meta_sp_list_size(extent_listp,
3082 		    EXTTYP_FREE, INCLUDE_WM);
3083 		meta_sp_list_free(&extent_listp);
3084 		if (free_blocks > (10 * MD_SP_WMSIZE)) {
3085 			/*
3086 			 * Subtract a safety margin for watermarks when
3087 			 * computing the number of blocks available for
3088 			 * use.  The actual number of watermarks can't
3089 			 * be calculated without knowing the exact numbers
3090 			 * and sizes of both the free extents and the soft
3091 			 * partitions to be created.  The calculation is
3092 			 * highly complex and error-prone even if those
3093 			 * quantities are known.  The approximate value
3094 			 * 10 * MD_SP_WMSIZE is within a few blocks of the
3095 			 * correct value in all practical cases.
3096 			 */
3097 			free_blocks = free_blocks - (10 * MD_SP_WMSIZE);
3098 		} else {
3099 			free_blocks = 0;
3100 		}
3101 	} else {
3102 		mdclrerror(&mde);
3103 	}
3104 
3105 	return (free_blocks);
3106 }
3107 
3108 /*
3109  * FUNCTION:	meta_sp_get_free_space_on_drive()
3110  * INPUT:	mdsetnamep - a reference to the mdsetname_t structure
3111  *			     for the set containing the drive for
3112  *			     which the free space is to be returned
3113  *		mddrivenamep - a reference to the mddrivename_t of the drive
3114  *			       for which the free space is to be returned
3115  * OUTPUT:	blkcnt_t return value
3116  * RETURNS:	blkcnt_t - the number of blocks of free space on the drive
3117  * PURPOSE:	returns the number of blocks of space usable for soft
3118  *		partitions on an entire drive, if the entire drive is
3119  *		soft partitioned
3120  */
3121 blkcnt_t
3122 meta_sp_get_free_space_on_drive(
3123 	mdsetname_t	*mdsetnamep,
3124 	mddrivename_t	*mddrivenamep
3125 )
3126 {
3127 	sp_ext_node_t		*extent_listp;
3128 	sp_ext_length_t		free_blocks;
3129 	boolean_t		succeeded;
3130 
3131 	extent_listp = NULL;
3132 	free_blocks = 0;
3133 	succeeded = meta_sp_get_extent_list_for_drive(mdsetnamep,
3134 	    mddrivenamep, &extent_listp);
3135 	if (succeeded == B_TRUE) {
3136 		free_blocks = meta_sp_list_size(extent_listp,
3137 		    EXTTYP_FREE, INCLUDE_WM);
3138 		meta_sp_list_free(&extent_listp);
3139 		if (free_blocks > (10 * MD_SP_WMSIZE)) {
3140 			/*
3141 			 * Subtract a safety margin for watermarks when
3142 			 * computing the number of blocks available for
3143 			 * use.  The actual number of watermarks can't
3144 			 * be calculated without knowing the exact numbers
3145 			 * and sizes of both the free extents and the soft
3146 			 * partitions to be created.  The calculation is
3147 			 * highly complex and error-prone even if those
3148 			 * quantities are known.  The approximate value
3149 			 * 10 * MD_SP_WMSIZE is within a few blocks of the
3150 			 * correct value in all practical cases.
3151 			 */
3152 			free_blocks = free_blocks - (10 * MD_SP_WMSIZE);
3153 		} else {
3154 			free_blocks = 0;
3155 		}
3156 	}
3157 	return (free_blocks);
3158 }
3159 
3160 /*
3161  * FUNCTION:	meta_sp_get_number_of_possible_sps()
3162  * INPUT:	mdsetnamep - a reference to the mdsetname_t structure
3163  *			     for the set containing the device for
3164  *			     which the number of possible soft partitions
3165  *			     is to be returned
3166  *		mdnamep - a reference to the mdname_t of the device
3167  *			  for which the number of possible soft partitions
3168  *			  is to be returned
3169  * OUTPUT:	int return value
3170  * RETURNS:	int - the number of soft partitions of the desired size
3171  *		      that can be created on the device
3172  * PURPOSE:	returns the number of soft partitions of a given size
3173  *		that can be created on a device
3174  */
3175 int
3176 meta_sp_get_number_of_possible_sps(
3177 	mdsetname_t	*mdsetnamep,
3178 	mdname_t	*mdnamep,
3179 	blkcnt_t	sp_size
3180 )
3181 {
3182 	sp_ext_node_t	*extent_listp;
3183 	int		number_of_possible_sps;
3184 	boolean_t	succeeded;
3185 	md_error_t	mde;
3186 	sp_ext_length_t	alignment;
3187 
3188 	extent_listp = NULL;
3189 	number_of_possible_sps = 0;
3190 	if (sp_size > 0) {
3191 		if ((succeeded = meta_sp_get_extent_list(mdsetnamep,
3192 		    mdnamep, &extent_listp, &mde)) == B_FALSE)
3193 			mdclrerror(&mde);
3194 	} else {
3195 		succeeded = B_FALSE;
3196 	}
3197 
3198 	if (succeeded == B_TRUE) {
3199 		alignment = meta_sp_get_default_alignment(mdsetnamep,
3200 		    mdnamep, &mde);
3201 	}
3202 
3203 	while (succeeded == B_TRUE) {
3204 		/*
3205 		 * Keep allocating space from the extent list
3206 		 * for soft partitions of the desired size until
3207 		 * there's not enough free space left in the list
3208 		 * for another soft partiition of that size.
3209 		 * Add one to the number of possible soft partitions
3210 		 * for each soft partition for which there is
3211 		 * enough free space left.
3212 		 */
3213 		succeeded = meta_sp_enough_space(ONE_SOFT_PARTITION,
3214 		    sp_size, &extent_listp, alignment);
3215 		if (succeeded == B_TRUE) {
3216 			number_of_possible_sps++;
3217 		}
3218 	}
3219 	if (extent_listp != NULL) {
3220 		meta_sp_list_free(&extent_listp);
3221 	}
3222 	return (number_of_possible_sps);
3223 }
3224 
3225 /*
3226  * FUNCTION:	meta_sp_get_number_of_possible_sps_on_drive()
3227  * INPUT:	mdsetnamep - a reference to the mdsetname_t structure
3228  *			     for the set containing the drive for
3229  *			     which the number of possible soft partitions
3230  *			     is to be returned
3231  *		mddrivenamep - a reference to the mddrivename_t of the drive
3232  *			       for which the number of possible soft partitions
3233  *			       is to be returned
3234  *		sp_size - the size in blocks of the proposed soft partitions
3235  * OUTPUT:	int return value
3236  * RETURNS:	int - the number of soft partitions of the desired size
3237  *		      that can be created on the drive
3238  * PURPOSE:	returns the number of soft partitions of a given size
3239  *		that can be created on a drive, if the entire drive is
3240  *		soft partitioned
3241  */
3242 int
3243 meta_sp_get_number_of_possible_sps_on_drive(
3244 	mdsetname_t	*mdsetnamep,
3245 	mddrivename_t	*mddrivenamep,
3246 	blkcnt_t	sp_size
3247 )
3248 {
3249 	sp_ext_node_t	*extent_listp;
3250 	int		number_of_possible_sps;
3251 	boolean_t	succeeded;
3252 
3253 	extent_listp = NULL;
3254 	number_of_possible_sps = 0;
3255 	if (sp_size > 0) {
3256 		succeeded = meta_sp_get_extent_list_for_drive(mdsetnamep,
3257 		    mddrivenamep, &extent_listp);
3258 	} else {
3259 		succeeded = B_FALSE;
3260 	}
3261 	while (succeeded == B_TRUE) {
3262 		/*
3263 		 * Keep allocating space from the extent list
3264 		 * for soft partitions of the desired size until
3265 		 * there's not enough free space left in the list
3266 		 * for another soft partition of that size.
3267 		 * Add one to the number of possible soft partitions
3268 		 * for each soft partition for which there is
3269 		 * enough free space left.
3270 		 *
3271 		 * Since it's a drive, not a metadevice, make no
3272 		 * assumptions about alignment.
3273 		 */
3274 		succeeded = meta_sp_enough_space(ONE_SOFT_PARTITION,
3275 		    sp_size, &extent_listp, SP_UNALIGNED);
3276 		if (succeeded == B_TRUE) {
3277 			number_of_possible_sps++;
3278 		}
3279 	}
3280 	if (extent_listp != NULL) {
3281 		meta_sp_list_free(&extent_listp);
3282 	}
3283 	return (number_of_possible_sps);
3284 }
3285 
3286 /*
3287  * FUNCTION:	meta_sp_get_possible_sp_size()
3288  * INPUT:	mdsetnamep - a reference to the mdsetname_t structure
3289  *			     for the set containing the device for
3290  *			     which the possible soft partition size
3291  *			     is to be returned
3292  *		mdnamep - a reference to the mdname_t of the device
3293  *			  for which the possible soft partition size
3294  *			  is to be returned
3295  *		number_of_sps - the desired number of soft partitions
3296  * OUTPUT:	blkcnt_t return value
3297  * RETURNS:	blkcnt_t - the possible soft partition size in blocks
3298  * PURPOSE:	returns the maximum possible size of each of a given number of
3299  *		soft partitions of equal size that can be created on a device
3300  */
3301 blkcnt_t
3302 meta_sp_get_possible_sp_size(
3303 	mdsetname_t	*mdsetnamep,
3304 	mdname_t	*mdnamep,
3305 	int		number_of_sps
3306 )
3307 {
3308 	blkcnt_t	free_blocks;
3309 	blkcnt_t	sp_size;
3310 	boolean_t	succeeded;
3311 
3312 	sp_size = 0;
3313 	if (number_of_sps > 0) {
3314 		free_blocks = meta_sp_get_free_space(mdsetnamep, mdnamep);
3315 		sp_size = free_blocks / number_of_sps;
3316 		succeeded = meta_sp_can_create_sps(mdsetnamep, mdnamep,
3317 		    number_of_sps, sp_size);
3318 		while ((succeeded == B_FALSE) && (sp_size > 0)) {
3319 			/*
3320 			 * To compensate for space that may have been
3321 			 * occupied by watermarks, reduce sp_size by a
3322 			 * number of blocks equal to the number of soft
3323 			 * partitions desired, and test again to see
3324 			 * whether the desired number of soft partitions
3325 			 * can be created.
3326 			 */
3327 			sp_size = sp_size - ((blkcnt_t)number_of_sps);
3328 			succeeded = meta_sp_can_create_sps(mdsetnamep, mdnamep,
3329 			    number_of_sps, sp_size);
3330 		}
3331 		if (sp_size < 0) {
3332 			sp_size = 0;
3333 		}
3334 	}
3335 	return (sp_size);
3336 }
3337 
3338 /*
3339  * FUNCTION:	meta_sp_get_possible_sp_size_on_drive()
3340  * INPUT:	mdsetnamep - a reference to the mdsetname_t structure
3341  *			     for the set containing the drive for
3342  *			     which the possible soft partition size
3343  *			     is to be returned
3344  *		mddrivenamep - a reference to the mddrivename_t of the drive
3345  *			       for which the possible soft partition size
3346  *			       is to be returned
3347  *		number_of_sps - the desired number of soft partitions
3348  * OUTPUT:	blkcnt_t return value
3349  * RETURNS:	blkcnt_t - the possible soft partition size in blocks
3350  * PURPOSE:	returns the maximum possible size of each of a given number of
3351  *		soft partitions of equal size that can be created on a drive
3352  *              if the entire drive is soft partitioned
3353  */
3354 blkcnt_t
3355 meta_sp_get_possible_sp_size_on_drive(
3356 	mdsetname_t	*mdsetnamep,
3357 	mddrivename_t	*mddrivenamep,
3358 	int		number_of_sps
3359 )
3360 {
3361 	blkcnt_t	free_blocks;
3362 	blkcnt_t	sp_size;
3363 	boolean_t	succeeded;
3364 
3365 	sp_size = 0;
3366 	if (number_of_sps > 0) {
3367 		free_blocks = meta_sp_get_free_space_on_drive(mdsetnamep,
3368 		    mddrivenamep);
3369 		sp_size = free_blocks / number_of_sps;
3370 		succeeded = meta_sp_can_create_sps_on_drive(mdsetnamep,
3371 		    mddrivenamep, number_of_sps, sp_size);
3372 		while ((succeeded == B_FALSE) && (sp_size > 0)) {
3373 			/*
3374 			 * To compensate for space that may have been
3375 			 * occupied by watermarks, reduce sp_size by a
3376 			 * number of blocks equal to the number of soft
3377 			 * partitions desired, and test again to see
3378 			 * whether the desired number of soft partitions
3379 			 * can be created.
3380 			 */
3381 			sp_size = sp_size - ((blkcnt_t)number_of_sps);
3382 			succeeded = meta_sp_can_create_sps_on_drive(mdsetnamep,
3383 			    mddrivenamep, number_of_sps, sp_size);
3384 		}
3385 		if (sp_size < 0) {
3386 			sp_size = 0;
3387 		}
3388 	}
3389 	return (sp_size);
3390 }
3391 
3392 /*
3393  * **************************************************************************
3394  *                  Unit Structure Manipulation Functions                   *
3395  * **************************************************************************
3396  */
3397 
3398 /*
3399  * FUNCTION:	meta_sp_fillextarray()
3400  * INPUT:	mp	- the unit structure to fill
3401  *		extlist	- the list of extents to fill with
3402  * OUTPUT:	none
3403  * RETURNS:	void
3404  * PURPOSE:	fills in the unit structure extent list with the extents
3405  *		specified by extlist.  Only extents in extlist with the
3406  *		EXTFLG_UPDATE flag are changed in the unit structure,
3407  *		and the index into the unit structure is the sequence
3408  *		number in the extent list.  After all of the nodes have
3409  *		been updated the virtual offsets in the unit structure
3410  *		are updated to reflect the new lengths.
3411  */
3412 static void
3413 meta_sp_fillextarray(
3414 	mp_unit_t	*mp,
3415 	sp_ext_node_t	*extlist
3416 )
3417 {
3418 	int	i;
3419 	sp_ext_node_t	*ext;
3420 	sp_ext_offset_t	curvoff = 0LL;
3421 
3422 	assert(mp != NULL);
3423 
3424 	/* go through the allocation list and fill in our unit structure */
3425 	for (ext = extlist; ext != NULL; ext = ext->ext_next) {
3426 		if ((ext->ext_type == EXTTYP_ALLOC) &&
3427 		    (ext->ext_flags & EXTFLG_UPDATE) != 0) {
3428 			mp->un_ext[ext->ext_seq].un_poff =
3429 			    ext->ext_offset + MD_SP_WMSIZE;
3430 			mp->un_ext[ext->ext_seq].un_len =
3431 			    ext->ext_length - MD_SP_WMSIZE;
3432 		}
3433 	}
3434 
3435 	for (i = 0; i < mp->un_numexts; i++) {
3436 		assert(mp->un_ext[i].un_poff != 0);
3437 		assert(mp->un_ext[i].un_len  != 0);
3438 		mp->un_ext[i].un_voff = curvoff;
3439 		curvoff += mp->un_ext[i].un_len;
3440 	}
3441 }
3442 
3443 /*
3444  * FUNCTION:	meta_sp_createunit()
3445  * INPUT:	np	- the name of the device to create a unit structure for
3446  *		compnp	- the name of the device the soft partition is on
3447  *		extlist	- the extent list to populate the new unit with
3448  *		numexts	- the number of extents in the extent list
3449  *		len	- the total size of the soft partition (sectors)
3450  *		status	- the initial status of the unit structure
3451  * OUTPUT:	ep	- return error pointer
3452  * RETURNS:	mp_unit_t * - the new unit structure.
3453  * PURPOSE:	allocates and fills in a new soft partition unit
3454  *		structure to be passed to the soft partitioning driver
3455  *		for creation.
3456  */
3457 static mp_unit_t *
3458 meta_sp_createunit(
3459 	mdname_t	*np,
3460 	mdname_t	*compnp,
3461 	sp_ext_node_t	*extlist,
3462 	int		numexts,
3463 	sp_ext_length_t	len,
3464 	sp_status_t	status,
3465 	md_error_t	*ep
3466 )
3467 {
3468 	mp_unit_t	*mp;
3469 	uint_t		ms_size;
3470 
3471 	ms_size = (sizeof (*mp) - sizeof (mp->un_ext[0])) +
3472 	    (numexts * sizeof (mp->un_ext[0]));
3473 
3474 	mp = Zalloc(ms_size);
3475 
3476 	/* fill in fields in common unit structure */
3477 	mp->c.un_type = MD_METASP;
3478 	mp->c.un_size = ms_size;
3479 	MD_SID(mp) = meta_getminor(np->dev);
3480 	mp->c.un_total_blocks = len;
3481 	mp->c.un_actual_tb = len;
3482 
3483 	/* set up geometry */
3484 	(void) meta_sp_setgeom(np, compnp, mp, ep);
3485 
3486 	/* if we're building on metadevice we can't parent */
3487 	if (metaismeta(compnp))
3488 		MD_CAPAB(mp) = MD_CANT_PARENT;
3489 	else
3490 		MD_CAPAB(mp) = MD_CAN_PARENT;
3491 
3492 	/* fill soft partition-specific fields */
3493 	mp->un_dev = compnp->dev;
3494 	mp->un_key = compnp->key;
3495 
3496 	/* mdname_t start_blk field is not 64-bit! */
3497 	mp->un_start_blk = (sp_ext_offset_t)compnp->start_blk;
3498 	mp->un_status = status;
3499 	mp->un_numexts = numexts;
3500 	mp->un_length = len;
3501 
3502 	/* fill in the extent array */
3503 	meta_sp_fillextarray(mp, extlist);
3504 
3505 	return (mp);
3506 }
3507 
3508 /*
3509  * FUNCTION:	meta_sp_updateunit()
3510  * INPUT:	np       - name structure for the metadevice being updated
3511  *		old_un	 - the original unit structure that is being updated
3512  *		extlist	 - the extent list to populate the new unit with
3513  *		grow_len - the amount by which the partition is being grown
3514  *		numexts	 - the number of extents in the extent list
3515  *		ep       - return error pointer
3516  * OUTPUT:	none
3517  * RETURNS:	mp_unit_t * - the updated unit structure
3518  * PURPOSE:	allocates and fills in a new soft partition unit structure to
3519  *		be passed to the soft partitioning driver for creation.  The
3520  *		old unit structure is first copied in, and then the updated
3521  *		extents are changed in the new unit structure.  This is
3522  *		typically used when the size of an existing unit is changed.
3523  */
3524 static mp_unit_t *
3525 meta_sp_updateunit(
3526 	mdname_t	*np,
3527 	mp_unit_t	*old_un,
3528 	sp_ext_node_t	*extlist,
3529 	sp_ext_length_t	grow_len,
3530 	int		numexts,
3531 	md_error_t	*ep
3532 )
3533 {
3534 	mp_unit_t	*new_un;
3535 	sp_ext_length_t	new_len;
3536 	uint_t		new_size;
3537 
3538 	assert(old_un != NULL);
3539 	assert(extlist != NULL);
3540 
3541 	/* allocate new unit structure and copy in old unit */
3542 	new_size = (sizeof (*old_un) - sizeof (old_un->un_ext[0])) +
3543 	    ((old_un->un_numexts + numexts) * sizeof (old_un->un_ext[0]));
3544 	new_len = old_un->un_length + grow_len;
3545 	new_un = Zalloc(new_size);
3546 	bcopy(old_un, new_un, old_un->c.un_size);
3547 
3548 	/* update size and geometry information */
3549 	new_un->c.un_size = new_size;
3550 	new_un->un_length = new_len;
3551 	new_un->c.un_total_blocks = new_len;
3552 	new_un->c.un_actual_tb = new_len;
3553 	if (meta_adjust_geom((md_unit_t *)new_un, np,
3554 	    old_un->c.un_wr_reinstruct, old_un->c.un_rd_reinstruct,
3555 	    0, ep) != 0) {
3556 		Free(new_un);
3557 		return (NULL);
3558 	}
3559 
3560 	/* update extent information */
3561 	new_un->un_numexts += numexts;
3562 
3563 	meta_sp_fillextarray(new_un, extlist);
3564 
3565 	return (new_un);
3566 }
3567 
3568 /*
3569  * FUNCTION:	meta_get_sp()
3570  * INPUT:	sp	- the set name for the device to get
3571  *		np	- the name of the device to get
3572  * OUTPUT:	ep	- return error pointer
3573  * RETURNS:	md_sp_t * - the XDR unit structure for the soft partition
3574  * PURPOSE:	interface to the rest of libmeta for fetching a unit structure
3575  *		for the named device.  Just a wrapper for meta_get_sp_common().
3576  */
3577 md_sp_t *
3578 meta_get_sp(
3579 	mdsetname_t	*sp,
3580 	mdname_t	*np,
3581 	md_error_t	*ep
3582 )
3583 {
3584 	return (meta_get_sp_common(sp, np, 0, ep));
3585 }
3586 
3587 /*
3588  * FUNCTION:	meta_get_sp_common()
3589  * INPUT:	sp	- the set name for the device to get
3590  *		np	- the name of the device to get
3591  *		fast	- whether to use the cache or not (NOT IMPLEMENTED!)
3592  * OUTPUT:	ep	- return error pointer
3593  * RETURNS:	md_sp_t * - the XDR unit structure for the soft partition,
3594  *			    NULL if np is not a soft partition
3595  * PURPOSE:	common routine for fetching a soft partition unit structure
3596  */
3597 md_sp_t *
3598 meta_get_sp_common(
3599 	mdsetname_t	*sp,
3600 	mdname_t	*np,
3601 	int		fast,
3602 	md_error_t	*ep
3603 )
3604 {
3605 	mddrivename_t	*dnp = np->drivenamep;
3606 	char		*miscname;
3607 	mp_unit_t	*mp;
3608 	md_sp_t		*msp;
3609 	int		i;
3610 
3611 	/* must have set */
3612 	assert(sp != NULL);
3613 
3614 	/* short circuit */
3615 	if (dnp->unitp != NULL) {
3616 		if (dnp->unitp->type != MD_METASP)
3617 			return (NULL);
3618 		return ((md_sp_t *)dnp->unitp);
3619 	}
3620 	/* get miscname and unit */
3621 	if ((miscname = metagetmiscname(np, ep)) == NULL)
3622 		return (NULL);
3623 
3624 	if (strcmp(miscname, MD_SP) != 0) {
3625 		(void) mdmderror(ep, MDE_NOT_SP, 0, np->cname);
3626 		return (NULL);
3627 	}
3628 
3629 	if ((mp = (mp_unit_t *)meta_get_mdunit(sp, np, ep)) == NULL)
3630 		return (NULL);
3631 
3632 	assert(mp->c.un_type == MD_METASP);
3633 
3634 	/* allocate soft partition */
3635 	msp = Zalloc(sizeof (*msp));
3636 
3637 	/* get the common information */
3638 	msp->common.namep = np;
3639 	msp->common.type = mp->c.un_type;
3640 	msp->common.state = mp->c.un_status;
3641 	msp->common.capabilities = mp->c.un_capabilities;
3642 	msp->common.parent = mp->c.un_parent;
3643 	msp->common.size = mp->c.un_total_blocks;
3644 	msp->common.user_flags = mp->c.un_user_flags;
3645 	msp->common.revision = mp->c.un_revision;
3646 
3647 	/* get soft partition information */
3648 	if ((msp->compnamep = metakeyname(&sp, mp->un_key, fast, ep)) == NULL)
3649 		goto out;
3650 
3651 	/*
3652 	 * Fill in the key and the start block.  Note that the start
3653 	 * block in the unit structure is 64 bits but the name pointer
3654 	 * only supports 32 bits.
3655 	 */
3656 	msp->compnamep->key = mp->un_key;
3657 	msp->compnamep->start_blk = mp->un_start_blk;
3658 
3659 	/* fill in status field */
3660 	msp->status = mp->un_status;
3661 
3662 	/* allocate the extents */
3663 	msp->ext.ext_val = Zalloc(mp->un_numexts * sizeof (*msp->ext.ext_val));
3664 	msp->ext.ext_len = mp->un_numexts;
3665 
3666 	/* do the extents for this soft partition */
3667 	for (i = 0; i < mp->un_numexts; i++) {
3668 		struct mp_ext	*mde = &mp->un_ext[i];
3669 		md_sp_ext_t	*extp = &msp->ext.ext_val[i];
3670 
3671 		extp->voff = mde->un_voff;
3672 		extp->poff = mde->un_poff;
3673 		extp->len = mde->un_len;
3674 	}
3675 
3676 	/* cleanup, return success */
3677 	Free(mp);
3678 	dnp->unitp = (md_common_t *)msp;
3679 	return (msp);
3680 
3681 out:
3682 	/* clean up and return error */
3683 	Free(mp);
3684 	Free(msp);
3685 	return (NULL);
3686 }
3687 
3688 
3689 /*
3690  * FUNCTION:	meta_init_sp()
3691  * INPUT:	spp	- the set name for the new device
3692  *		argc	- the remaining argument count for the metainit cmdline
3693  *		argv	- the remainder of the unparsed command line
3694  *		options	- global options parsed by metainit
3695  * OUTPUT:	ep	- return error pointer
3696  * RETURNS:	int	- -1 failure, 0 success
3697  * PURPOSE:	provides the command line parsing and name management overhead
3698  *		for creating a new soft partition.  Ultimately this calls
3699  *		meta_create_sp() which does the real work of allocating space
3700  *		for the new soft partition.
3701  */
3702 int
3703 meta_init_sp(
3704 	mdsetname_t	**spp,
3705 	int		argc,
3706 	char		*argv[],
3707 	mdcmdopts_t	options,
3708 	md_error_t	*ep
3709 )
3710 {
3711 	char		*compname = NULL;
3712 	mdname_t	*spcompnp = NULL;	/* name of component volume */
3713 	char		*devname = argv[0];	/* unit name */
3714 	mdname_t	*np = NULL;		/* name of soft partition */
3715 	md_sp_t		*msp = NULL;
3716 	int		c;
3717 	int		old_optind;
3718 	sp_ext_length_t	len = 0LL;
3719 	int		rval = -1;
3720 	uint_t		seq;
3721 	int		oflag;
3722 	int		failed;
3723 	mddrivename_t	*dnp = NULL;
3724 	sp_ext_length_t	alignment = 0LL;
3725 	sp_ext_node_t	*extlist = NULL;
3726 
3727 	assert(argc > 0);
3728 
3729 	/* expect sp name, -p, optional -e, compname, and size parameters */
3730 	/* grab soft partition name */
3731 	if ((np = metaname(spp, devname, META_DEVICE, ep)) == NULL)
3732 		goto out;
3733 
3734 	/* see if it exists already */
3735 	if (metagetmiscname(np, ep) != NULL) {
3736 		(void) mdmderror(ep, MDE_UNIT_ALREADY_SETUP,
3737 		    meta_getminor(np->dev), devname);
3738 		goto out;
3739 	} else if (! mdismderror(ep, MDE_UNIT_NOT_SETUP)) {
3740 		goto out;
3741 	} else {
3742 		mdclrerror(ep);
3743 	}
3744 	--argc, ++argv;
3745 
3746 	if (argc == 0)
3747 		goto syntax;
3748 
3749 	/* grab -p */
3750 	if (strcmp(argv[0], "-p") != 0)
3751 		goto syntax;
3752 	--argc, ++argv;
3753 
3754 	if (argc == 0)
3755 		goto syntax;
3756 
3757 	/* see if -e is there */
3758 	if (strcmp(argv[0], "-e") == 0) {
3759 		/* use the whole disk */
3760 		options |= MDCMD_USE_WHOLE_DISK;
3761 		--argc, ++argv;
3762 	}
3763 
3764 	if (argc == 0)
3765 		goto syntax;
3766 
3767 	/* get component name */
3768 	compname = Strdup(argv[0]);
3769 
3770 	if (options & MDCMD_USE_WHOLE_DISK) {
3771 		if ((dnp = metadrivename(spp, compname, ep)) == NULL) {
3772 			goto out;
3773 		}
3774 		if ((spcompnp = metaslicename(dnp, 0, ep)) == NULL) {
3775 			goto out;
3776 		}
3777 	} else if ((spcompnp = metaname(spp, compname, UNKNOWN, ep)) == NULL) {
3778 		goto out;
3779 	}
3780 	assert(*spp != NULL);
3781 
3782 	if (!(options & MDCMD_NOLOCK)) {
3783 		/* grab set lock */
3784 		if (meta_lock(*spp, TRUE, ep))
3785 			goto out;
3786 
3787 		if (meta_check_ownership(*spp, ep) != 0)
3788 			goto out;
3789 	}
3790 
3791 	/* allocate the soft partition */
3792 	msp = Zalloc(sizeof (*msp));
3793 
3794 	/* setup common */
3795 	msp->common.namep = np;
3796 	msp->common.type = MD_METASP;
3797 
3798 	compname = spcompnp->cname;
3799 
3800 	assert(spcompnp->rname != NULL);
3801 	--argc, ++argv;
3802 
3803 	if (argc == 0) {
3804 		goto syntax;
3805 	}
3806 
3807 	if (*argv[0] == '-') {
3808 		/*
3809 		 * parse any other command line options, this includes
3810 		 * the recovery options -o and -b. The special thing
3811 		 * with these options is that the len needs to be
3812 		 * kept track of otherwise when the geometry of the
3813 		 * "device" is built it will create an invalid geometry
3814 		 */
3815 		old_optind = optind = 0;
3816 		opterr = 0;
3817 		oflag = 0;
3818 		seq = 0;
3819 		failed = 0;
3820 		while ((c = getopt(argc, argv, "A:o:b:")) != -1) {
3821 			sp_ext_offset_t	offset;
3822 			sp_ext_length_t	length;
3823 			longlong_t	tmp_size;
3824 
3825 			switch (c) {
3826 			case 'A':	/* data alignment */
3827 				if (meta_sp_parsesizestring(optarg,
3828 				    &alignment) == -1) {
3829 					failed = 1;
3830 				}
3831 				break;
3832 			case 'o':	/* offset in the partition */
3833 				if (oflag == 1) {
3834 					failed = 1;
3835 				} else {
3836 					tmp_size = atoll(optarg);
3837 					if (tmp_size <= 0) {
3838 						failed = 1;
3839 					} else {
3840 						oflag = 1;
3841 						options |= MDCMD_DIRECT;
3842 
3843 						offset = tmp_size;
3844 					}
3845 				}
3846 
3847 				break;
3848 			case 'b':	/* number of blocks */
3849 				if (oflag == 0) {
3850 					failed = 1;
3851 				} else {
3852 					tmp_size = atoll(optarg);
3853 					if (tmp_size <= 0) {
3854 						failed = 1;
3855 					} else {
3856 						oflag = 0;
3857 
3858 						length = tmp_size;
3859 
3860 						/* we have a pair of values */
3861 						meta_sp_list_insert(*spp, np,
3862 						    &extlist, offset, length,
3863 						    EXTTYP_ALLOC, seq++,
3864 						    EXTFLG_UPDATE,
3865 						    meta_sp_cmp_by_offset);
3866 						len += length;
3867 					}
3868 				}
3869 
3870 				break;
3871 			default:
3872 				argc -= old_optind;
3873 				argv += old_optind;
3874 				goto options;
3875 			}
3876 
3877 			if (failed) {
3878 				argc -= old_optind;
3879 				argv += old_optind;
3880 				goto syntax;
3881 			}
3882 
3883 			old_optind = optind;
3884 		}
3885 		argc -= optind;
3886 		argv += optind;
3887 
3888 		/*
3889 		 * Must have matching pairs of -o and -b flags
3890 		 */
3891 		if (oflag != 0)
3892 			goto syntax;
3893 
3894 		/*
3895 		 * Can't specify both layout (indicated indirectly by
3896 		 * len being set by the -o/-b cases above) AND
3897 		 * alignment
3898 		 */
3899 		if ((len > 0LL) && (alignment > 0LL))
3900 			goto syntax;
3901 
3902 		/*
3903 		 * sanity check the allocation list
3904 		 */
3905 		if ((extlist != NULL) && meta_sp_list_overlaps(extlist))
3906 			goto syntax;
3907 	}
3908 
3909 	if (len == 0LL) {
3910 		if (argc == 0)
3911 			goto syntax;
3912 		if (meta_sp_parsesize(argv[0], &len) == -1)
3913 			goto syntax;
3914 		--argc, ++argv;
3915 	}
3916 
3917 	msp->ext.ext_val = Zalloc(sizeof (*msp->ext.ext_val));
3918 	msp->ext.ext_val->len = len;
3919 	msp->compnamep = spcompnp;
3920 
3921 	/* we should be at the end */
3922 	if (argc != 0)
3923 		goto syntax;
3924 
3925 	/* create soft partition */
3926 	if (meta_create_sp(*spp, msp, extlist, options, alignment, ep) != 0)
3927 		goto out;
3928 	rval = 0;
3929 
3930 	/* let em know */
3931 	if (options & MDCMD_PRINT) {
3932 		(void) printf(dgettext(TEXT_DOMAIN,
3933 		    "%s: Soft Partition is setup\n"),
3934 		    devname);
3935 		(void) fflush(stdout);
3936 	}
3937 	goto out;
3938 
3939 syntax:
3940 	/* syntax error */
3941 	rval = meta_cook_syntax(ep, MDE_SYNTAX, compname, argc, argv);
3942 	goto out;
3943 
3944 options:
3945 	/* options error */
3946 	rval = meta_cook_syntax(ep, MDE_OPTION, compname, argc, argv);
3947 	goto out;
3948 
3949 out:
3950 	if (msp != NULL) {
3951 		if (msp->ext.ext_val != NULL) {
3952 			Free(msp->ext.ext_val);
3953 		}
3954 		Free(msp);
3955 	}
3956 
3957 	return (rval);
3958 }
3959 
3960 /*
3961  * FUNCTION:	meta_free_sp()
3962  * INPUT:	msp	- the soft partition unit to free
3963  * OUTPUT:	none
3964  * RETURNS:	void
3965  * PURPOSE:	provides an interface from the rest of libmeta for freeing a
3966  *		soft partition unit
3967  */
3968 void
3969 meta_free_sp(md_sp_t *msp)
3970 {
3971 	Free(msp);
3972 }
3973 
3974 /*
3975  * FUNCTION:	meta_sp_issp()
3976  * INPUT:	sp	- the set name to check
3977  *		np	- the name to check
3978  * OUTPUT:	ep	- return error pointer
3979  * RETURNS:	int	- 0 means sp,np is a soft partition
3980  *			  1 means sp,np is not a soft partition
3981  * PURPOSE:	determines whether the given device is a soft partition
3982  *		device.  This is called by other metadevice check routines.
3983  */
3984 int
3985 meta_sp_issp(
3986 	mdsetname_t	*sp,
3987 	mdname_t	*np,
3988 	md_error_t	*ep
3989 )
3990 {
3991 	if (meta_get_sp_common(sp, np, 0, ep) == NULL)
3992 		return (1);
3993 
3994 	return (0);
3995 }
3996 
3997 /*
3998  * FUNCTION:	meta_check_sp()
3999  * INPUT:	sp	- the set name to check
4000  *		msp	- the unit structure to check
4001  *		options	- creation options
4002  * OUTPUT:	repart_options - options to be passed to
4003  *				meta_repartition_drive()
4004  *		ep	- return error pointer
4005  * RETURNS:	int	-  0 ok to create on this component
4006  *			  -1 error or not ok to create on this component
4007  * PURPOSE:	Checks to determine whether the rules for creation of
4008  *		soft partitions allow creation of a soft partition on
4009  *		the device described by the mdname_t structure referred
4010  *		to by msp->compnamep.
4011  *
4012  *		NOTE: Does NOT check to determine whether the extents
4013  *		      described in the md_sp_t structure referred to by
4014  *		      msp will fit on the device described by the mdname_t
4015  *		      structure located at msp->compnamep.
4016  */
4017 static int
4018 meta_check_sp(
4019 	mdsetname_t	*sp,
4020 	md_sp_t		*msp,
4021 	mdcmdopts_t	options,
4022 	int		*repart_options,
4023 	md_error_t	*ep
4024 )
4025 {
4026 	md_common_t	*mdp;
4027 	mdname_t	*compnp = msp->compnamep;
4028 	uint_t		slice;
4029 	mddrivename_t	*dnp;
4030 	mdname_t	*slicenp;
4031 	mdvtoc_t	*vtocp;
4032 
4033 	/* make sure it is in the set */
4034 	if (meta_check_inset(sp, compnp, ep) != 0)
4035 		return (-1);
4036 
4037 	if ((options & MDCMD_USE_WHOLE_DISK) != 0) {
4038 		uint_t	rep_slice;
4039 
4040 		/*
4041 		 * check to make sure we can partition this drive.
4042 		 * we cannot continue if any of the following are
4043 		 * true:
4044 		 * The drive is a metadevice.
4045 		 * The drive contains a mounted slice.
4046 		 * The drive contains a slice being swapped to.
4047 		 * The drive contains slices which are part of other
4048 		 * metadevices.
4049 		 * The drive contains a metadb.
4050 		 */
4051 		if (metaismeta(compnp))
4052 			return (mddeverror(ep, MDE_IS_META, compnp->dev,
4053 			    compnp->cname));
4054 
4055 		assert(compnp->drivenamep != NULL);
4056 
4057 		/*
4058 		 * ensure that we have slice 0 since the disk will be
4059 		 * repartitioned in the USE_WHOLE_DISK case.  this check
4060 		 * is redundant unless the user incorrectly specifies a
4061 		 * a fully qualified drive AND slice name (i.e.,
4062 		 * /dev/dsk/cXtXdXsX), which will be incorrectly
4063 		 * recognized as a drive name by the metaname code.
4064 		 */
4065 
4066 		if ((vtocp = metagetvtoc(compnp, FALSE, &slice, ep)) == NULL)
4067 			return (-1);
4068 		if (slice != MD_SLICE0)
4069 			return (mderror(ep, MDE_NOT_DRIVENAME, compnp->cname));
4070 
4071 		dnp = compnp->drivenamep;
4072 		if (meta_replicaslice(dnp, &rep_slice, ep) != 0)
4073 			return (-1);
4074 
4075 		for (slice = 0; slice < vtocp->nparts; slice++) {
4076 
4077 			/* only check if the slice really exists */
4078 			if (vtocp->parts[slice].size == 0)
4079 				continue;
4080 
4081 			slicenp = metaslicename(dnp, slice, ep);
4082 			if (slicenp == NULL)
4083 				return (-1);
4084 
4085 			/* check to ensure that it is not already in use */
4086 			if (meta_check_inuse(sp,
4087 			    slicenp, MDCHK_INUSE, ep) != 0) {
4088 				return (-1);
4089 			}
4090 
4091 			/*
4092 			 * Up to this point, tests are applied to all
4093 			 * slices uniformly.
4094 			 */
4095 
4096 			if (slice == rep_slice) {
4097 				/*
4098 				 * Tests inside the body of this
4099 				 * conditional are applied only to
4100 				 * slice seven.
4101 				 */
4102 				if (meta_check_inmeta(sp, slicenp,
4103 				    options | MDCHK_ALLOW_MDDB |
4104 				    MDCHK_ALLOW_REPSLICE, 0, -1, ep) != 0)
4105 					return (-1);
4106 
4107 				/*
4108 				 * For slice seven, a metadb is NOT an
4109 				 * automatic failure. It merely means
4110 				 * that we're not allowed to muck
4111 				 * about with the partitioning of that
4112 				 * slice.  We indicate this by masking
4113 				 * in the MD_REPART_LEAVE_REP flag.
4114 				 */
4115 				if (metahasmddb(sp, slicenp, ep)) {
4116 					assert(repart_options !=
4117 					    NULL);
4118 					*repart_options |=
4119 					    MD_REPART_LEAVE_REP;
4120 				}
4121 
4122 				/*
4123 				 * Skip the remaining tests for slice
4124 				 * seven
4125 				 */
4126 				continue;
4127 			}
4128 
4129 			/*
4130 			 * Tests below this point will be applied to
4131 			 * all slices EXCEPT for the replica slice.
4132 			 */
4133 
4134 
4135 			/* check if component is in a metadevice */
4136 			if (meta_check_inmeta(sp, slicenp, options, 0,
4137 			    -1, ep) != 0)
4138 				return (-1);
4139 
4140 			/* check to see if component has a metadb */
4141 			if (metahasmddb(sp, slicenp, ep))
4142 				return (mddeverror(ep, MDE_HAS_MDDB,
4143 				    slicenp->dev, slicenp->cname));
4144 		}
4145 		/*
4146 		 * This should be all of the testing necessary when
4147 		 * the MDCMD_USE_WHOLE_DISK flag is set; the rest of
4148 		 * meta_check_sp() is oriented towards component
4149 		 * arguments instead of disks.
4150 		 */
4151 		goto meta_check_sp_ok;
4152 
4153 	}
4154 
4155 	/* check to ensure that it is not already in use */
4156 	if (meta_check_inuse(sp, compnp, MDCHK_INUSE, ep) != 0) {
4157 		return (-1);
4158 	}
4159 
4160 	if (!metaismeta(compnp)) {	/* handle non-metadevices */
4161 
4162 		/*
4163 		 * The component can have one or more soft partitions on it
4164 		 * already, but can't be part of any other type of metadevice,
4165 		 * so if it is used for a metadevice, but the metadevice
4166 		 * isn't a soft partition, return failure.
4167 		 */
4168 
4169 		if (meta_check_inmeta(sp, compnp, options, 0, -1, ep) != 0 &&
4170 		    meta_check_insp(sp, compnp, 0, -1, ep) == 0) {
4171 			return (-1);
4172 		}
4173 	} else {			/* handle metadevices */
4174 		/* get underlying unit & check capabilities */
4175 		if ((mdp = meta_get_unit(sp, compnp, ep)) == NULL)
4176 			return (-1);
4177 
4178 		if ((! (mdp->capabilities & MD_CAN_PARENT)) ||
4179 		    (! (mdp->capabilities & MD_CAN_SP)))
4180 			return (mdmderror(ep, MDE_INVAL_UNIT,
4181 			    meta_getminor(compnp->dev), compnp->cname));
4182 	}
4183 
4184 meta_check_sp_ok:
4185 	mdclrerror(ep);
4186 	return (0);
4187 }
4188 
4189 /*
4190  * FUNCTION:	meta_create_sp()
4191  * INPUT:	sp	- the set name to create in
4192  *		msp	- the unit structure to create
4193  *		oblist	- an optional list of requested extents (-o/-b options)
4194  *		options	- creation options
4195  *		alignment - data alignment
4196  * OUTPUT:	ep	- return error pointer
4197  * RETURNS:	int	-  0 success, -1 error
4198  * PURPOSE:	does most of the work for creating a soft partition.  If
4199  *		metainit -p -e was used, first partition the drive.  Then
4200  *		create an extent list based on the existing soft partitions
4201  *		and assume all space not used by them is free.  Storage for
4202  *		the new soft partition is allocated from the free extents
4203  *		based on the length specified on the command line or the
4204  *		oblist passed in.  The unit structure is then committed and
4205  *		the watermarks are updated.  Finally, the status is changed to
4206  *		Okay and the process is complete.
4207  */
4208 static int
4209 meta_create_sp(
4210 	mdsetname_t	*sp,
4211 	md_sp_t		*msp,
4212 	sp_ext_node_t	*oblist,
4213 	mdcmdopts_t	options,
4214 	sp_ext_length_t	alignment,
4215 	md_error_t	*ep
4216 )
4217 {
4218 	mdname_t	*np = msp->common.namep;
4219 	mdname_t	*compnp = msp->compnamep;
4220 	mp_unit_t	*mp = NULL;
4221 	mdnamelist_t	*keynlp = NULL, *spnlp = NULL;
4222 	md_set_params_t	set_params;
4223 	int		rval = -1;
4224 	diskaddr_t	comp_size;
4225 	diskaddr_t	sp_start;
4226 	sp_ext_node_t	*extlist = NULL;
4227 	int		numexts = 0;	/* number of extents */
4228 	int		count = 0;
4229 	int		committed = 0;
4230 	int		repart_options = MD_REPART_FORCE;
4231 	int		create_flag = MD_CRO_32BIT;
4232 
4233 	md_set_desc	*sd;
4234 	mm_unit_t	*mm;
4235 	md_set_mmown_params_t	*ownpar = NULL;
4236 	int		comp_is_mirror = 0;
4237 
4238 	/* validate soft partition */
4239 	if (meta_check_sp(sp, msp, options, &repart_options, ep) != 0)
4240 		return (-1);
4241 
4242 	if ((options & MDCMD_USE_WHOLE_DISK) != 0) {
4243 		if ((options & MDCMD_DOIT) != 0) {
4244 			if (meta_repartition_drive(sp,
4245 			    compnp->drivenamep,
4246 			    repart_options,
4247 			    NULL, /* Don't return the VTOC */
4248 			    ep) != 0)
4249 
4250 				return (-1);
4251 		} else {
4252 			/*
4253 			 * If -n and -e are both specified, it doesn't make
4254 			 * sense to continue without actually partitioning
4255 			 * the drive.
4256 			 */
4257 			return (0);
4258 		}
4259 	}
4260 
4261 	/* populate the start_blk field of the component name */
4262 	if ((sp_start = meta_sp_get_start(sp, compnp, ep)) ==
4263 	    MD_DISKADDR_ERROR) {
4264 		rval = -1;
4265 		goto out;
4266 	}
4267 
4268 	if (options & MDCMD_DOIT) {
4269 		/* store name in namespace */
4270 		if (add_key_name(sp, compnp, &keynlp, ep) != 0) {
4271 			rval = -1;
4272 			goto out;
4273 		}
4274 	}
4275 
4276 	/*
4277 	 * Get a list of the soft partitions that currently reside on
4278 	 * the component.  We should ALWAYS force reload the cache,
4279 	 * because if this is a single creation, there will not BE a
4280 	 * cached list, and if we're using the md.tab, we must rebuild
4281 	 * the list because it won't contain the previous (if any)
4282 	 * soft partition.
4283 	 */
4284 	count = meta_sp_get_by_component(sp, compnp, &spnlp, 1, ep);
4285 	if (count < 0) {
4286 		/* error occured */
4287 		rval = -1;
4288 		goto out;
4289 	}
4290 
4291 	/*
4292 	 * get the size of the underlying device.  if the size is smaller
4293 	 * than or equal to the watermark size, we know there isn't
4294 	 * enough space.
4295 	 */
4296 	if ((comp_size = metagetsize(compnp, ep)) == MD_DISKADDR_ERROR) {
4297 		rval = -1;
4298 		goto out;
4299 	} else if (comp_size <= MD_SP_WMSIZE) {
4300 		(void) mdmderror(ep, MDE_SP_NOSPACE, 0, compnp->cname);
4301 		rval = -1;
4302 		goto out;
4303 	}
4304 	/*
4305 	 * seed extlist with reserved space at the beginning of the volume and
4306 	 * enough space for the end watermark.  The end watermark always gets
4307 	 * updated, but if the underlying device changes size it may not be
4308 	 * pointed to until the extent before it is updated.  Since the
4309 	 * end of the reserved space is where the first watermark starts,
4310 	 * the reserved extent should never be marked for updating.
4311 	 */
4312 
4313 	meta_sp_list_insert(NULL, NULL, &extlist,
4314 	    0ULL, sp_start, EXTTYP_RESERVED, 0, 0, meta_sp_cmp_by_offset);
4315 	meta_sp_list_insert(NULL, NULL, &extlist,
4316 	    (sp_ext_offset_t)(comp_size - MD_SP_WMSIZE), MD_SP_WMSIZE,
4317 	    EXTTYP_END, 0, EXTFLG_UPDATE, meta_sp_cmp_by_offset);
4318 
4319 	if (meta_sp_extlist_from_namelist(sp, spnlp, &extlist, ep) == -1) {
4320 		rval = -1;
4321 		goto out;
4322 	}
4323 
4324 	metafreenamelist(spnlp);
4325 
4326 	if (getenv(META_SP_DEBUG)) {
4327 		meta_sp_debug("meta_create_sp: list of used extents:\n");
4328 		meta_sp_list_dump(extlist);
4329 	}
4330 
4331 	meta_sp_list_freefill(&extlist, metagetsize(compnp, ep));
4332 
4333 	/* get extent list from -o/-b options or from free space */
4334 	if (options & MDCMD_DIRECT) {
4335 		if (getenv(META_SP_DEBUG)) {
4336 			meta_sp_debug("meta_create_sp: Dumping -o/-b list:\n");
4337 			meta_sp_list_dump(oblist);
4338 		}
4339 
4340 		numexts = meta_sp_alloc_by_list(sp, np, &extlist, oblist);
4341 		if (numexts == -1) {
4342 			(void) mdmderror(ep, MDE_SP_OVERLAP, 0, np->cname);
4343 			rval = -1;
4344 			goto out;
4345 		}
4346 	} else {
4347 		numexts = meta_sp_alloc_by_len(sp, np, &extlist,
4348 		    &msp->ext.ext_val->len, 0LL, (alignment > 0) ? alignment :
4349 		    meta_sp_get_default_alignment(sp, compnp, ep));
4350 		if (numexts == -1) {
4351 			(void) mdmderror(ep, MDE_SP_NOSPACE, 0, np->cname);
4352 			rval = -1;
4353 			goto out;
4354 		}
4355 	}
4356 
4357 	assert(extlist != NULL);
4358 
4359 	/* create soft partition */
4360 	mp = meta_sp_createunit(msp->common.namep, msp->compnamep,
4361 	    extlist, numexts, msp->ext.ext_val->len, MD_SP_CREATEPEND, ep);
4362 
4363 	create_flag = meta_check_devicesize(mp->c.un_total_blocks);
4364 
4365 	/* if we're not doing anything (metainit -n), return success */
4366 	if (! (options & MDCMD_DOIT)) {
4367 		rval = 0;	/* success */
4368 		goto out;
4369 	}
4370 
4371 	(void) memset(&set_params, 0, sizeof (set_params));
4372 
4373 	if (create_flag == MD_CRO_64BIT) {
4374 		mp->c.un_revision |= MD_64BIT_META_DEV;
4375 		set_params.options = MD_CRO_64BIT;
4376 	} else {
4377 		mp->c.un_revision &= ~MD_64BIT_META_DEV;
4378 		set_params.options = MD_CRO_32BIT;
4379 	}
4380 
4381 	if (getenv(META_SP_DEBUG)) {
4382 		meta_sp_debug("meta_create_sp: printing unit structure\n");
4383 		meta_sp_printunit(mp);
4384 	}
4385 
4386 	/*
4387 	 * Check to see if we're trying to create a partition on a mirror. If so
4388 	 * we may have to enforce an ownership change before writing the
4389 	 * watermark out.
4390 	 */
4391 	if (metaismeta(compnp)) {
4392 		char *miscname;
4393 
4394 		miscname = metagetmiscname(compnp, ep);
4395 		if (miscname != NULL)
4396 			comp_is_mirror = (strcmp(miscname, MD_MIRROR) == 0);
4397 		else
4398 			comp_is_mirror = 0;
4399 	} else {
4400 		comp_is_mirror = 0;
4401 	}
4402 
4403 	/*
4404 	 * For a multi-node environment we have to ensure that the master
4405 	 * node owns an underlying mirror before we issue the MD_IOCSET ioctl.
4406 	 * If the master does not own the device we will deadlock as the
4407 	 * implicit write of the watermarks (in sp_ioctl.c) will cause an
4408 	 * ownership change that will block as the MD_IOCSET is still in
4409 	 * progress. To close this window we force an owner change to occur
4410 	 * before issuing the MD_IOCSET. We cannot simply open the device and
4411 	 * write to it as this will only work for the first soft-partition
4412 	 * creation.
4413 	 */
4414 
4415 	if (comp_is_mirror && !metaislocalset(sp)) {
4416 
4417 		if ((sd = metaget_setdesc(sp, ep)) == NULL) {
4418 			rval = -1;
4419 			goto out;
4420 		}
4421 		if (MD_MNSET_DESC(sd) && sd->sd_mn_am_i_master) {
4422 			mm = (mm_unit_t *)meta_get_unit(sp, compnp, ep);
4423 			if (mm == NULL) {
4424 				rval = -1;
4425 				goto out;
4426 			} else {
4427 				rval = meta_mn_change_owner(&ownpar, sp->setno,
4428 				    meta_getminor(compnp->dev),
4429 				    sd->sd_mn_mynode->nd_nodeid,
4430 				    MD_MN_MM_PREVENT_CHANGE |
4431 				    MD_MN_MM_SPAWN_THREAD);
4432 				if (rval == -1)
4433 					goto out;
4434 			}
4435 		}
4436 	}
4437 
4438 	set_params.mnum = MD_SID(mp);
4439 	set_params.size = mp->c.un_size;
4440 	set_params.mdp = (uintptr_t)mp;
4441 	MD_SETDRIVERNAME(&set_params, MD_SP, MD_MIN2SET(set_params.mnum));
4442 
4443 	/* first phase of commit. */
4444 	if (metaioctl(MD_IOCSET, &set_params, &set_params.mde,
4445 	    np->cname) != 0) {
4446 		(void) mdstealerror(ep, &set_params.mde);
4447 		rval = -1;
4448 		goto out;
4449 	}
4450 
4451 	/* we've successfully committed the record */
4452 	committed = 1;
4453 
4454 	/* write watermarks */
4455 	if (meta_sp_update_wm(sp, msp, extlist, ep) < 0) {
4456 		rval = -1;
4457 		goto out;
4458 	}
4459 
4460 	/*
4461 	 * Allow mirror ownership to change. If we don't succeed in this
4462 	 * ioctl it isn't fatal, but the cluster will probably hang fairly
4463 	 * soon as the mirror owner won't change. However, we have
4464 	 * successfully written the watermarks out to the device so the
4465 	 * softpart creation has succeeded
4466 	 */
4467 	if (ownpar) {
4468 		(void) meta_mn_change_owner(&ownpar, sp->setno, ownpar->d.mnum,
4469 		    ownpar->d.owner,
4470 		    MD_MN_MM_ALLOW_CHANGE | MD_MN_MM_SPAWN_THREAD);
4471 	}
4472 
4473 	/* second phase of commit, set status to MD_SP_OK */
4474 	if (meta_sp_setstatus(sp, &(MD_SID(mp)), 1, MD_SP_OK, ep) < 0) {
4475 		rval = -1;
4476 		goto out;
4477 	}
4478 	rval = 0;
4479 out:
4480 	Free(mp);
4481 	if (ownpar)
4482 		Free(ownpar);
4483 
4484 	if (extlist != NULL)
4485 		meta_sp_list_free(&extlist);
4486 
4487 	if (rval != 0 && keynlp != NULL && committed != 1)
4488 		(void) del_key_names(sp, keynlp, NULL);
4489 
4490 	metafreenamelist(keynlp);
4491 
4492 	return (rval);
4493 }
4494 
4495 /*
4496  * **************************************************************************
4497  *                      Reset (metaclear) Functions                         *
4498  * **************************************************************************
4499  */
4500 
4501 /*
4502  * FUNCTION:	meta_sp_reset_common()
4503  * INPUT:	sp	- the set name of the device to reset
4504  *		np	- the name of the device to reset
4505  *		msp	- the unit structure to reset
4506  *		options	- metaclear options
4507  * OUTPUT:	ep	- return error pointer
4508  * RETURNS:	int	-  0 success, -1 error
4509  * PURPOSE:	"resets", or more accurately deletes, the soft partition
4510  *		specified.  First the state is set to "deleting" and then the
4511  *		watermarks are all cleared out.  Once the watermarks have been
4512  *		updated, the unit structure is deleted from the metadb.
4513  */
4514 static int
4515 meta_sp_reset_common(
4516 	mdsetname_t	*sp,
4517 	mdname_t	*np,
4518 	md_sp_t		*msp,
4519 	md_sp_reset_t	reset_params,
4520 	mdcmdopts_t	options,
4521 	md_error_t	*ep
4522 )
4523 {
4524 	char	*miscname;
4525 	int	rval = -1;
4526 	int	is_open = 0;
4527 
4528 	/* make sure that nobody owns us */
4529 	if (MD_HAS_PARENT(msp->common.parent))
4530 		return (mdmderror(ep, MDE_IN_USE, meta_getminor(np->dev),
4531 		    np->cname));
4532 
4533 	/* make sure that the soft partition isn't open */
4534 	if ((is_open = meta_isopen(sp, np, ep, options)) < 0)
4535 		return (-1);
4536 	else if (is_open)
4537 		return (mdmderror(ep, MDE_IS_OPEN, meta_getminor(np->dev),
4538 		    np->cname));
4539 
4540 	/* get miscname */
4541 	if ((miscname = metagetmiscname(np, ep)) == NULL)
4542 		return (-1);
4543 
4544 	/* fill in reset params */
4545 	MD_SETDRIVERNAME(&reset_params, miscname, sp->setno);
4546 	reset_params.mnum = meta_getminor(np->dev);
4547 	reset_params.force = (options & MDCMD_FORCE) ? 1 : 0;
4548 
4549 	/*
4550 	 * clear soft partition - phase one.
4551 	 * place the soft partition into the "delete pending" state.
4552 	 */
4553 	if (meta_sp_setstatus(sp, &reset_params.mnum, 1, MD_SP_DELPEND, ep) < 0)
4554 		return (-1);
4555 
4556 	/*
4557 	 * Now clear the watermarks.  If the force flag is specified,
4558 	 * ignore any errors writing the watermarks and delete the unit
4559 	 * structure anyway.  An error may leave the on-disk format in a
4560 	 * corrupt state.  If force is not specified and we fail here,
4561 	 * the soft partition will remain in the "delete pending" state.
4562 	 */
4563 	if ((meta_sp_clear_wm(sp, msp, ep) < 0) &&
4564 	    ((options & MDCMD_FORCE) == 0))
4565 		goto out;
4566 
4567 	/*
4568 	 * clear soft partition - phase two.
4569 	 * the driver removes the soft partition from the metadb and
4570 	 * zeros out incore version.
4571 	 */
4572 	if (metaioctl(MD_IOCRESET, &reset_params,
4573 	    &reset_params.mde, np->cname) != 0) {
4574 		(void) mdstealerror(ep, &reset_params.mde);
4575 		goto out;
4576 	}
4577 
4578 	/*
4579 	 * Wait for the /dev to be cleaned up. Ignore the return
4580 	 * value since there's not much we can do.
4581 	 */
4582 	(void) meta_update_devtree(meta_getminor(np->dev));
4583 
4584 	rval = 0;	/* success */
4585 
4586 	if (options & MDCMD_PRINT) {
4587 		(void) printf(dgettext(TEXT_DOMAIN,
4588 		    "%s: Soft Partition is cleared\n"),
4589 		    np->cname);
4590 		(void) fflush(stdout);
4591 	}
4592 
4593 	/*
4594 	 * if told to recurse and on a metadevice, then attempt to
4595 	 * clear the subdevices.  Indicate failure if the clear fails.
4596 	 */
4597 	if ((options & MDCMD_RECURSE) &&
4598 	    (metaismeta(msp->compnamep)) &&
4599 	    (meta_reset_by_name(sp, msp->compnamep, options, ep) != 0))
4600 		rval = -1;
4601 
4602 out:
4603 	meta_invalidate_name(np);
4604 	return (rval);
4605 }
4606 
4607 /*
4608  * FUNCTION:	meta_sp_reset()
4609  * INPUT:	sp	- the set name of the device to reset
4610  *		np	- the name of the device to reset
4611  *		options	- metaclear options
4612  * OUTPUT:	ep	- return error pointer
4613  * RETURNS:	int	-  0 success, -1 error
4614  * PURPOSE:	provides the entry point to the rest of libmeta for deleting a
4615  *		soft partition.  If np is NULL, then soft partitions are
4616  *		all deleted at the current level and then recursively deleted.
4617  *		Otherwise, if a name is specified either directly or as a
4618  *		result of a recursive operation, it deletes only that name.
4619  *		Since something sitting under a soft partition may be parented
4620  *		to it, we have to reparent that other device to another soft
4621  *		partition on the same component if we're deleting the one it's
4622  *		parented to.
4623  */
4624 int
4625 meta_sp_reset(
4626 	mdsetname_t	*sp,
4627 	mdname_t	*np,
4628 	mdcmdopts_t	options,
4629 	md_error_t	*ep
4630 )
4631 {
4632 	md_sp_t		*msp;
4633 	int		rval = -1;
4634 	mdnamelist_t	*spnlp = NULL, *nlp = NULL;
4635 	md_sp_reset_t	reset_params;
4636 	int		num_sp;
4637 
4638 	assert(sp != NULL);
4639 
4640 	/* reset/delete all soft paritions */
4641 	if (np == NULL) {
4642 		/*
4643 		 * meta_reset_all sets MDCMD_RECURSE, but this behavior
4644 		 * is incorrect for soft partitions.  We want to clear
4645 		 * all soft partitions at a particular level in the
4646 		 * metadevice stack before moving to the next level.
4647 		 * Thus, we clear MDCMD_RECURSE from the options.
4648 		 */
4649 		options &= ~MDCMD_RECURSE;
4650 
4651 		/* for each soft partition */
4652 		rval = 0;
4653 		if (meta_get_sp_names(sp, &spnlp, 0, ep) < 0)
4654 			rval = -1;
4655 
4656 		for (nlp = spnlp; (nlp != NULL); nlp = nlp->next) {
4657 			np = nlp->namep;
4658 			if ((msp = meta_get_sp(sp, np, ep)) == NULL) {
4659 				rval = -1;
4660 				break;
4661 			}
4662 			/*
4663 			 * meta_reset_all calls us twice to get soft
4664 			 * partitions at the top and bottom of the stack.
4665 			 * thus, if we have a parent, we'll get deleted
4666 			 * on the next call.
4667 			 */
4668 			if (MD_HAS_PARENT(msp->common.parent))
4669 				continue;
4670 			/*
4671 			 * If this is a multi-node set, we send a series
4672 			 * of individual metaclear commands.
4673 			 */
4674 			if (meta_is_mn_set(sp, ep)) {
4675 				if (meta_mn_send_metaclear_command(sp,
4676 				    np->cname, options, 0, ep) != 0) {
4677 					rval = -1;
4678 					break;
4679 				}
4680 			} else {
4681 				if (meta_sp_reset(sp, np, options, ep) != 0) {
4682 					rval = -1;
4683 					break;
4684 				}
4685 			}
4686 		}
4687 		/* cleanup return status */
4688 		metafreenamelist(spnlp);
4689 		return (rval);
4690 	}
4691 
4692 	/* check the name */
4693 	if (metachkmeta(np, ep) != 0)
4694 		return (-1);
4695 
4696 	/* get the unit structure */
4697 	if ((msp = meta_get_sp(sp, np, ep)) == NULL)
4698 		return (-1);
4699 
4700 	/* clear out reset parameters */
4701 	(void) memset(&reset_params, 0, sizeof (reset_params));
4702 
4703 	/* if our child is a metadevice, we need to deparent/reparent it */
4704 	if (metaismeta(msp->compnamep)) {
4705 		/* get sp's on this component */
4706 		if ((num_sp = meta_sp_get_by_component(sp, msp->compnamep,
4707 		    &spnlp, 1, ep)) <= 0)
4708 			/* no sp's on this device.  error! */
4709 			return (-1);
4710 		else if (num_sp == 1)
4711 			/* last sp on this device, so we deparent */
4712 			reset_params.new_parent = MD_NO_PARENT;
4713 		else {
4714 			/* have to reparent this metadevice */
4715 			for (nlp = spnlp; nlp != NULL; nlp = nlp->next) {
4716 				if (meta_getminor(nlp->namep->dev) ==
4717 				    meta_getminor(np->dev))
4718 					continue;
4719 				/*
4720 				 * this isn't the softpart we are deleting,
4721 				 * so use this device as the new parent.
4722 				 */
4723 				reset_params.new_parent =
4724 				    meta_getminor(nlp->namep->dev);
4725 				break;
4726 			}
4727 		}
4728 		metafreenamelist(spnlp);
4729 	}
4730 
4731 	if (meta_sp_reset_common(sp, np, msp, reset_params, options, ep) != 0)
4732 		return (-1);
4733 
4734 	return (0);
4735 }
4736 
4737 /*
4738  * FUNCTION:	meta_sp_reset_component()
4739  * INPUT:	sp	- the set name of the device to reset
4740  *		name	- the string name of the device to reset
4741  *		options	- metaclear options
4742  * OUTPUT:	ep	- return error pointer
4743  * RETURNS:	int	-  0 success, -1 error
4744  * PURPOSE:	provides the ability to delete all soft partitions on a
4745  *		specified device (metaclear -p).  It first gets all of the
4746  *		soft partitions on the component and then deletes each one
4747  *		individually.
4748  */
4749 int
4750 meta_sp_reset_component(
4751 	mdsetname_t	*sp,
4752 	char		*name,
4753 	mdcmdopts_t	options,
4754 	md_error_t	*ep
4755 )
4756 {
4757 	mdname_t	*compnp, *np;
4758 	mdnamelist_t	*spnlp = NULL;
4759 	mdnamelist_t	*nlp = NULL;
4760 	md_sp_t		*msp;
4761 	int		count;
4762 	md_sp_reset_t	reset_params;
4763 
4764 	if ((compnp = metaname(&sp, name, UNKNOWN, ep)) == NULL)
4765 		return (-1);
4766 
4767 	/* If we're starting out with no soft partitions, it's an error */
4768 	count = meta_sp_get_by_component(sp, compnp, &spnlp, 1, ep);
4769 	if (count == 0)
4770 		return (mdmderror(ep, MDE_SP_NOSP, 0, compnp->cname));
4771 	else if (count < 0)
4772 		return (-1);
4773 
4774 	/*
4775 	 * clear all soft partitions on this component.
4776 	 * NOTE: we reparent underlying metadevices as we go so that
4777 	 * things stay sane.  Also, if we encounter an error, we stop
4778 	 * and go no further in case recovery might be needed.
4779 	 */
4780 	for (nlp = spnlp; nlp != NULL; nlp = nlp->next) {
4781 		/* clear out reset parameters */
4782 		(void) memset(&reset_params, 0, sizeof (reset_params));
4783 
4784 		/* check the name */
4785 		np = nlp->namep;
4786 
4787 		if (metachkmeta(np, ep) != 0) {
4788 			metafreenamelist(spnlp);
4789 			return (-1);
4790 		}
4791 
4792 		/* get the unit structure */
4793 		if ((msp = meta_get_sp(sp, np, ep)) == NULL) {
4794 			metafreenamelist(spnlp);
4795 			return (-1);
4796 		}
4797 
4798 		/* have to deparent/reparent metadevices */
4799 		if (metaismeta(compnp)) {
4800 			if (nlp->next == NULL)
4801 				reset_params.new_parent = MD_NO_PARENT;
4802 			else
4803 				reset_params.new_parent =
4804 				    meta_getminor(spnlp->next->namep->dev);
4805 		}
4806 
4807 		/* clear soft partition */
4808 		if (meta_sp_reset_common(sp, np, msp, reset_params,
4809 		    options, ep) < 0) {
4810 			metafreenamelist(spnlp);
4811 			return (-1);
4812 		}
4813 	}
4814 	metafreenamelist(spnlp);
4815 	return (0);
4816 }
4817 
4818 /*
4819  * **************************************************************************
4820  *                      Grow (metattach) Functions                          *
4821  * **************************************************************************
4822  */
4823 
4824 /*
4825  * FUNCTION:	meta_sp_attach()
4826  * INPUT:	sp	- the set name of the device to attach to
4827  *		np	- the name of the device to attach to
4828  *		addsize	- the unparsed string holding the amount of space to add
4829  *		options	- metattach options
4830  *		alignment - data alignment
4831  * OUTPUT:	ep	- return error pointer
4832  * RETURNS:	int	-  0 success, -1 error
4833  * PURPOSE:	grows a soft partition by reading in the existing unit
4834  *		structure and setting its state to Growing, allocating more
4835  *		space (similar to meta_create_sp()), updating the watermarks,
4836  *		and then writing out the new unit structure in the Okay state.
4837  */
4838 int
4839 meta_sp_attach(
4840 	mdsetname_t	*sp,
4841 	mdname_t	*np,
4842 	char		*addsize,
4843 	mdcmdopts_t	options,
4844 	sp_ext_length_t	alignment,
4845 	md_error_t	*ep
4846 )
4847 {
4848 	md_grow_params_t	grow_params;
4849 	sp_ext_length_t		grow_len;	/* amount to grow */
4850 	mp_unit_t		*mp, *new_un;
4851 	mdname_t		*compnp = NULL;
4852 
4853 	sp_ext_node_t		*extlist = NULL;
4854 	int			numexts;
4855 	mdnamelist_t		*spnlp = NULL;
4856 	int			count;
4857 	md_sp_t			*msp;
4858 	daddr_t			start_block;
4859 
4860 	/* should have the same set */
4861 	assert(sp != NULL);
4862 	assert(sp->setno == MD_MIN2SET(meta_getminor(np->dev)));
4863 
4864 	/* check name */
4865 	if (metachkmeta(np, ep) != 0)
4866 		return (-1);
4867 
4868 	if (meta_sp_parsesize(addsize, &grow_len) == -1) {
4869 		return (mdmderror(ep, MDE_SP_BAD_LENGTH, 0, np->cname));
4870 	}
4871 
4872 	if ((mp = (mp_unit_t *)meta_get_mdunit(sp, np, ep)) == NULL)
4873 		return (-1);
4874 
4875 	/* make sure we don't have a parent */
4876 	if (MD_HAS_PARENT(mp->c.un_parent)) {
4877 		Free(mp);
4878 		return (mdmderror(ep, MDE_INVAL_UNIT, 0, np->cname));
4879 	}
4880 
4881 	if (getenv(META_SP_DEBUG)) {
4882 		meta_sp_debug("meta_sp_attach: Unit structure before new "
4883 		    "space:\n");
4884 		meta_sp_printunit(mp);
4885 	}
4886 
4887 	/*
4888 	 * NOTE: the fast option to metakeyname is 0 as opposed to 1
4889 	 * If this was not the case we would suffer the following
4890 	 * assertion failure:
4891 	 * Assertion failed: type1 != MDT_FAST_META && type1 != MDT_FAST_COMP
4892 	 * file meta_check.x, line 315
4893 	 * I guess this is because we have not "seen" this drive before
4894 	 * and hence hit the failure - this is of course the attach routine
4895 	 */
4896 	if ((compnp = metakeyname(&sp, mp->un_key, 0, ep)) == NULL) {
4897 		Free(mp);
4898 		return (-1);
4899 	}
4900 
4901 	/* metakeyname does not fill in the key. */
4902 	compnp->key = mp->un_key;
4903 
4904 	/* work out the space on the component that we are dealing with */
4905 	count = meta_sp_get_by_component(sp, compnp, &spnlp, 0, ep);
4906 
4907 	/*
4908 	 * see if the component has been soft partitioned yet, or if an
4909 	 * error occurred.
4910 	 */
4911 	if (count == 0) {
4912 		Free(mp);
4913 		return (mdmderror(ep, MDE_NOT_SP, 0, np->cname));
4914 	} else if (count < 0) {
4915 		Free(mp);
4916 		return (-1);
4917 	}
4918 
4919 	/*
4920 	 * seed extlist with reserved space at the beginning of the volume and
4921 	 * enough space for the end watermark.  The end watermark always gets
4922 	 * updated, but if the underlying device changes size it may not be
4923 	 * pointed to until the extent before it is updated.  Since the
4924 	 * end of the reserved space is where the first watermark starts,
4925 	 * the reserved extent should never be marked for updating.
4926 	 */
4927 	if ((start_block = meta_sp_get_start(sp, compnp, ep)) ==
4928 	    MD_DISKADDR_ERROR) {
4929 		Free(mp);
4930 		return (-1);
4931 	}
4932 
4933 	meta_sp_list_insert(NULL, NULL, &extlist, 0ULL, start_block,
4934 	    EXTTYP_RESERVED, 0, 0, meta_sp_cmp_by_offset);
4935 	meta_sp_list_insert(NULL, NULL, &extlist,
4936 	    metagetsize(compnp, ep) - MD_SP_WMSIZE, MD_SP_WMSIZE,
4937 	    EXTTYP_END, 0, EXTFLG_UPDATE, meta_sp_cmp_by_offset);
4938 
4939 	if (meta_sp_extlist_from_namelist(sp, spnlp, &extlist, ep) == -1) {
4940 		Free(mp);
4941 		return (-1);
4942 	}
4943 
4944 	metafreenamelist(spnlp);
4945 
4946 	if (getenv(META_SP_DEBUG)) {
4947 		meta_sp_debug("meta_sp_attach: list of used extents:\n");
4948 		meta_sp_list_dump(extlist);
4949 	}
4950 
4951 	meta_sp_list_freefill(&extlist, metagetsize(compnp, ep));
4952 
4953 	assert(mp->un_numexts >= 1);
4954 	numexts = meta_sp_alloc_by_len(sp, np, &extlist, &grow_len,
4955 	    mp->un_ext[mp->un_numexts - 1].un_poff,
4956 	    (alignment > 0) ? alignment :
4957 	    meta_sp_get_default_alignment(sp, compnp, ep));
4958 
4959 	if (numexts == -1) {
4960 		Free(mp);
4961 		return (mdmderror(ep, MDE_SP_NOSPACE, 0, np->cname));
4962 	}
4963 
4964 	/* allocate new unit structure and copy in old unit */
4965 	if ((new_un = meta_sp_updateunit(np, mp, extlist,
4966 	    grow_len, numexts, ep)) == NULL) {
4967 		Free(mp);
4968 		return (-1);
4969 	}
4970 	Free(mp);
4971 
4972 	/* If running in dryrun mode (-n option), we're done here */
4973 	if ((options & MDCMD_DOIT) == 0) {
4974 		if (options & MDCMD_PRINT) {
4975 			(void) printf(dgettext(TEXT_DOMAIN,
4976 			    "%s: Soft Partition would grow\n"),
4977 			    np->cname);
4978 			(void) fflush(stdout);
4979 		}
4980 		return (0);
4981 	}
4982 
4983 	if (getenv(META_SP_DEBUG)) {
4984 		meta_sp_debug("meta_sp_attach: updated unit structure:\n");
4985 		meta_sp_printunit(new_un);
4986 	}
4987 
4988 	assert(new_un != NULL);
4989 
4990 	(void) memset(&grow_params, 0, sizeof (grow_params));
4991 	if (new_un->c.un_total_blocks > MD_MAX_BLKS_FOR_SMALL_DEVS) {
4992 		grow_params.options = MD_CRO_64BIT;
4993 		new_un->c.un_revision |= MD_64BIT_META_DEV;
4994 	} else {
4995 		grow_params.options = MD_CRO_32BIT;
4996 		new_un->c.un_revision &= ~MD_64BIT_META_DEV;
4997 	}
4998 	grow_params.mnum = MD_SID(new_un);
4999 	grow_params.size = new_un->c.un_size;
5000 	grow_params.mdp = (uintptr_t)new_un;
5001 	MD_SETDRIVERNAME(&grow_params, MD_SP, MD_MIN2SET(grow_params.mnum));
5002 
5003 	if (metaioctl(MD_IOCGROW, &grow_params, &grow_params.mde,
5004 	    np->cname) != 0) {
5005 		(void) mdstealerror(ep, &grow_params.mde);
5006 		return (-1);
5007 	}
5008 
5009 	/* update all watermarks */
5010 
5011 	if ((msp = meta_get_sp(sp, np, ep)) == NULL)
5012 		return (-1);
5013 	if (meta_sp_update_wm(sp, msp, extlist, ep) < 0)
5014 		return (-1);
5015 
5016 
5017 	/* second phase of commit, set status to MD_SP_OK */
5018 	if (meta_sp_setstatus(sp, &(MD_SID(new_un)), 1, MD_SP_OK, ep) < 0)
5019 		return (-1);
5020 
5021 	meta_invalidate_name(np);
5022 
5023 	if (options & MDCMD_PRINT) {
5024 		(void) printf(dgettext(TEXT_DOMAIN,
5025 		    "%s: Soft Partition has been grown\n"),
5026 		    np->cname);
5027 		(void) fflush(stdout);
5028 	}
5029 
5030 	return (0);
5031 }
5032 
5033 /*
5034  * **************************************************************************
5035  *                    Recovery (metarecover) Functions                      *
5036  * **************************************************************************
5037  */
5038 
5039 /*
5040  * FUNCTION:	meta_recover_sp()
5041  * INPUT:	sp	- the name of the set we are recovering on
5042  *		compnp	- name pointer for device we are recovering on
5043  *		argc	- argument count
5044  *		argv	- left over arguments not parsed by metarecover command
5045  *		options	- metarecover options
5046  * OUTPUT:	ep	- return error pointer
5047  * RETURNS:	int	- 0 - success, -1 - error
5048  * PURPOSE:	parse soft partitioning-specific metarecover options and
5049  *		dispatch to the appropriate function to handle recovery.
5050  */
5051 int
5052 meta_recover_sp(
5053 	mdsetname_t	*sp,
5054 	mdname_t	*compnp,
5055 	int		argc,
5056 	char		*argv[],
5057 	mdcmdopts_t	options,
5058 	md_error_t	*ep
5059 )
5060 {
5061 	md_set_desc	*sd;
5062 
5063 	if (argc > 1) {
5064 		(void) meta_cook_syntax(ep, MDE_SYNTAX, compnp->cname,
5065 		    argc, argv);
5066 		return (-1);
5067 	}
5068 
5069 	/*
5070 	 * For a MN set, this operation must be performed on the master
5071 	 * as it is responsible for maintaining the watermarks
5072 	 */
5073 	if (!metaislocalset(sp)) {
5074 		if ((sd = metaget_setdesc(sp, ep)) == NULL)
5075 			return (-1);
5076 		if (MD_MNSET_DESC(sd) && !sd->sd_mn_am_i_master) {
5077 			(void) mddserror(ep, MDE_DS_MASTER_ONLY, sp->setno,
5078 			    sd->sd_mn_master_nodenm, NULL, NULL);
5079 			return (-1);
5080 		}
5081 	}
5082 	if (argc == 0) {
5083 		/*
5084 		 * if no additional arguments are passed, metarecover should
5085 		 * validate both on-disk and metadb structures as well as
5086 		 * checking that both are consistent with each other
5087 		 */
5088 		if (meta_sp_validate_wm(sp, compnp, options, ep) < 0)
5089 			return (-1);
5090 		if (meta_sp_validate_unit(sp, compnp, options, ep) < 0)
5091 			return (-1);
5092 		if (meta_sp_validate_wm_and_unit(sp, compnp, options, ep) < 0)
5093 			return (-1);
5094 	} else if (strcmp(argv[0], "-d") == 0) {
5095 		/*
5096 		 * Ensure that there is no existing valid record for this
5097 		 * soft-partition. If there is we have nothing to do.
5098 		 */
5099 		if (meta_sp_validate_unit(sp, compnp, options, ep) == 0)
5100 			return (-1);
5101 		/* validate and recover from on-disk structures */
5102 		if (meta_sp_validate_wm(sp, compnp, options, ep) < 0)
5103 			return (-1);
5104 		if (meta_sp_recover_from_wm(sp, compnp, options, ep) < 0)
5105 			return (-1);
5106 	} else if (strcmp(argv[0], "-m") == 0) {
5107 		/* validate and recover from metadb structures */
5108 		if (meta_sp_validate_unit(sp, compnp, options, ep) < 0)
5109 			return (-1);
5110 		if (meta_sp_recover_from_unit(sp, compnp, options, ep) < 0)
5111 			return (-1);
5112 	} else {
5113 		/* syntax error */
5114 		(void) meta_cook_syntax(ep, MDE_SYNTAX, compnp->cname,
5115 		    argc, argv);
5116 		return (-1);
5117 	}
5118 
5119 	return (0);
5120 }
5121 
5122 /*
5123  * FUNCTION:	meta_sp_display_exthdr()
5124  * INPUT:	none
5125  * OUTPUT:	none
5126  * RETURNS:	void
5127  * PURPOSE:	print header line for sp_ext_node_t information.  to be used
5128  *		in conjunction with meta_sp_display_ext().
5129  */
5130 static void
5131 meta_sp_display_exthdr(void)
5132 {
5133 	(void) printf("%20s %5s %7s %20s %20s\n",
5134 	    dgettext(TEXT_DOMAIN, "Name"),
5135 	    dgettext(TEXT_DOMAIN, "Seq#"),
5136 	    dgettext(TEXT_DOMAIN, "Type"),
5137 	    dgettext(TEXT_DOMAIN, "Offset"),
5138 	    dgettext(TEXT_DOMAIN, "Length"));
5139 }
5140 
5141 
5142 /*
5143  * FUNCTION:	meta_sp_display_ext()
5144  * INPUT:	ext	- extent to display
5145  * OUTPUT:	none
5146  * RETURNS:	void
5147  * PURPOSE:	print selected fields from sp_ext_node_t.
5148  */
5149 static void
5150 meta_sp_display_ext(sp_ext_node_t *ext)
5151 {
5152 	/* print extent information */
5153 	if (ext->ext_namep != NULL)
5154 		(void) printf("%20s ", ext->ext_namep->cname);
5155 	else
5156 		(void) printf("%20s ", "NONE");
5157 
5158 	(void) printf("%5u ", ext->ext_seq);
5159 
5160 	switch (ext->ext_type) {
5161 	case EXTTYP_ALLOC:
5162 		(void) printf("%7s ", "ALLOC");
5163 		break;
5164 	case EXTTYP_FREE:
5165 		(void) printf("%7s ", "FREE");
5166 		break;
5167 	case EXTTYP_RESERVED:
5168 		(void) printf("%7s ", "RESV");
5169 		break;
5170 	case EXTTYP_END:
5171 		(void) printf("%7s ", "END");
5172 		break;
5173 	default:
5174 		(void) printf("%7s ", "INVLD");
5175 		break;
5176 	}
5177 
5178 	(void) printf("%20llu %20llu\n", ext->ext_offset, ext->ext_length);
5179 }
5180 
5181 
5182 /*
5183  * FUNCTION:	meta_sp_checkseq()
5184  * INPUT:	extlist	- list of extents to be checked
5185  * OUTPUT:	none
5186  * RETURNS:	int	- 0 - success, -1 - error
5187  * PURPOSE:	check soft partition sequence numbers.  this function assumes
5188  *		that a list of extents representing 1 or more soft partitions
5189  *		is passed in sorted in sequence number order.  within a
5190  *		single soft partition, there may not be any missing or
5191  *		duplicate sequence numbers.
5192  */
5193 static int
5194 meta_sp_checkseq(sp_ext_node_t *extlist)
5195 {
5196 	sp_ext_node_t *ext;
5197 
5198 	assert(extlist != NULL);
5199 
5200 	for (ext = extlist;
5201 	    ext->ext_next != NULL && ext->ext_next->ext_type == EXTTYP_ALLOC;
5202 	    ext = ext->ext_next) {
5203 		if (ext->ext_next->ext_namep != NULL &&
5204 		    strcmp(ext->ext_next->ext_namep->cname,
5205 		    ext->ext_namep->cname) != 0)
5206 				continue;
5207 
5208 		if (ext->ext_next->ext_seq != ext->ext_seq + 1) {
5209 			(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5210 			    "%s: sequence numbers are "
5211 			    "incorrect: %d should be %d\n"),
5212 			    ext->ext_next->ext_namep->cname,
5213 			    ext->ext_next->ext_seq, ext->ext_seq + 1);
5214 			return (-1);
5215 		}
5216 	}
5217 	return (0);
5218 }
5219 
5220 
5221 /*
5222  * FUNCTION:	meta_sp_resolve_name_conflict()
5223  * INPUT:	sp	- name of set we're are recovering in.
5224  *		old_np	- name pointer of soft partition we found on disk.
5225  * OUTPUT:	new_np	- name pointer for new soft partition name.
5226  *		ep	- error pointer returned.
5227  * RETURNS:	int	- 0 - name not replace, 1 - name replaced, -1 - error
5228  * PURPOSE:	Check to see if the name of one of the soft partitions we found
5229  *		on disk already exists in the metadb.  If so, prompt for a new
5230  *		name.  In addition, we keep a static array of names that
5231  *		will be recovered from this device since these names don't
5232  *		exist in the configuration at this point but cannot be
5233  *		recovered more than once.
5234  */
5235 static int
5236 meta_sp_resolve_name_conflict(
5237 	mdsetname_t	*sp,
5238 	mdname_t	*old_np,
5239 	mdname_t	**new_np,
5240 	md_error_t	*ep
5241 )
5242 {
5243 	char		yesno[255];
5244 	char		*yes;
5245 	char		newname[MD_SP_MAX_DEVNAME_PLUS_1];
5246 	int		nunits;
5247 	static int	*used_names = NULL;
5248 
5249 	assert(old_np != NULL);
5250 
5251 	if (used_names == NULL) {
5252 		if ((nunits = meta_get_nunits(ep)) < 0)
5253 			return (-1);
5254 		used_names = Zalloc(nunits * sizeof (int));
5255 	}
5256 
5257 	/* see if it exists already */
5258 	if (used_names[MD_MIN2UNIT(meta_getminor(old_np->dev))] == 0 &&
5259 	    metagetmiscname(old_np, ep) == NULL) {
5260 		if (! mdismderror(ep, MDE_UNIT_NOT_SETUP))
5261 			return (-1);
5262 		else {
5263 			used_names[MD_MIN2UNIT(meta_getminor(old_np->dev))] = 1;
5264 			mdclrerror(ep);
5265 			return (0);
5266 		}
5267 	}
5268 
5269 	/* name exists, ask the user for a new one */
5270 	(void) printf(dgettext(TEXT_DOMAIN,
5271 	    "WARNING: A soft partition named %s was found in the extent\n"
5272 	    "headers, but this name already exists in the metadb "
5273 	    "configuration.\n"
5274 	    "In order to continue recovery you must supply\n"
5275 	    "a new name for this soft partition.\n"), old_np->cname);
5276 	(void) printf(dgettext(TEXT_DOMAIN,
5277 	    "Would you like to continue and supply a new name? (yes/no) "));
5278 
5279 	(void) fflush(stdout);
5280 	if ((fgets(yesno, sizeof (yesno), stdin) == NULL) ||
5281 	    (strlen(yesno) == 1))
5282 		(void) snprintf(yesno, sizeof (yesno), "%s\n",
5283 		    dgettext(TEXT_DOMAIN, "no"));
5284 	yes = dgettext(TEXT_DOMAIN, "yes");
5285 	if (strncasecmp(yesno, yes, strlen(yesno) - 1) != 0) {
5286 		return (-1);
5287 	}
5288 
5289 	(void) fflush(stdin);
5290 
5291 	/* get the new name */
5292 	for (;;) {
5293 		(void) printf(dgettext(TEXT_DOMAIN, "Please enter a new name "
5294 		    "for this soft partition (dXXXX) "));
5295 		(void) fflush(stdout);
5296 		if (fgets(newname, MD_SP_MAX_DEVNAME_PLUS_1, stdin) == NULL)
5297 			(void) strcpy(newname, "");
5298 
5299 		/* remove newline character */
5300 		if (newname[strlen(newname) - 1] == '\n')
5301 			newname[strlen(newname) - 1] = '\0';
5302 
5303 		if (!(is_metaname(newname)) ||
5304 		    (meta_init_make_device(&sp, newname, ep) <= 0)) {
5305 			(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5306 			    "Invalid metadevice name\n"));
5307 			(void) fflush(stderr);
5308 			continue;
5309 		}
5310 
5311 		if ((*new_np = metaname(&sp, newname,
5312 		    META_DEVICE, ep)) == NULL) {
5313 			(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5314 			    "Invalid metadevice name\n"));
5315 			(void) fflush(stderr);
5316 			continue;
5317 		}
5318 
5319 		assert(MD_MIN2UNIT(meta_getminor((*new_np)->dev)) < nunits);
5320 		/* make sure the name isn't already being used */
5321 		if (used_names[MD_MIN2UNIT(meta_getminor((*new_np)->dev))] ||
5322 		    metagetmiscname(*new_np, ep) != NULL) {
5323 			(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5324 			    "That name already exists\n"));
5325 			continue;
5326 		} else if (! mdismderror(ep, MDE_UNIT_NOT_SETUP))
5327 			return (-1);
5328 
5329 		break;
5330 	}
5331 
5332 	/* got a new name, place in used array and return */
5333 	used_names[MD_MIN2UNIT(meta_getminor((*new_np)->dev))] = 1;
5334 	mdclrerror(ep);
5335 	return (1);
5336 }
5337 
5338 /*
5339  * FUNCTION:	meta_sp_validate_wm()
5340  * INPUT:	sp	- set name we are recovering in
5341  *		compnp	- name pointer for device we are recovering from
5342  *		options	- metarecover options
5343  * OUTPUT:	ep	- error pointer returned
5344  * RETURNS:	int	- 0 - success, -1 - error
5345  * PURPOSE:	validate and display watermark configuration.  walk the
5346  *		on-disk watermark structures and validate the information
5347  *		found within.  since a watermark configuration is
5348  *		"self-defining", the act of traversing the watermarks
5349  *		is part of the validation process.
5350  */
5351 static int
5352 meta_sp_validate_wm(
5353 	mdsetname_t	*sp,
5354 	mdname_t	*compnp,
5355 	mdcmdopts_t	options,
5356 	md_error_t	*ep
5357 )
5358 {
5359 	sp_ext_node_t	*extlist = NULL;
5360 	sp_ext_node_t	*ext;
5361 	int		num_sps = 0;
5362 	int		rval;
5363 
5364 	if ((options & MDCMD_VERBOSE) != 0)
5365 		(void) printf(dgettext(TEXT_DOMAIN,
5366 		    "Verifying on-disk structures on %s.\n"),
5367 		    compnp->cname);
5368 
5369 	/*
5370 	 * for each watermark, build an ext_node, place on list.
5371 	 */
5372 	rval = meta_sp_extlist_from_wm(sp, compnp, &extlist,
5373 	    meta_sp_cmp_by_nameseq, ep);
5374 
5375 	if ((options & MDCMD_VERBOSE) != 0) {
5376 		/* print out what we found */
5377 		if (extlist == NULL)
5378 			(void) printf(dgettext(TEXT_DOMAIN,
5379 			    "No extent headers found on %s.\n"),
5380 			    compnp->cname);
5381 		else {
5382 			(void) printf(dgettext(TEXT_DOMAIN,
5383 			    "The following extent headers were found on %s.\n"),
5384 			    compnp->cname);
5385 			meta_sp_display_exthdr();
5386 		}
5387 		for (ext = extlist; ext != NULL; ext = ext->ext_next)
5388 			meta_sp_display_ext(ext);
5389 	}
5390 
5391 	if (rval < 0) {
5392 		(void) printf(dgettext(TEXT_DOMAIN,
5393 		    "%s: On-disk structures invalid or "
5394 		    "no soft partitions found.\n"),
5395 		    compnp->cname);
5396 		return (-1);
5397 	}
5398 
5399 	assert(extlist != NULL);
5400 
5401 	/* count number of soft partitions */
5402 	for (ext = extlist;
5403 	    ext != NULL && ext->ext_type == EXTTYP_ALLOC;
5404 	    ext = ext->ext_next) {
5405 		if (ext->ext_next != NULL &&
5406 		    ext->ext_next->ext_namep != NULL &&
5407 		    strcmp(ext->ext_next->ext_namep->cname,
5408 		    ext->ext_namep->cname) == 0)
5409 				continue;
5410 		num_sps++;
5411 	}
5412 
5413 	if ((options & MDCMD_VERBOSE) != 0)
5414 		(void) printf(dgettext(TEXT_DOMAIN,
5415 		    "Found %d soft partition(s) on %s.\n"), num_sps,
5416 		    compnp->cname);
5417 
5418 	if (num_sps == 0) {
5419 		(void) printf(dgettext(TEXT_DOMAIN,
5420 		    "%s: No soft partitions.\n"), compnp->cname);
5421 		return (mdmderror(ep, MDE_RECOVER_FAILED, 0, compnp->cname));
5422 	}
5423 
5424 	/* check sequence numbers */
5425 	if ((options & MDCMD_VERBOSE) != 0)
5426 		(void) printf(dgettext(TEXT_DOMAIN,
5427 		    "Checking sequence numbers.\n"));
5428 
5429 	if (meta_sp_checkseq(extlist) != 0)
5430 		return (mdmderror(ep, MDE_RECOVER_FAILED, 0, compnp->cname));
5431 
5432 	return (0);
5433 }
5434 
5435 /*
5436  * FUNCTION:	meta_sp_validate_unit()
5437  * INPUT:	sp	- name of set we are recovering in
5438  *		compnp	- name of component we are recovering from
5439  *		options	- metarecover options
5440  * OUTPUT:	ep	- error pointer returned
5441  * RETURNS:	int	- 0 - success, -1 - error
5442  * PURPOSE:	validate and display metadb configuration.  begin by getting
5443  *		all soft partitions built on the specified component.  get
5444  *		the unit structure for each one and validate the fields within.
5445  */
5446 static int
5447 meta_sp_validate_unit(
5448 	mdsetname_t	*sp,
5449 	mdname_t	*compnp,
5450 	mdcmdopts_t	options,
5451 	md_error_t	*ep
5452 )
5453 {
5454 	md_sp_t		*msp;
5455 	mdnamelist_t	*spnlp = NULL;
5456 	mdnamelist_t	*namep = NULL;
5457 	int		count;
5458 	uint_t		extn;
5459 	sp_ext_length_t	size;
5460 
5461 	if ((options & MDCMD_VERBOSE) != 0)
5462 		(void) printf(dgettext(TEXT_DOMAIN,
5463 		    "%s: Validating soft partition metadb entries.\n"),
5464 		    compnp->cname);
5465 
5466 	if ((size = metagetsize(compnp, ep)) == MD_DISKADDR_ERROR)
5467 		return (-1);
5468 
5469 	/* get all soft partitions on component */
5470 	count = meta_sp_get_by_component(sp, compnp, &spnlp, 0, ep);
5471 
5472 	if (count == 0) {
5473 		(void) printf(dgettext(TEXT_DOMAIN,
5474 		    "%s: No soft partitions.\n"), compnp->cname);
5475 		return (mdmderror(ep, MDE_RECOVER_FAILED, 0, compnp->cname));
5476 	} else if (count < 0) {
5477 		return (-1);
5478 	}
5479 
5480 	/* Now go through the soft partitions and check each one */
5481 	for (namep = spnlp; namep != NULL; namep = namep->next) {
5482 		mdname_t	*curnp = namep->namep;
5483 		sp_ext_offset_t	curvoff;
5484 
5485 		/* get the unit structure */
5486 		if ((msp = meta_get_sp_common(sp, curnp, 0, ep)) == NULL)
5487 			return (-1);
5488 
5489 		/* verify generic unit structure parameters */
5490 		if ((options & MDCMD_VERBOSE) != 0)
5491 			(void) printf(dgettext(TEXT_DOMAIN,
5492 			    "\nVerifying device %s.\n"),
5493 			    curnp->cname);
5494 
5495 		/*
5496 		 * MD_SP_LAST is an invalid state and is always the
5497 		 * highest numbered.
5498 		 */
5499 		if (msp->status >= MD_SP_LAST) {
5500 			(void) printf(dgettext(TEXT_DOMAIN,
5501 			    "%s: status value %u is out of range.\n"),
5502 			    curnp->cname, msp->status);
5503 			return (mdmderror(ep, MDE_RECOVER_FAILED,
5504 			    0, curnp->cname));
5505 		} else if ((options & MDCMD_VERBOSE) != 0) {
5506 			uint_t	tstate = 0;
5507 
5508 			if (metaismeta(msp->compnamep)) {
5509 				if (meta_get_tstate(msp->common.namep->dev,
5510 				    &tstate, ep) != 0)
5511 					return (-1);
5512 			}
5513 			(void) printf(dgettext(TEXT_DOMAIN,
5514 			    "%s: Status \"%s\" is valid.\n"),
5515 			    curnp->cname, meta_sp_status_to_name(msp->status,
5516 			    tstate & MD_DEV_ERRORED));
5517 		}
5518 
5519 		/* Now verify each extent */
5520 		if ((options & MDCMD_VERBOSE) != 0)
5521 			(void) printf("%14s %21s %21s %21s\n",
5522 			    dgettext(TEXT_DOMAIN, "Extent Number"),
5523 			    dgettext(TEXT_DOMAIN, "Virtual Offset"),
5524 			    dgettext(TEXT_DOMAIN, "Physical Offset"),
5525 			    dgettext(TEXT_DOMAIN, "Length"));
5526 
5527 		curvoff = 0ULL;
5528 		for (extn = 0; extn < msp->ext.ext_len; extn++) {
5529 			md_sp_ext_t	*extp = &msp->ext.ext_val[extn];
5530 
5531 			if ((options & MDCMD_VERBOSE) != 0)
5532 				(void) printf("%14u %21llu %21llu %21llu\n",
5533 				    extn, extp->voff, extp->poff, extp->len);
5534 
5535 			if (extp->voff != curvoff) {
5536 				(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5537 				    "%s: virtual offset for extent %u "
5538 				    "is inconsistent, expected %llu, "
5539 				    "got %llu.\n"), curnp->cname, extn,
5540 				    curvoff, extp->voff);
5541 				return (mdmderror(ep, MDE_RECOVER_FAILED,
5542 				    0, compnp->cname));
5543 			}
5544 
5545 			/* make sure extent does not drop off the end */
5546 			if ((extp->poff + extp->len) == size) {
5547 				(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5548 				    "%s: extent %u at offset %llu, "
5549 				    "length %llu exceeds the size of the "
5550 				    "device, %llu.\n"), curnp->cname,
5551 				    extn, extp->poff, extp->len, size);
5552 				return (mdmderror(ep, MDE_RECOVER_FAILED,
5553 				    0, compnp->cname));
5554 			}
5555 
5556 			curvoff += extp->len;
5557 		}
5558 	}
5559 	if (options & MDCMD_PRINT) {
5560 		(void) printf(dgettext(TEXT_DOMAIN,
5561 		    "%s: Soft Partition metadb configuration is valid\n"),
5562 		    compnp->cname);
5563 	}
5564 	return (0);
5565 }
5566 
5567 /*
5568  * FUNCTION:	meta_sp_validate_wm_and_unit()
5569  * INPUT:	sp	- name of set we are recovering in
5570  *		compnp	- name of device we are recovering from
5571  *		options	- metarecover options
5572  * OUTPUT:	ep	- error pointer returned
5573  * RETURNS:	int	- 0 - success, -1 error
5574  * PURPOSE:	cross-validate and display watermarks and metadb records.
5575  *		get both the unit structures for the soft partitions built
5576  *		on the specified component and the watermarks found on that
5577  *		component and check to make sure they are consistent with
5578  *		each other.
5579  */
5580 static int
5581 meta_sp_validate_wm_and_unit(
5582 	mdsetname_t	*sp,
5583 	mdname_t	*np,
5584 	mdcmdopts_t	options,
5585 	md_error_t	*ep
5586 )
5587 {
5588 	sp_ext_node_t	*wmlist = NULL;
5589 	sp_ext_node_t	*unitlist = NULL;
5590 	sp_ext_node_t	*unitext;
5591 	sp_ext_node_t	*wmext;
5592 	sp_ext_offset_t	tmpunitoff;
5593 	mdnamelist_t	*spnlp = NULL;
5594 	int		count;
5595 	int		rval = 0;
5596 	int		verbose = (options & MDCMD_VERBOSE);
5597 
5598 	/* get unit structure list */
5599 	count = meta_sp_get_by_component(sp, np, &spnlp, 0, ep);
5600 	if (count <= 0)
5601 		return (-1);
5602 
5603 	meta_sp_list_insert(NULL, NULL, &unitlist,
5604 	    metagetsize(np, ep) - MD_SP_WMSIZE, MD_SP_WMSIZE,
5605 	    EXTTYP_END, 0, EXTFLG_UPDATE, meta_sp_cmp_by_offset);
5606 
5607 	if (meta_sp_extlist_from_namelist(sp, spnlp, &unitlist, ep) == -1) {
5608 		metafreenamelist(spnlp);
5609 		return (-1);
5610 	}
5611 
5612 	metafreenamelist(spnlp);
5613 
5614 	meta_sp_list_freefill(&unitlist, metagetsize(np, ep));
5615 
5616 	if (meta_sp_extlist_from_wm(sp, np, &wmlist,
5617 	    meta_sp_cmp_by_offset, ep) < 0) {
5618 		meta_sp_list_free(&unitlist);
5619 		return (-1);
5620 	}
5621 
5622 	if (getenv(META_SP_DEBUG)) {
5623 		meta_sp_debug("meta_sp_validate_wm_and_unit: unit list:\n");
5624 		meta_sp_list_dump(unitlist);
5625 		meta_sp_debug("meta_sp_validate_wm_and_unit: wm list:\n");
5626 		meta_sp_list_dump(wmlist);
5627 	}
5628 
5629 	/*
5630 	 * step through both lists and compare allocated nodes.  Free
5631 	 * nodes and end watermarks may differ between the two but
5632 	 * that's generally ok, and if they're wrong will typically
5633 	 * cause misplaced allocated extents.
5634 	 */
5635 	if (verbose)
5636 		(void) printf(dgettext(TEXT_DOMAIN, "\n%s: Verifying metadb "
5637 		    "allocations match extent headers.\n"), np->cname);
5638 
5639 	unitext = unitlist;
5640 	wmext = wmlist;
5641 	while ((wmext != NULL) && (unitext != NULL)) {
5642 		/* find next allocated extents in each list */
5643 		while (wmext != NULL && wmext->ext_type != EXTTYP_ALLOC)
5644 			wmext = wmext->ext_next;
5645 
5646 		while (unitext != NULL && unitext->ext_type != EXTTYP_ALLOC)
5647 			unitext = unitext->ext_next;
5648 
5649 		if (wmext == NULL || unitext == NULL)
5650 			break;
5651 
5652 		if (verbose) {
5653 			(void) printf(dgettext(TEXT_DOMAIN,
5654 			    "Metadb extent:\n"));
5655 			meta_sp_display_exthdr();
5656 			meta_sp_display_ext(unitext);
5657 			(void) printf(dgettext(TEXT_DOMAIN,
5658 			    "Extent header extent:\n"));
5659 			meta_sp_display_exthdr();
5660 			meta_sp_display_ext(wmext);
5661 			(void) printf("\n");
5662 		}
5663 
5664 		if (meta_sp_validate_exts(np, wmext, unitext, ep) < 0)
5665 			rval = -1;
5666 
5667 		/*
5668 		 * if the offsets aren't equal, only increment the
5669 		 * lowest one in hopes of getting the lists back in sync.
5670 		 */
5671 		tmpunitoff = unitext->ext_offset;
5672 		if (unitext->ext_offset <= wmext->ext_offset)
5673 			unitext = unitext->ext_next;
5674 		if (wmext->ext_offset <= tmpunitoff)
5675 			wmext = wmext->ext_next;
5676 	}
5677 
5678 	/*
5679 	 * if both lists aren't at the end then there are extra
5680 	 * allocated nodes in one of them.
5681 	 */
5682 	if (wmext != NULL) {
5683 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5684 		    "%s: extent headers contain allocations not in "
5685 		    "the metadb\n\n"), np->cname);
5686 		rval = -1;
5687 	}
5688 
5689 	if (unitext != NULL) {
5690 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5691 		    "%s: metadb contains allocations not in the extent "
5692 		    "headers\n\n"), np->cname);
5693 		rval = -1;
5694 	}
5695 
5696 	if (options & MDCMD_PRINT) {
5697 		if (rval == 0) {
5698 			(void) printf(dgettext(TEXT_DOMAIN,
5699 			    "%s: Soft Partition metadb matches extent "
5700 			    "header configuration\n"), np->cname);
5701 		} else {
5702 			(void) printf(dgettext(TEXT_DOMAIN,
5703 			    "%s: Soft Partition metadb does not match extent "
5704 			    "header configuration\n"), np->cname);
5705 		}
5706 	}
5707 
5708 	return (rval);
5709 }
5710 
5711 /*
5712  * FUNCTION:	meta_sp_validate_exts()
5713  * INPUT:	compnp	- name pointer for device we are recovering from
5714  *		wmext	- extent node representing watermark
5715  *		unitext	- extent node from unit structure
5716  * OUTPUT:	ep	- return error pointer
5717  * RETURNS:	int	- 0 - succes, mdmderror return code - error
5718  * PURPOSE:	Takes two extent nodes and checks them against each other.
5719  *		offset, length, sequence number, set, and name are compared.
5720  */
5721 static int
5722 meta_sp_validate_exts(
5723 	mdname_t	*compnp,
5724 	sp_ext_node_t	*wmext,
5725 	sp_ext_node_t	*unitext,
5726 	md_error_t	*ep
5727 )
5728 {
5729 	if (wmext->ext_offset != unitext->ext_offset) {
5730 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5731 		    "%s: unit structure and extent header offsets differ.\n"),
5732 		    compnp->cname);
5733 		return (mdmderror(ep, MDE_RECOVER_FAILED, 0, compnp->cname));
5734 	}
5735 
5736 	if (wmext->ext_length != unitext->ext_length) {
5737 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5738 		    "%s: unit structure and extent header lengths differ.\n"),
5739 		    compnp->cname);
5740 		return (mdmderror(ep, MDE_RECOVER_FAILED, 0, compnp->cname));
5741 	}
5742 
5743 	if (wmext->ext_seq != unitext->ext_seq) {
5744 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5745 		    "%s: unit structure and extent header sequence numbers "
5746 		    "differ.\n"), compnp->cname);
5747 		return (mdmderror(ep, MDE_RECOVER_FAILED, 0, compnp->cname));
5748 	}
5749 
5750 	if (wmext->ext_type != unitext->ext_type) {
5751 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5752 		    "%s: unit structure and extent header types differ.\n"),
5753 		    compnp->cname);
5754 		return (mdmderror(ep, MDE_RECOVER_FAILED, 0, compnp->cname));
5755 	}
5756 
5757 	/*
5758 	 * If one has a set pointer and the other doesn't, error.
5759 	 * If both extents have setnames, then make sure they match
5760 	 * If both are NULL, it's ok, they match.
5761 	 */
5762 	if ((unitext->ext_setp == NULL) ^ (wmext->ext_setp == NULL)) {
5763 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5764 		    "%s: unit structure and extent header set values "
5765 		    "differ.\n"), compnp->cname);
5766 		return (mdmderror(ep, MDE_RECOVER_FAILED, 0, compnp->cname));
5767 	}
5768 
5769 	if (unitext->ext_setp != NULL) {
5770 		if (strcmp(unitext->ext_setp->setname,
5771 		    wmext->ext_setp->setname) != 0) {
5772 			(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5773 			    "%s: unit structure and extent header set names "
5774 			    "differ.\n"), compnp->cname);
5775 			return (mdmderror(ep, MDE_RECOVER_FAILED,
5776 			    0, compnp->cname));
5777 		}
5778 	}
5779 
5780 	/*
5781 	 * If one has a name pointer and the other doesn't, error.
5782 	 * If both extents have names, then make sure they match
5783 	 * If both are NULL, it's ok, they match.
5784 	 */
5785 	if ((unitext->ext_namep == NULL) ^ (wmext->ext_namep == NULL)) {
5786 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5787 		    "%s: unit structure and extent header name values "
5788 		    "differ.\n"), compnp->cname);
5789 		return (mdmderror(ep, MDE_RECOVER_FAILED, 0, compnp->cname));
5790 	}
5791 
5792 	if (unitext->ext_namep != NULL) {
5793 		if (strcmp(wmext->ext_namep->cname,
5794 		    unitext->ext_namep->cname) != 0) {
5795 			(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5796 			    "%s: unit structure and extent header names "
5797 			    "differ.\n"), compnp->cname);
5798 			return (mdmderror(ep, MDE_RECOVER_FAILED,
5799 			    0, compnp->cname));
5800 		}
5801 	}
5802 
5803 	return (0);
5804 }
5805 
5806 /*
5807  * FUNCTION:	update_sp_status()
5808  * INPUT:	sp	- name of set we are recovering in
5809  *		minors	- pointer to an array of soft partition minor numbers
5810  *		num_sps	- number of minor numbers in array
5811  *		status	- new status to be applied to all soft parts in array
5812  *		mn_set	- set if current set is a multi-node set
5813  * OUTPUT:	ep	- return error pointer
5814  * RETURNS:	int	- 0 - success, -1 - error
5815  * PURPOSE:	update  status of soft partitions to new status. minors is an
5816  *		array of minor numbers to apply the new status to.
5817  *		If mn_set is set, a message is sent to all nodes in the
5818  *		cluster to update the status locally.
5819  */
5820 static int
5821 update_sp_status(
5822 	mdsetname_t	*sp,
5823 	minor_t		*minors,
5824 	int		num_sps,
5825 	sp_status_t	status,
5826 	bool_t		mn_set,
5827 	md_error_t	*ep
5828 )
5829 {
5830 	int	i;
5831 	int	err = 0;
5832 
5833 	if (mn_set) {
5834 		md_mn_msg_sp_setstat_t	sp_setstat_params;
5835 		int			result;
5836 		md_mn_result_t		*resp = NULL;
5837 
5838 		for (i = 0; i < num_sps; i++) {
5839 			sp_setstat_params.sp_setstat_mnum = minors[i];
5840 			sp_setstat_params.sp_setstat_status = status;
5841 
5842 			result = mdmn_send_message(sp->setno,
5843 			    MD_MN_MSG_SP_SETSTAT, MD_MSGF_DEFAULT_FLAGS,
5844 			    (char *)&sp_setstat_params,
5845 			    sizeof (sp_setstat_params),
5846 			    &resp, ep);
5847 			if (resp != NULL) {
5848 				if (resp->mmr_exitval != 0)
5849 					err = -1;
5850 				free_result(resp);
5851 			}
5852 			if (result != 0) {
5853 				err = -1;
5854 			}
5855 		}
5856 	} else {
5857 		if (meta_sp_setstatus(sp, minors, num_sps, status, ep) < 0)
5858 			err = -1;
5859 	}
5860 	if (err < 0) {
5861 		(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
5862 		    "Error updating status on recovered soft "
5863 		    "partitions.\n"));
5864 	}
5865 	return (err);
5866 }
5867 
5868 /*
5869  * FUNCTION:	meta_sp_recover_from_wm()
5870  * INPUT:	sp	- name of set we are recovering in
5871  *		compnp	- name pointer for component we are recovering from
5872  *		options	- metarecover options
5873  * OUTPUT:	ep	- return error pointer
5874  * RETURNS:	int	- 0 - success, -1 - error
5875  * PURPOSE:	update metadb records to match watermarks.  begin by getting
5876  *		an extlist representing all soft partitions on the component.
5877  *		then build a unit structure for each soft partition.
5878  *		notify user of changes, then commit each soft partition to
5879  *		the metadb one at a time in the "recovering" state.  update
5880  *		any watermarks that may need it	(to reflect possible name
5881  *		changes), and, finally, set the status of all recovered
5882  *		partitions to the "OK" state at once.
5883  */
5884 static int
5885 meta_sp_recover_from_wm(
5886 	mdsetname_t	*sp,
5887 	mdname_t	*compnp,
5888 	mdcmdopts_t	options,
5889 	md_error_t	*ep
5890 )
5891 {
5892 	sp_ext_node_t		*extlist = NULL;
5893 	sp_ext_node_t		*sp_list = NULL;
5894 	sp_ext_node_t		*update_list = NULL;
5895 	sp_ext_node_t		*ext;
5896 	sp_ext_node_t		*sp_ext;
5897 	mp_unit_t		*mp;
5898 	mp_unit_t		**un_array;
5899 	int			numexts = 0, num_sps = 0, i = 0;
5900 	int			err = 0;
5901 	int			not_recovered = 0;
5902 	int			committed = 0;
5903 	sp_ext_length_t		sp_length = 0LL;
5904 	mdnamelist_t		*keynlp = NULL;
5905 	mdname_t		*np;
5906 	mdname_t		*new_np;
5907 	int			new_name;
5908 	md_set_params_t		set_params;
5909 	minor_t			*minors = NULL;
5910 	char			yesno[255];
5911 	char			*yes;
5912 	bool_t			mn_set = 0;
5913 	md_set_desc		*sd;
5914 	mm_unit_t		*mm;
5915 	md_set_mmown_params_t	*ownpar = NULL;
5916 	int			comp_is_mirror = 0;
5917 
5918 	/*
5919 	 * if this component appears in another metadevice already, do
5920 	 * NOT recover from it.
5921 	 */
5922 	if (meta_check_inmeta(sp, compnp, options, 0, -1, ep) != 0)
5923 		return (-1);
5924 
5925 	/* set flag if dealing with a MN set */
5926 	if (!metaislocalset(sp)) {
5927 		if ((sd = metaget_setdesc(sp, ep)) == NULL) {
5928 			return (-1);
5929 		}
5930 		if (MD_MNSET_DESC(sd))
5931 			mn_set = 1;
5932 	}
5933 	/*
5934 	 * for each watermark, build an ext_node, place on list.
5935 	 */
5936 	if (meta_sp_extlist_from_wm(sp, compnp, &extlist,
5937 	    meta_sp_cmp_by_nameseq, ep) < 0)
5938 		return (mdmderror(ep, MDE_RECOVER_FAILED, 0, compnp->cname));
5939 
5940 	assert(extlist != NULL);
5941 
5942 	/* count number of soft partitions */
5943 	for (ext = extlist;
5944 	    ext != NULL && ext->ext_type == EXTTYP_ALLOC;
5945 	    ext = ext->ext_next) {
5946 		if (ext->ext_next != NULL &&
5947 		    ext->ext_next->ext_namep != NULL &&
5948 		    strcmp(ext->ext_next->ext_namep->cname,
5949 		    ext->ext_namep->cname) == 0)
5950 				continue;
5951 		num_sps++;
5952 	}
5953 
5954 	/* allocate array of unit structure pointers */
5955 	un_array = Zalloc(num_sps * sizeof (mp_unit_t *));
5956 
5957 	/*
5958 	 * build unit structures from list of ext_nodes.
5959 	 */
5960 	for (ext = extlist;
5961 	    ext != NULL && ext->ext_type == EXTTYP_ALLOC;
5962 	    ext = ext->ext_next) {
5963 		meta_sp_list_insert(ext->ext_setp, ext->ext_namep,
5964 		    &sp_list, ext->ext_offset, ext->ext_length,
5965 		    ext->ext_type, ext->ext_seq, ext->ext_flags,
5966 		    meta_sp_cmp_by_nameseq);
5967 
5968 		numexts++;
5969 		sp_length += ext->ext_length - MD_SP_WMSIZE;
5970 
5971 		if (ext->ext_next != NULL &&
5972 		    ext->ext_next->ext_namep != NULL &&
5973 		    strcmp(ext->ext_next->ext_namep->cname,
5974 		    ext->ext_namep->cname) == 0)
5975 				continue;
5976 
5977 		/*
5978 		 * if we made it here, we are at a soft partition
5979 		 * boundary in the list.
5980 		 */
5981 		if (getenv(META_SP_DEBUG)) {
5982 			meta_sp_debug("meta_recover_from_wm: dumping wm "
5983 			    "list:\n");
5984 			meta_sp_list_dump(sp_list);
5985 		}
5986 
5987 		assert(sp_list != NULL);
5988 		assert(sp_list->ext_namep != NULL);
5989 
5990 		if ((new_name = meta_sp_resolve_name_conflict(sp,
5991 		    sp_list->ext_namep, &new_np, ep)) < 0) {
5992 			err = 1;
5993 			goto out;
5994 		} else if (new_name) {
5995 			for (sp_ext = sp_list;
5996 			    sp_ext != NULL;
5997 			    sp_ext = sp_ext->ext_next) {
5998 				/*
5999 				 * insert into the update list for
6000 				 * watermark update.
6001 				 */
6002 				meta_sp_list_insert(sp_ext->ext_setp,
6003 				    new_np, &update_list, sp_ext->ext_offset,
6004 				    sp_ext->ext_length, sp_ext->ext_type,
6005 				    sp_ext->ext_seq, EXTFLG_UPDATE,
6006 				    meta_sp_cmp_by_offset);
6007 			}
6008 
6009 		}
6010 		if (options & MDCMD_DOIT) {
6011 			/* store name in namespace */
6012 			if (mn_set) {
6013 				/* send message to all nodes to return key */
6014 				md_mn_msg_addkeyname_t	*send_params;
6015 				int			result;
6016 				md_mn_result_t		*resp = NULL;
6017 				int			message_size;
6018 
6019 				message_size =  sizeof (*send_params) +
6020 				    strlen(compnp->cname) + 1;
6021 				send_params = Zalloc(message_size);
6022 				send_params->addkeyname_setno = sp->setno;
6023 				(void) strcpy(&send_params->addkeyname_name[0],
6024 				    compnp->cname);
6025 				result = mdmn_send_message(sp->setno,
6026 				    MD_MN_MSG_ADDKEYNAME, MD_MSGF_DEFAULT_FLAGS,
6027 				    (char *)send_params, message_size, &resp,
6028 				    ep);
6029 				Free(send_params);
6030 				if (resp != NULL) {
6031 					if (resp->mmr_exitval >= 0) {
6032 						compnp->key =
6033 						    (mdkey_t)resp->mmr_exitval;
6034 					} else {
6035 						err = 1;
6036 						free_result(resp);
6037 						goto out;
6038 					}
6039 					free_result(resp);
6040 				}
6041 				if (result != 0) {
6042 					err = 1;
6043 					goto out;
6044 				}
6045 				(void) metanamelist_append(&keynlp, compnp);
6046 			} else {
6047 				if (add_key_name(sp, compnp, &keynlp,
6048 				    ep) != 0) {
6049 					err = 1;
6050 					goto out;
6051 				}
6052 			}
6053 		}
6054 
6055 		/* create the unit structure */
6056 		if ((mp = meta_sp_createunit(
6057 		    (new_name) ? new_np : sp_list->ext_namep, compnp,
6058 		    sp_list, numexts, sp_length, MD_SP_RECOVER, ep)) == NULL) {
6059 			err = 1;
6060 			goto out;
6061 		}
6062 
6063 		if (getenv(META_SP_DEBUG)) {
6064 			meta_sp_debug("meta_sp_recover_from_wm: "
6065 			    "printing newly created unit structure");
6066 			meta_sp_printunit(mp);
6067 		}
6068 
6069 		/* place in unit structure array */
6070 		un_array[i++] = mp;
6071 
6072 		/* free sp_list */
6073 		meta_sp_list_free(&sp_list);
6074 		sp_list = NULL;
6075 		numexts = 0;
6076 		sp_length = 0LL;
6077 	}
6078 
6079 	/* display configuration updates */
6080 	(void) printf(dgettext(TEXT_DOMAIN,
6081 	    "The following soft partitions were found and will be added to\n"
6082 	    "your metadevice configuration.\n"));
6083 	(void) printf("%5s %15s %18s\n",
6084 	    dgettext(TEXT_DOMAIN, "Name"),
6085 	    dgettext(TEXT_DOMAIN, "Size"),
6086 	    dgettext(TEXT_DOMAIN, "No. of Extents"));
6087 	for (i = 0; i < num_sps; i++) {
6088 		(void) printf("%5s%lu %15llu %9d\n", "d",
6089 		    MD_MIN2UNIT(MD_SID(un_array[i])),
6090 		    un_array[i]->un_length, un_array[i]->un_numexts);
6091 	}
6092 
6093 	if (!(options & MDCMD_DOIT)) {
6094 		not_recovered = 1;
6095 		goto out;
6096 	}
6097 
6098 	/* ask user for confirmation */
6099 	(void) printf(dgettext(TEXT_DOMAIN,
6100 	    "WARNING: You are about to add one or more soft partition\n"
6101 	    "metadevices to your metadevice configuration.  If there\n"
6102 	    "appears to be an error in the soft partition(s) displayed\n"
6103 	    "above, do NOT proceed with this recovery operation.\n"));
6104 	(void) printf(dgettext(TEXT_DOMAIN,
6105 	    "Are you sure you want to do this (yes/no)? "));
6106 
6107 	(void) fflush(stdout);
6108 	if ((fgets(yesno, sizeof (yesno), stdin) == NULL) ||
6109 	    (strlen(yesno) == 1))
6110 		(void) snprintf(yesno, sizeof (yesno), "%s\n",
6111 		    dgettext(TEXT_DOMAIN, "no"));
6112 	yes = dgettext(TEXT_DOMAIN, "yes");
6113 	if (strncasecmp(yesno, yes, strlen(yesno) - 1) != 0) {
6114 		not_recovered = 1;
6115 		goto out;
6116 	}
6117 
6118 	/* commit records one at a time */
6119 	for (i = 0; i < num_sps; i++) {
6120 		(void) memset(&set_params, 0, sizeof (set_params));
6121 		set_params.mnum = MD_SID(un_array[i]);
6122 		set_params.size = (un_array[i])->c.un_size;
6123 		set_params.mdp = (uintptr_t)(un_array[i]);
6124 		set_params.options =
6125 		    meta_check_devicesize(un_array[i]->un_length);
6126 		if (set_params.options == MD_CRO_64BIT) {
6127 			un_array[i]->c.un_revision |= MD_64BIT_META_DEV;
6128 		} else {
6129 			un_array[i]->c.un_revision &= ~MD_64BIT_META_DEV;
6130 		}
6131 		MD_SETDRIVERNAME(&set_params, MD_SP,
6132 		    MD_MIN2SET(set_params.mnum));
6133 
6134 		np = metamnumname(&sp, MD_SID(un_array[i]), 0, ep);
6135 
6136 		/*
6137 		 * If this is an MN set, send the MD_IOCSET ioctl to all nodes
6138 		 */
6139 		if (mn_set) {
6140 			md_mn_msg_iocset_t	send_params;
6141 			int			result;
6142 			md_mn_result_t		*resp = NULL;
6143 			int			mess_size;
6144 
6145 			/*
6146 			 * Calculate message size. md_mn_msg_iocset_t only
6147 			 * contains one extent, so increment the size to
6148 			 * include all extents
6149 			 */
6150 			mess_size = sizeof (send_params) -
6151 			    sizeof (mp_ext_t) +
6152 			    (un_array[i]->un_numexts * sizeof (mp_ext_t));
6153 
6154 			send_params.iocset_params = set_params;
6155 			(void) memcpy(&send_params.unit, un_array[i],
6156 			    sizeof (*un_array[i]) - sizeof (mp_ext_t) +
6157 			    (un_array[i]->un_numexts * sizeof (mp_ext_t)));
6158 			result = mdmn_send_message(sp->setno,
6159 			    MD_MN_MSG_IOCSET, MD_MSGF_DEFAULT_FLAGS,
6160 			    (char *)&send_params, mess_size, &resp,
6161 			    ep);
6162 			if (resp != NULL) {
6163 				if (resp->mmr_exitval != 0)
6164 					err = 1;
6165 				free_result(resp);
6166 			}
6167 			if (result != 0) {
6168 				err = 1;
6169 			}
6170 		} else {
6171 			if (metaioctl(MD_IOCSET, &set_params, &set_params.mde,
6172 			    np->cname) != 0) {
6173 				err = 1;
6174 			}
6175 		}
6176 
6177 		if (err == 1) {
6178 			(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
6179 			    "%s: Error committing record to metadb.\n"),
6180 			    np->cname);
6181 			goto out;
6182 		}
6183 
6184 		/* note that we've committed a record */
6185 		if (!committed)
6186 			committed = 1;
6187 
6188 		/* update any watermarks that need it */
6189 		if (update_list != NULL) {
6190 			md_sp_t *msp;
6191 
6192 			/*
6193 			 * Check to see if we're trying to create a partition
6194 			 * on a mirror. If so we may have to enforce an
6195 			 * ownership change before writing the watermark out.
6196 			 */
6197 			if (metaismeta(compnp)) {
6198 				char *miscname;
6199 
6200 				miscname = metagetmiscname(compnp, ep);
6201 				if (miscname != NULL)
6202 					comp_is_mirror = (strcmp(miscname,
6203 					    MD_MIRROR) == 0);
6204 				else
6205 					comp_is_mirror = 0;
6206 			}
6207 			/*
6208 			 * If this is a MN set and the component is a mirror,
6209 			 * change ownership to this node in order to write the
6210 			 * watermarks
6211 			 */
6212 			if (mn_set && comp_is_mirror) {
6213 				mm = (mm_unit_t *)meta_get_unit(sp, compnp, ep);
6214 				if (mm == NULL) {
6215 					err = 1;
6216 					goto out;
6217 				} else {
6218 					err = meta_mn_change_owner(&ownpar,
6219 					    sp->setno,
6220 					    meta_getminor(compnp->dev),
6221 					    sd->sd_mn_mynode->nd_nodeid,
6222 					    MD_MN_MM_PREVENT_CHANGE |
6223 					    MD_MN_MM_SPAWN_THREAD);
6224 					if (err != 0)
6225 						goto out;
6226 				}
6227 			}
6228 
6229 			if ((msp = meta_get_sp(sp, np, ep)) == NULL) {
6230 				err = 1;
6231 				(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
6232 				    "%s: Error updating extent headers.\n"),
6233 				    np->cname);
6234 				goto out;
6235 			}
6236 			if (meta_sp_update_wm(sp, msp, update_list, ep) < 0) {
6237 				err = 1;
6238 				(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
6239 				    "%s: Error updating extent headers "
6240 				    "on disk.\n"), np->cname);
6241 				goto out;
6242 			}
6243 		}
6244 		/*
6245 		 * If we have changed ownership earlier and prevented any
6246 		 * ownership changes, we can now allow ownership changes
6247 		 * again.
6248 		 */
6249 		if (ownpar) {
6250 			(void) meta_mn_change_owner(&ownpar, sp->setno,
6251 			    ownpar->d.mnum,
6252 			    ownpar->d.owner,
6253 			    MD_MN_MM_ALLOW_CHANGE | MD_MN_MM_SPAWN_THREAD);
6254 		}
6255 	}
6256 
6257 	/* update status of all soft partitions to OK */
6258 	minors = Zalloc(num_sps * sizeof (minor_t));
6259 	for (i = 0; i < num_sps; i++)
6260 		minors[i] = MD_SID(un_array[i]);
6261 
6262 	err = update_sp_status(sp, minors, num_sps, MD_SP_OK, mn_set, ep);
6263 	if (err != 0)
6264 		goto out;
6265 
6266 	if (options & MDCMD_PRINT)
6267 		(void) printf(dgettext(TEXT_DOMAIN, "%s: "
6268 		    "Soft Partitions recovered from device.\n"),
6269 		    compnp->cname);
6270 out:
6271 	/* free memory */
6272 	if (extlist != NULL)
6273 		meta_sp_list_free(&extlist);
6274 	if (sp_list != NULL)
6275 		meta_sp_list_free(&sp_list);
6276 	if (update_list != NULL)
6277 		meta_sp_list_free(&update_list);
6278 	if (un_array != NULL)	{
6279 		for (i = 0; i < num_sps; i++)
6280 			Free(un_array[i]);
6281 		Free(un_array);
6282 	}
6283 	if (minors != NULL)
6284 		Free(minors);
6285 	if (ownpar != NULL)
6286 		Free(ownpar);
6287 	(void) fflush(stdout);
6288 
6289 	if ((keynlp != NULL) && (committed != 1)) {
6290 		/*
6291 		 * if we haven't committed any softparts, either because of an
6292 		 * error or because the user decided not to proceed, delete
6293 		 * namelist key for the component
6294 		 */
6295 		if (mn_set) {
6296 			mdnamelist_t	*p;
6297 
6298 			for (p = keynlp; (p != NULL); p = p->next) {
6299 				mdname_t		*np = p->namep;
6300 				md_mn_msg_delkeyname_t	send_params;
6301 				md_mn_result_t		*resp = NULL;
6302 
6303 				send_params.delkeyname_dev = np->dev;
6304 				send_params.delkeyname_setno = sp->setno;
6305 				send_params.delkeyname_key = np->key;
6306 				(void) mdmn_send_message(sp->setno,
6307 				    MD_MN_MSG_DELKEYNAME, MD_MSGF_DEFAULT_FLAGS,
6308 				    (char *)&send_params, sizeof (send_params),
6309 				    &resp, ep);
6310 				if (resp != NULL) {
6311 					free_result(resp);
6312 				}
6313 			}
6314 		} else {
6315 			(void) del_key_names(sp, keynlp, NULL);
6316 		}
6317 	}
6318 
6319 	metafreenamelist(keynlp);
6320 
6321 	if (err)
6322 		return (mdmderror(ep, MDE_RECOVER_FAILED, 0, compnp->cname));
6323 
6324 	if (not_recovered)
6325 		if (options & MDCMD_PRINT)
6326 			(void) printf(dgettext(TEXT_DOMAIN, "%s: "
6327 			    "Soft Partitions NOT recovered from device.\n"),
6328 			    compnp->cname);
6329 	return (0);
6330 }
6331 
6332 /*
6333  * FUNCTION:	meta_sp_recover_from_unit()
6334  * INPUT:	sp	- name of set we are recovering in
6335  *		compnp	- name of component we are recovering from
6336  *		options	- metarecover options
6337  * OUTPUT:	ep	- return error pointer
6338  * RETURNS:	int	- 0 - success, -1 - error
6339  * PURPOSE:	update watermarks to match metadb records.  begin by getting
6340  *		a namelist representing all soft partitions on the specified
6341  *		component.  then, build an extlist representing the soft
6342  *		partitions, filling in the freespace extents.  notify user
6343  *		of changes, place all soft partitions into the "recovering"
6344  *		state and update the watermarks.  finally, return all soft
6345  *		partitions to the "OK" state.
6346  */
6347 static int
6348 meta_sp_recover_from_unit(
6349 	mdsetname_t	*sp,
6350 	mdname_t	*compnp,
6351 	mdcmdopts_t	options,
6352 	md_error_t	*ep
6353 )
6354 {
6355 	mdnamelist_t	*spnlp = NULL;
6356 	mdnamelist_t	*nlp = NULL;
6357 	sp_ext_node_t	*ext = NULL;
6358 	sp_ext_node_t	*extlist = NULL;
6359 	int		count;
6360 	char		yesno[255];
6361 	char		*yes;
6362 	int		rval = 0;
6363 	minor_t		*minors = NULL;
6364 	int		i;
6365 	md_sp_t		*msp;
6366 	md_set_desc	*sd;
6367 	bool_t		mn_set = 0;
6368 	daddr_t		start_block;
6369 
6370 	count = meta_sp_get_by_component(sp, compnp, &spnlp, 0, ep);
6371 	if (count <= 0)
6372 		return (-1);
6373 
6374 	/* set flag if dealing with a MN set */
6375 	if (!metaislocalset(sp)) {
6376 		if ((sd = metaget_setdesc(sp, ep)) == NULL) {
6377 			return (-1);
6378 		}
6379 		if (MD_MNSET_DESC(sd))
6380 			mn_set = 1;
6381 	}
6382 	/*
6383 	 * Save the XDR unit structure for one of the soft partitions;
6384 	 * we'll use this later to provide metadevice context to
6385 	 * update the watermarks so the device can be resolved by
6386 	 * devid instead of dev_t.
6387 	 */
6388 	if ((msp = meta_get_sp(sp, spnlp->namep, ep)) == NULL) {
6389 		metafreenamelist(spnlp);
6390 		return (-1);
6391 	}
6392 
6393 	if ((start_block = meta_sp_get_start(sp, compnp, ep)) ==
6394 	    MD_DISKADDR_ERROR) {
6395 		return (-1);
6396 	}
6397 
6398 	meta_sp_list_insert(NULL, NULL, &extlist, 0ULL, start_block,
6399 	    EXTTYP_RESERVED, 0, 0, meta_sp_cmp_by_offset);
6400 	meta_sp_list_insert(NULL, NULL, &extlist,
6401 	    metagetsize(compnp, ep) - MD_SP_WMSIZE, MD_SP_WMSIZE,
6402 	    EXTTYP_END, 0, EXTFLG_UPDATE, meta_sp_cmp_by_offset);
6403 
6404 	if (meta_sp_extlist_from_namelist(sp, spnlp, &extlist, ep) == -1) {
6405 		metafreenamelist(spnlp);
6406 		return (-1);
6407 	}
6408 
6409 	assert(extlist != NULL);
6410 	if ((options & MDCMD_VERBOSE) != 0) {
6411 		(void) printf(dgettext(TEXT_DOMAIN,
6412 		    "Updating extent headers on device %s from metadb.\n\n"),
6413 		    compnp->cname);
6414 		(void) printf(dgettext(TEXT_DOMAIN,
6415 		    "The following extent headers will be written:\n"));
6416 		meta_sp_display_exthdr();
6417 	}
6418 
6419 	meta_sp_list_freefill(&extlist, metagetsize(compnp, ep));
6420 
6421 	for (ext = extlist; ext != NULL; ext = ext->ext_next) {
6422 
6423 		/* mark every node for updating except the reserved space */
6424 		if (ext->ext_type != EXTTYP_RESERVED) {
6425 			ext->ext_flags |= EXTFLG_UPDATE;
6426 
6427 			/* print extent information */
6428 			if ((options & MDCMD_VERBOSE) != 0)
6429 				meta_sp_display_ext(ext);
6430 		}
6431 	}
6432 
6433 	/* request verification and then update all watermarks */
6434 	if ((options & MDCMD_DOIT) != 0) {
6435 
6436 		(void) printf(dgettext(TEXT_DOMAIN,
6437 		    "\nWARNING: You are about to overwrite portions of %s\n"
6438 		    "with soft partition metadata. The extent headers will be\n"
6439 		    "written to match the existing metadb configuration.  If\n"
6440 		    "the device was not previously setup with this\n"
6441 		    "configuration, data loss may result.\n\n"),
6442 		    compnp->cname);
6443 		(void) printf(dgettext(TEXT_DOMAIN,
6444 		    "Are you sure you want to do this (yes/no)? "));
6445 
6446 		(void) fflush(stdout);
6447 		if ((fgets(yesno, sizeof (yesno), stdin) == NULL) ||
6448 		    (strlen(yesno) == 1))
6449 			(void) snprintf(yesno, sizeof (yesno),
6450 			    "%s\n", dgettext(TEXT_DOMAIN, "no"));
6451 		yes = dgettext(TEXT_DOMAIN, "yes");
6452 		if (strncasecmp(yesno, yes, strlen(yesno) - 1) == 0) {
6453 			/* place soft partitions into recovering state */
6454 			minors = Zalloc(count * sizeof (minor_t));
6455 			for (nlp = spnlp, i = 0;
6456 			    nlp != NULL && i < count;
6457 			    nlp = nlp->next, i++) {
6458 				assert(nlp->namep != NULL);
6459 				minors[i] = meta_getminor(nlp->namep->dev);
6460 			}
6461 			if (update_sp_status(sp, minors, count,
6462 			    MD_SP_RECOVER, mn_set, ep) != 0) {
6463 				rval = -1;
6464 				goto out;
6465 			}
6466 
6467 			/* update the watermarks */
6468 			if (meta_sp_update_wm(sp, msp, extlist, ep) < 0) {
6469 				rval = -1;
6470 				goto out;
6471 			}
6472 
6473 			if (options & MDCMD_PRINT) {
6474 				(void) printf(dgettext(TEXT_DOMAIN, "%s: "
6475 				    "Soft Partitions recovered from metadb\n"),
6476 				    compnp->cname);
6477 			}
6478 
6479 			/* return soft partitions to the OK state */
6480 			if (update_sp_status(sp, minors, count,
6481 			    MD_SP_OK, mn_set, ep) != 0) {
6482 				rval = -1;
6483 				goto out;
6484 			}
6485 
6486 			rval = 0;
6487 			goto out;
6488 		}
6489 	}
6490 
6491 	if (options & MDCMD_PRINT) {
6492 		(void) printf(dgettext(TEXT_DOMAIN,
6493 		    "%s: Soft Partitions NOT recovered from metadb\n"),
6494 		    compnp->cname);
6495 	}
6496 
6497 out:
6498 	if (minors != NULL)
6499 		Free(minors);
6500 	metafreenamelist(spnlp);
6501 	meta_sp_list_free(&extlist);
6502 	(void) fflush(stdout);
6503 	return (rval);
6504 }
6505 
6506 
6507 /*
6508  * FUNCTION:	meta_sp_update_abr()
6509  * INPUT:	sp	- name of set we are recovering in
6510  * OUTPUT:	ep	- return error pointer
6511  * RETURNS:	int	- 0 - success, -1 - error
6512  * PURPOSE:	update the ABR state for all soft partitions in the set. This
6513  *		is called when joining a set. It sends a message to the master
6514  *		node for each soft partition to get the value of tstate and
6515  *		then sets ABR ,if required, by opening the sp, setting ABR
6516  *		and then closing the sp. This approach is taken rather that
6517  *		just issuing the MD_MN_SET_CAP ioctl, in order to deal with
6518  *		the case when we have another node simultaneously unsetting ABR.
6519  */
6520 int
6521 meta_sp_update_abr(
6522 	mdsetname_t	*sp,
6523 	md_error_t	*ep
6524 )
6525 {
6526 	mdnamelist_t	*devnlp = NULL;
6527 	mdnamelist_t	*p;
6528 	mdname_t	*devnp = NULL;
6529 	md_unit_t	*un;
6530 	char		fname[MAXPATHLEN];
6531 	int		mnum, fd;
6532 	volcap_t	vc;
6533 	uint_t		tstate;
6534 
6535 
6536 	if (meta_get_sp_names(sp, &devnlp, 0, ep) < 0) {
6537 		return (-1);
6538 	}
6539 
6540 	/* Exit if no soft partitions in this set */
6541 	if (devnlp == NULL)
6542 		return (0);
6543 
6544 	/* For each soft partition */
6545 	for (p = devnlp; (p != NULL); p = p->next) {
6546 		devnp = p->namep;
6547 
6548 		/* check if this is a top level metadevice */
6549 		if ((un = meta_get_mdunit(sp, devnp, ep)) == NULL)
6550 			goto out;
6551 		if (MD_HAS_PARENT(MD_PARENT(un))) {
6552 			Free(un);
6553 			continue;
6554 		}
6555 		Free(un);
6556 
6557 		/* Get tstate from Master */
6558 		if (meta_mn_send_get_tstate(devnp->dev, &tstate, ep) != 0) {
6559 			mdname_t	*np;
6560 			np = metamnumname(&sp, meta_getminor(devnp->dev), 0,
6561 			    ep);
6562 			if (np) {
6563 				md_perror(dgettext(TEXT_DOMAIN,
6564 				    "Unable to get tstate for %s"), np->cname);
6565 			}
6566 			continue;
6567 		}
6568 		/* If not set on the master, nothing to do */
6569 		if (!(tstate & MD_ABR_CAP))
6570 			continue;
6571 
6572 		mnum = meta_getminor(devnp->dev);
6573 		(void) snprintf(fname, MAXPATHLEN, "/dev/md/%s/rdsk/d%u",
6574 		    sp->setname, (unsigned)MD_MIN2UNIT(mnum));
6575 		if ((fd = open(fname, O_RDWR, 0)) < 0) {
6576 			md_perror(dgettext(TEXT_DOMAIN,
6577 			    "Could not open device %s"), fname);
6578 			continue;
6579 		}
6580 
6581 		/* Set ABR state */
6582 		vc.vc_info = 0;
6583 		vc.vc_set = 0;
6584 		if (ioctl(fd, DKIOCGETVOLCAP, &vc) < 0) {
6585 			(void) close(fd);
6586 			continue;
6587 		}
6588 
6589 		vc.vc_set = DKV_ABR_CAP;
6590 		if (ioctl(fd, DKIOCSETVOLCAP, &vc) < 0) {
6591 			(void) close(fd);
6592 			goto out;
6593 		}
6594 
6595 		(void) close(fd);
6596 	}
6597 	metafreenamelist(devnlp);
6598 	return (0);
6599 out:
6600 	metafreenamelist(devnlp);
6601 	return (-1);
6602 }
6603 
6604 /*
6605  * FUNCTION:	meta_mn_sp_update_abr()
6606  * INPUT:	arg	- Given set.
6607  * PURPOSE:	update the ABR state for all soft partitions in the set by
6608  *		forking a process to call meta_sp_update_abr()
6609  *		This function is only called via rpc.metad when adding a node
6610  *		to a set, ie this node is beong joined to the set by another
6611  *		node.
6612  */
6613 void *
6614 meta_mn_sp_update_abr(void *arg)
6615 {
6616 	set_t		setno = *((set_t *)arg);
6617 	mdsetname_t	*sp;
6618 	md_error_t	mde = mdnullerror;
6619 	int		fval;
6620 
6621 	/* should have a set */
6622 	assert(setno != NULL);
6623 
6624 	if ((sp = metasetnosetname(setno, &mde)) == NULL) {
6625 		mde_perror(&mde, "");
6626 		return (NULL);
6627 	}
6628 
6629 	if (!(meta_is_mn_set(sp, &mde))) {
6630 		mde_perror(&mde, "");
6631 		return (NULL);
6632 	}
6633 
6634 	/* fork a process */
6635 	if ((fval = md_daemonize(sp, &mde)) != 0) {
6636 		/*
6637 		 * md_daemonize will fork off a process.  The is the
6638 		 * parent or error.
6639 		 */
6640 		if (fval > 0) {
6641 			return (NULL);
6642 		}
6643 		mde_perror(&mde, "");
6644 		return (NULL);
6645 	}
6646 	/*
6647 	 * Child process should never return back to rpc.metad, but
6648 	 * should exit.
6649 	 * Flush all internally cached data inherited from parent process
6650 	 * since cached data will be cleared when parent process RPC request
6651 	 * has completed (which is possibly before this child process
6652 	 * can complete).
6653 	 * Child process can retrieve and cache its own copy of data from
6654 	 * rpc.metad that won't be changed by the parent process.
6655 	 *
6656 	 * Reset md_in_daemon since this child will be a client of rpc.metad
6657 	 * not part of the rpc.metad daemon itself.
6658 	 * md_in_daemon is used by rpc.metad so that libmeta can tell if
6659 	 * this thread is rpc.metad or any other thread.  (If this thread
6660 	 * was rpc.metad it could use some short circuit code to get data
6661 	 * directly from rpc.metad instead of doing an RPC call to rpc.metad).
6662 	 */
6663 	md_in_daemon = 0;
6664 	metaflushsetname(sp);
6665 	sr_cache_flush_setno(setno);
6666 	if ((sp = metasetnosetname(setno, &mde)) == NULL) {
6667 		mde_perror(&mde, "");
6668 		md_exit(sp, 1);
6669 	}
6670 
6671 
6672 	/*
6673 	 * Closing stdin/out/err here.
6674 	 */
6675 	(void) close(0);
6676 	(void) close(1);
6677 	(void) close(2);
6678 	assert(fval == 0);
6679 
6680 	(void) meta_sp_update_abr(sp, &mde);
6681 
6682 	md_exit(sp, 0);
6683 	/*NOTREACHED*/
6684 	return (NULL);
6685 }
6686 
6687 int
6688 meta_sp_check_component(
6689 	mdsetname_t	*sp,
6690 	mdname_t	*np,
6691 	md_error_t	*ep
6692 )
6693 {
6694 	md_sp_t	*msp;
6695 	minor_t	mnum = 0;
6696 	md_dev64_t	dev = 0;
6697 	mdnm_params_t	nm;
6698 	md_getdevs_params_t	mgd;
6699 	side_t	sideno;
6700 	char	*miscname;
6701 	md_dev64_t	*mydev = NULL;
6702 	char	*pname, *t;
6703 	char	*ctd_name;
6704 	char	*devname;
6705 	int	len;
6706 	int	rval = -1;
6707 
6708 	(void) memset(&nm, '\0', sizeof (nm));
6709 	if ((msp = meta_get_sp_common(sp, np, 0, ep)) == NULL)
6710 		return (-1);
6711 
6712 	if ((miscname = metagetmiscname(np, ep)) == NULL)
6713 		return (-1);
6714 
6715 	sideno = getmyside(sp, ep);
6716 
6717 	meta_sp_debug("meta_sp_check_component: %s is on %s key: %d"
6718 	    " dev: %llu\n",
6719 	    np->cname, msp->compnamep->cname, msp->compnamep->key,
6720 	    msp->compnamep->dev);
6721 
6722 	/*
6723 	 * Now get the data from the unit structure. The compnamep stuff
6724 	 * contains the data from the namespace and we need the un_dev
6725 	 * from the unit structure.
6726 	 */
6727 	(void) memset(&mgd, '\0', sizeof (mgd));
6728 	MD_SETDRIVERNAME(&mgd, miscname, sp->setno);
6729 	mgd.cnt = 1;		    /* sp's only have one subdevice */
6730 	mgd.mnum = meta_getminor(np->dev);
6731 
6732 	mydev = Zalloc(sizeof (*mydev));
6733 	mgd.devs = (uintptr_t)mydev;
6734 
6735 	if (metaioctl(MD_IOCGET_DEVS, &mgd, &mgd.mde, np->cname) != 0) {
6736 		meta_sp_debug("meta_sp_check_component: ioctl failed\n");
6737 		(void) mdstealerror(ep, &mgd.mde);
6738 		rval = 0;
6739 		goto out;
6740 	} else if (mgd.cnt <= 0) {
6741 		assert(mgd.cnt >= 0);
6742 		rval = 0;
6743 		goto out;
6744 	}
6745 
6746 	/* Get the devname from the name space. */
6747 	if ((devname = meta_getnmentbykey(sp->setno, sideno,
6748 	    msp->compnamep->key, NULL, &mnum, &dev, ep)) == NULL) {
6749 		meta_sp_debug("meta_sp_check_component: key %d not"
6750 		    "found\n", msp->compnamep->key);
6751 		goto out;
6752 	}
6753 
6754 	meta_sp_debug("dev %s from component: (%lu, %lu)\n",
6755 	    devname,
6756 	    meta_getmajor(*mydev),
6757 	    meta_getminor(*mydev));
6758 	meta_sp_debug("minor from the namespace: %lu\n", mnum);
6759 
6760 	if (mnum != meta_getminor(*mydev)) {
6761 		/*
6762 		 * The minor numbers are different. Update the namespace
6763 		 * with the information from the component.
6764 		 */
6765 
6766 		t = strrchr(devname, '/');
6767 		t++;
6768 		ctd_name = Strdup(t);
6769 
6770 		meta_sp_debug("meta_sp_check_component: ctd_name: %s\n",
6771 		    ctd_name);
6772 
6773 		len = strlen(devname);
6774 		t = strrchr(devname, '/');
6775 		t++;
6776 		pname = Zalloc((len - strlen(t)) + 1);
6777 		(void) strncpy(pname, devname, (len - strlen(t)));
6778 		meta_sp_debug("pathname: %s\n", pname);
6779 
6780 		meta_sp_debug("updating the minor number to %lu\n", nm.mnum);
6781 
6782 		if (meta_update_namespace(sp->setno, sideno,
6783 		    ctd_name, *mydev, msp->compnamep->key, pname,
6784 		    ep) != 0) {
6785 			goto out;
6786 		}
6787 	}
6788 out:
6789 	if (pname != NULL)
6790 		Free(pname);
6791 	if (ctd_name != NULL)
6792 		Free(ctd_name);
6793 	if (devname != NULL)
6794 		Free(devname);
6795 	if (mydev != NULL)
6796 		Free(mydev);
6797 	return (rval);
6798 }
6799