xref: /illumos-gate/usr/src/cmd/zpool/zpool_vdev.c (revision a1cdd5a67f3bf3e60db3f3a77baef63640ad91a4)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24  * Copyright (c) 2013, 2018 by Delphix. All rights reserved.
25  * Copyright (c) 2016, 2017 Intel Corporation.
26  * Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>.
27  */
28 
29 /*
30  * Functions to convert between a list of vdevs and an nvlist representing the
31  * configuration.  Each entry in the list can be one of:
32  *
33  *	Device vdevs
34  *		disk=(path=..., devid=...)
35  *		file=(path=...)
36  *
37  *	Group vdevs
38  *		raidz[1|2]=(...)
39  *		mirror=(...)
40  *
41  *	Hot spares
42  *
43  * While the underlying implementation supports it, group vdevs cannot contain
44  * other group vdevs.  All userland verification of devices is contained within
45  * this file.  If successful, the nvlist returned can be passed directly to the
46  * kernel; we've done as much verification as possible in userland.
47  *
48  * Hot spares are a special case, and passed down as an array of disk vdevs, at
49  * the same level as the root of the vdev tree.
50  *
51  * The only function exported by this file is 'make_root_vdev'.  The
52  * function performs several passes:
53  *
54  *	1. Construct the vdev specification.  Performs syntax validation and
55  *         makes sure each device is valid.
56  *	2. Check for devices in use.  Using libdiskmgt, makes sure that no
57  *         devices are also in use.  Some can be overridden using the 'force'
58  *         flag, others cannot.
59  *	3. Check for replication errors if the 'force' flag is not specified.
60  *         validates that the replication level is consistent across the
61  *         entire pool.
62  *	4. Call libzfs to label any whole disks with an EFI label.
63  */
64 
65 #include <assert.h>
66 #include <devid.h>
67 #include <errno.h>
68 #include <fcntl.h>
69 #include <libdiskmgt.h>
70 #include <libintl.h>
71 #include <libnvpair.h>
72 #include <limits.h>
73 #include <sys/spa.h>
74 #include <stdio.h>
75 #include <string.h>
76 #include <unistd.h>
77 #include <sys/efi_partition.h>
78 #include <sys/stat.h>
79 #include <sys/vtoc.h>
80 #include <sys/mntent.h>
81 
82 #include "zpool_util.h"
83 
84 #define	BACKUP_SLICE	"s2"
85 
86 /*
87  * For any given vdev specification, we can have multiple errors.  The
88  * vdev_error() function keeps track of whether we have seen an error yet, and
89  * prints out a header if its the first error we've seen.
90  */
91 boolean_t error_seen;
92 boolean_t is_force;
93 
94 /*PRINTFLIKE1*/
95 static void
96 vdev_error(const char *fmt, ...)
97 {
98 	va_list ap;
99 
100 	if (!error_seen) {
101 		(void) fprintf(stderr, gettext("invalid vdev specification\n"));
102 		if (!is_force)
103 			(void) fprintf(stderr, gettext("use '-f' to override "
104 			    "the following errors:\n"));
105 		else
106 			(void) fprintf(stderr, gettext("the following errors "
107 			    "must be manually repaired:\n"));
108 		error_seen = B_TRUE;
109 	}
110 
111 	va_start(ap, fmt);
112 	(void) vfprintf(stderr, fmt, ap);
113 	va_end(ap);
114 }
115 
/*
 * Print a warning on stderr describing a libdiskmgt in-use checking failure.
 * ENXIO and ENODEV are deliberately silent: they are a valid outcome when the
 * device doesn't live in /dev/dsk.
 */
static void
libdiskmgt_error(int error)
{
	if (error != ENXIO && error != ENODEV) {
		(void) fprintf(stderr, gettext("warning: device in use "
		    "checking failed: %s\n"), strerror(error));
	}
}
129 
130 /*
131  * Validate a device, passing the bulk of the work off to libdiskmgt.
132  */
133 static int
134 check_slice(const char *path, int force, boolean_t wholedisk, boolean_t isspare)
135 {
136 	char *msg;
137 	int error = 0;
138 	dm_who_type_t who;
139 
140 	if (force)
141 		who = DM_WHO_ZPOOL_FORCE;
142 	else if (isspare)
143 		who = DM_WHO_ZPOOL_SPARE;
144 	else
145 		who = DM_WHO_ZPOOL;
146 
147 	if (dm_inuse((char *)path, &msg, who, &error) || error) {
148 		if (error != 0) {
149 			libdiskmgt_error(error);
150 			return (0);
151 		} else {
152 			vdev_error("%s", msg);
153 			free(msg);
154 			return (-1);
155 		}
156 	}
157 
158 	/*
159 	 * If we're given a whole disk, ignore overlapping slices since we're
160 	 * about to label it anyway.
161 	 */
162 	error = 0;
163 	if (!wholedisk && !force &&
164 	    (dm_isoverlapping((char *)path, &msg, &error) || error)) {
165 		if (error == 0) {
166 			/* dm_isoverlapping returned -1 */
167 			vdev_error(gettext("%s overlaps with %s\n"), path, msg);
168 			free(msg);
169 			return (-1);
170 		} else if (error != ENODEV) {
171 			/* libdiskmgt's devcache only handles physical drives */
172 			libdiskmgt_error(error);
173 			return (0);
174 		}
175 	}
176 
177 	return (0);
178 }
179 
180 
181 /*
182  * Validate a whole disk.  Iterate over all slices on the disk and make sure
183  * that none is in use by calling check_slice().
184  */
185 static int
186 check_disk(const char *name, dm_descriptor_t disk, int force, int isspare)
187 {
188 	dm_descriptor_t *drive, *media, *slice;
189 	int err = 0;
190 	int i;
191 	int ret;
192 
193 	/*
194 	 * Get the drive associated with this disk.  This should never fail,
195 	 * because we already have an alias handle open for the device.
196 	 */
197 	if ((drive = dm_get_associated_descriptors(disk, DM_DRIVE,
198 	    &err)) == NULL || *drive == 0) {
199 		if (err)
200 			libdiskmgt_error(err);
201 		return (0);
202 	}
203 
204 	if ((media = dm_get_associated_descriptors(*drive, DM_MEDIA,
205 	    &err)) == NULL) {
206 		dm_free_descriptors(drive);
207 		if (err)
208 			libdiskmgt_error(err);
209 		return (0);
210 	}
211 
212 	dm_free_descriptors(drive);
213 
214 	/*
215 	 * It is possible that the user has specified a removable media drive,
216 	 * and the media is not present.
217 	 */
218 	if (*media == 0) {
219 		dm_free_descriptors(media);
220 		vdev_error(gettext("'%s' has no media in drive\n"), name);
221 		return (-1);
222 	}
223 
224 	if ((slice = dm_get_associated_descriptors(*media, DM_SLICE,
225 	    &err)) == NULL) {
226 		dm_free_descriptors(media);
227 		if (err)
228 			libdiskmgt_error(err);
229 		return (0);
230 	}
231 
232 	dm_free_descriptors(media);
233 
234 	ret = 0;
235 
236 	/*
237 	 * Iterate over all slices and report any errors.  We don't care about
238 	 * overlapping slices because we are using the whole disk.
239 	 */
240 	for (i = 0; slice[i] != 0; i++) {
241 		char *name = dm_get_name(slice[i], &err);
242 
243 		if (check_slice(name, force, B_TRUE, isspare) != 0)
244 			ret = -1;
245 
246 		dm_free_name(name);
247 	}
248 
249 	dm_free_descriptors(slice);
250 	return (ret);
251 }
252 
253 /*
254  * Validate a device.
255  */
256 static int
257 check_device(const char *path, boolean_t force, boolean_t isspare)
258 {
259 	dm_descriptor_t desc;
260 	int err;
261 	char *dev;
262 
263 	/*
264 	 * For whole disks, libdiskmgt does not include the leading dev path.
265 	 */
266 	dev = strrchr(path, '/');
267 	assert(dev != NULL);
268 	dev++;
269 	if ((desc = dm_get_descriptor_by_name(DM_ALIAS, dev, &err)) != 0) {
270 		err = check_disk(path, desc, force, isspare);
271 		dm_free_descriptor(desc);
272 		return (err);
273 	}
274 
275 	return (check_slice(path, force, B_FALSE, isspare));
276 }
277 
278 /*
279  * Check that a file is valid.  All we can do in this case is check that it's
280  * not in use by another pool, and not in use by swap.
281  */
282 static int
283 check_file(const char *file, boolean_t force, boolean_t isspare)
284 {
285 	char  *name;
286 	int fd;
287 	int ret = 0;
288 	int err;
289 	pool_state_t state;
290 	boolean_t inuse;
291 
292 	if (dm_inuse_swap(file, &err)) {
293 		if (err)
294 			libdiskmgt_error(err);
295 		else
296 			vdev_error(gettext("%s is currently used by swap. "
297 			    "Please see swap(1M).\n"), file);
298 		return (-1);
299 	}
300 
301 	if ((fd = open(file, O_RDONLY)) < 0)
302 		return (0);
303 
304 	if (zpool_in_use(g_zfs, fd, &state, &name, &inuse) == 0 && inuse) {
305 		const char *desc;
306 
307 		switch (state) {
308 		case POOL_STATE_ACTIVE:
309 			desc = gettext("active");
310 			break;
311 
312 		case POOL_STATE_EXPORTED:
313 			desc = gettext("exported");
314 			break;
315 
316 		case POOL_STATE_POTENTIALLY_ACTIVE:
317 			desc = gettext("potentially active");
318 			break;
319 
320 		default:
321 			desc = gettext("unknown");
322 			break;
323 		}
324 
325 		/*
326 		 * Allow hot spares to be shared between pools.
327 		 */
328 		if (state == POOL_STATE_SPARE && isspare)
329 			return (0);
330 
331 		if (state == POOL_STATE_ACTIVE ||
332 		    state == POOL_STATE_SPARE || !force) {
333 			switch (state) {
334 			case POOL_STATE_SPARE:
335 				vdev_error(gettext("%s is reserved as a hot "
336 				    "spare for pool %s\n"), file, name);
337 				break;
338 			default:
339 				vdev_error(gettext("%s is part of %s pool "
340 				    "'%s'\n"), file, desc, name);
341 				break;
342 			}
343 			ret = -1;
344 		}
345 
346 		free(name);
347 	}
348 
349 	(void) close(fd);
350 	return (ret);
351 }
352 
353 
354 /*
355  * By "whole disk" we mean an entire physical disk (something we can
356  * label, toggle the write cache on, etc.) as opposed to the full
357  * capacity of a pseudo-device such as lofi or did.  We act as if we
358  * are labeling the disk, which should be a pretty good test of whether
359  * it's a viable device or not.  Returns B_TRUE if it is and B_FALSE if
360  * it isn't.
361  */
362 static boolean_t
363 is_whole_disk(const char *arg)
364 {
365 	struct dk_gpt *label;
366 	int	fd;
367 	char	path[MAXPATHLEN];
368 
369 	(void) snprintf(path, sizeof (path), "%s%s%s",
370 	    ZFS_RDISK_ROOT, strrchr(arg, '/'), BACKUP_SLICE);
371 	if ((fd = open(path, O_RDWR | O_NDELAY)) < 0)
372 		return (B_FALSE);
373 	if (efi_alloc_and_init(fd, EFI_NUMPAR, &label) != 0) {
374 		(void) close(fd);
375 		return (B_FALSE);
376 	}
377 	efi_free(label);
378 	(void) close(fd);
379 	return (B_TRUE);
380 }
381 
382 /*
383  * Create a leaf vdev.  Determine if this is a file or a device.  If it's a
384  * device, fill in the device id to make a complete nvlist.  Valid forms for a
385  * leaf vdev are:
386  *
387  *	/dev/dsk/xxx	Complete disk path
388  *	/xxx		Full path to file
389  *	xxx		Shorthand for /dev/dsk/xxx
390  */
391 static nvlist_t *
392 make_leaf_vdev(nvlist_t *props, const char *arg, uint64_t is_log)
393 {
394 	char path[MAXPATHLEN];
395 	struct stat64 statbuf;
396 	nvlist_t *vdev = NULL;
397 	char *type = NULL;
398 	boolean_t wholedisk = B_FALSE;
399 	uint64_t ashift = 0;
400 
401 	/*
402 	 * Determine what type of vdev this is, and put the full path into
403 	 * 'path'.  We detect whether this is a device of file afterwards by
404 	 * checking the st_mode of the file.
405 	 */
406 	if (arg[0] == '/') {
407 		/*
408 		 * Complete device or file path.  Exact type is determined by
409 		 * examining the file descriptor afterwards.
410 		 */
411 		wholedisk = is_whole_disk(arg);
412 		if (!wholedisk && (stat64(arg, &statbuf) != 0)) {
413 			(void) fprintf(stderr,
414 			    gettext("cannot open '%s': %s\n"),
415 			    arg, strerror(errno));
416 			return (NULL);
417 		}
418 
419 		(void) strlcpy(path, arg, sizeof (path));
420 	} else {
421 		/*
422 		 * This may be a short path for a device, or it could be total
423 		 * gibberish.  Check to see if it's a known device in
424 		 * /dev/dsk/.  As part of this check, see if we've been given a
425 		 * an entire disk (minus the slice number).
426 		 */
427 		(void) snprintf(path, sizeof (path), "%s/%s", ZFS_DISK_ROOT,
428 		    arg);
429 		wholedisk = is_whole_disk(path);
430 		if (!wholedisk && (stat64(path, &statbuf) != 0)) {
431 			/*
432 			 * If we got ENOENT, then the user gave us
433 			 * gibberish, so try to direct them with a
434 			 * reasonable error message.  Otherwise,
435 			 * regurgitate strerror() since it's the best we
436 			 * can do.
437 			 */
438 			if (errno == ENOENT) {
439 				(void) fprintf(stderr,
440 				    gettext("cannot open '%s': no such "
441 				    "device in %s\n"), arg, ZFS_DISK_ROOT);
442 				(void) fprintf(stderr,
443 				    gettext("must be a full path or "
444 				    "shorthand device name\n"));
445 				return (NULL);
446 			} else {
447 				(void) fprintf(stderr,
448 				    gettext("cannot open '%s': %s\n"),
449 				    path, strerror(errno));
450 				return (NULL);
451 			}
452 		}
453 	}
454 
455 	/*
456 	 * Determine whether this is a device or a file.
457 	 */
458 	if (wholedisk || S_ISBLK(statbuf.st_mode)) {
459 		type = VDEV_TYPE_DISK;
460 	} else if (S_ISREG(statbuf.st_mode)) {
461 		type = VDEV_TYPE_FILE;
462 	} else {
463 		(void) fprintf(stderr, gettext("cannot use '%s': must be a "
464 		    "block device or regular file\n"), path);
465 		return (NULL);
466 	}
467 
468 	/*
469 	 * Finally, we have the complete device or file, and we know that it is
470 	 * acceptable to use.  Construct the nvlist to describe this vdev.  All
471 	 * vdevs have a 'path' element, and devices also have a 'devid' element.
472 	 */
473 	verify(nvlist_alloc(&vdev, NV_UNIQUE_NAME, 0) == 0);
474 	verify(nvlist_add_string(vdev, ZPOOL_CONFIG_PATH, path) == 0);
475 	verify(nvlist_add_string(vdev, ZPOOL_CONFIG_TYPE, type) == 0);
476 	verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_IS_LOG, is_log) == 0);
477 	if (is_log)
478 		verify(nvlist_add_string(vdev, ZPOOL_CONFIG_ALLOCATION_BIAS,
479 		    VDEV_ALLOC_BIAS_LOG) == 0);
480 	if (strcmp(type, VDEV_TYPE_DISK) == 0)
481 		verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK,
482 		    (uint64_t)wholedisk) == 0);
483 
484 	if (props != NULL) {
485 		char *value = NULL;
486 
487 		if (nvlist_lookup_string(props,
488 		    zpool_prop_to_name(ZPOOL_PROP_ASHIFT), &value) == 0) {
489 			if (zfs_nicestrtonum(NULL, value, &ashift) != 0) {
490 				(void) fprintf(stderr,
491 				    gettext("ashift must be a number.\n"));
492 				return (NULL);
493 			}
494 			if (ashift != 0 &&
495 			    (ashift < ASHIFT_MIN || ashift > ASHIFT_MAX)) {
496 				(void) fprintf(stderr,
497 				    gettext("invalid 'ashift=%" PRIu64 "' "
498 				    "property: only values between %" PRId32 " "
499 				    "and %" PRId32 " are allowed.\n"),
500 				    ashift, ASHIFT_MIN, ASHIFT_MAX);
501 				return (NULL);
502 			}
503 		}
504 	}
505 
506 	/*
507 	 * For a whole disk, defer getting its devid until after labeling it.
508 	 */
509 	if (S_ISBLK(statbuf.st_mode) && !wholedisk) {
510 		/*
511 		 * Get the devid for the device.
512 		 */
513 		int fd;
514 		ddi_devid_t devid;
515 		char *minor = NULL, *devid_str = NULL;
516 
517 		if ((fd = open(path, O_RDONLY)) < 0) {
518 			(void) fprintf(stderr, gettext("cannot open '%s': "
519 			    "%s\n"), path, strerror(errno));
520 			nvlist_free(vdev);
521 			return (NULL);
522 		}
523 
524 		if (devid_get(fd, &devid) == 0) {
525 			if (devid_get_minor_name(fd, &minor) == 0 &&
526 			    (devid_str = devid_str_encode(devid, minor)) !=
527 			    NULL) {
528 				verify(nvlist_add_string(vdev,
529 				    ZPOOL_CONFIG_DEVID, devid_str) == 0);
530 			}
531 			if (devid_str != NULL)
532 				devid_str_free(devid_str);
533 			if (minor != NULL)
534 				devid_str_free(minor);
535 			devid_free(devid);
536 		}
537 
538 		(void) close(fd);
539 	}
540 
541 	if (ashift > 0)
542 		(void) nvlist_add_uint64(vdev, ZPOOL_CONFIG_ASHIFT, ashift);
543 
544 	return (vdev);
545 }
546 
547 /*
548  * Go through and verify the replication level of the pool is consistent.
549  * Performs the following checks:
550  *
551  *	For the new spec, verifies that devices in mirrors and raidz are the
552  *	same size.
553  *
554  *	If the current configuration already has inconsistent replication
555  *	levels, ignore any other potential problems in the new spec.
556  *
557  *	Otherwise, make sure that the current spec (if there is one) and the new
558  *	spec have consistent replication levels.
559  *
560  *	If there is no current spec (create), make sure new spec has at least
561  *	one general purpose vdev.
562  */
563 typedef struct replication_level {
564 	char *zprl_type;
565 	uint64_t zprl_children;
566 	uint64_t zprl_parity;
567 } replication_level_t;
568 
569 #define	ZPOOL_FUZZ	(16 * 1024 * 1024)
570 
571 static boolean_t
572 is_raidz_mirror(replication_level_t *a, replication_level_t *b,
573     replication_level_t **raidz, replication_level_t **mirror)
574 {
575 	if (strcmp(a->zprl_type, "raidz") == 0 &&
576 	    strcmp(b->zprl_type, "mirror") == 0) {
577 		*raidz = a;
578 		*mirror = b;
579 		return (B_TRUE);
580 	}
581 	return (B_FALSE);
582 }
583 
584 /*
585  * Given a list of toplevel vdevs, return the current replication level.  If
586  * the config is inconsistent, then NULL is returned.  If 'fatal' is set, then
587  * an error message will be displayed for each self-inconsistent vdev.
588  */
589 static replication_level_t *
590 get_replication(nvlist_t *nvroot, boolean_t fatal)
591 {
592 	nvlist_t **top;
593 	uint_t t, toplevels;
594 	nvlist_t **child;
595 	uint_t c, children;
596 	nvlist_t *nv;
597 	char *type;
598 	replication_level_t lastrep = {0};
599 	replication_level_t rep;
600 	replication_level_t *ret;
601 	replication_level_t *raidz, *mirror;
602 	boolean_t dontreport;
603 
604 	ret = safe_malloc(sizeof (replication_level_t));
605 
606 	verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
607 	    &top, &toplevels) == 0);
608 
609 	for (t = 0; t < toplevels; t++) {
610 		uint64_t is_log = B_FALSE;
611 
612 		nv = top[t];
613 
614 		/*
615 		 * For separate logs we ignore the top level vdev replication
616 		 * constraints.
617 		 */
618 		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &is_log);
619 		if (is_log)
620 			continue;
621 
622 		verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE,
623 		    &type) == 0);
624 		if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
625 		    &child, &children) != 0) {
626 			/*
627 			 * This is a 'file' or 'disk' vdev.
628 			 */
629 			rep.zprl_type = type;
630 			rep.zprl_children = 1;
631 			rep.zprl_parity = 0;
632 		} else {
633 			uint64_t vdev_size;
634 
635 			/*
636 			 * This is a mirror or RAID-Z vdev.  Go through and make
637 			 * sure the contents are all the same (files vs. disks),
638 			 * keeping track of the number of elements in the
639 			 * process.
640 			 *
641 			 * We also check that the size of each vdev (if it can
642 			 * be determined) is the same.
643 			 */
644 			rep.zprl_type = type;
645 			rep.zprl_children = 0;
646 
647 			if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) {
648 				verify(nvlist_lookup_uint64(nv,
649 				    ZPOOL_CONFIG_NPARITY,
650 				    &rep.zprl_parity) == 0);
651 				assert(rep.zprl_parity != 0);
652 			} else {
653 				rep.zprl_parity = 0;
654 			}
655 
656 			/*
657 			 * The 'dontreport' variable indicates that we've
658 			 * already reported an error for this spec, so don't
659 			 * bother doing it again.
660 			 */
661 			type = NULL;
662 			dontreport = 0;
663 			vdev_size = -1ULL;
664 			for (c = 0; c < children; c++) {
665 				nvlist_t *cnv = child[c];
666 				char *path;
667 				struct stat64 statbuf;
668 				uint64_t size = -1ULL;
669 				char *childtype;
670 				int fd, err;
671 
672 				rep.zprl_children++;
673 
674 				verify(nvlist_lookup_string(cnv,
675 				    ZPOOL_CONFIG_TYPE, &childtype) == 0);
676 
677 				/*
678 				 * If this is a replacing or spare vdev, then
679 				 * get the real first child of the vdev: do this
680 				 * in a loop because replacing and spare vdevs
681 				 * can be nested.
682 				 */
683 				while (strcmp(childtype,
684 				    VDEV_TYPE_REPLACING) == 0 ||
685 				    strcmp(childtype, VDEV_TYPE_SPARE) == 0) {
686 					nvlist_t **rchild;
687 					uint_t rchildren;
688 
689 					verify(nvlist_lookup_nvlist_array(cnv,
690 					    ZPOOL_CONFIG_CHILDREN, &rchild,
691 					    &rchildren) == 0);
692 					assert(rchildren == 2);
693 					cnv = rchild[0];
694 
695 					verify(nvlist_lookup_string(cnv,
696 					    ZPOOL_CONFIG_TYPE,
697 					    &childtype) == 0);
698 				}
699 
700 				verify(nvlist_lookup_string(cnv,
701 				    ZPOOL_CONFIG_PATH, &path) == 0);
702 
703 				/*
704 				 * If we have a raidz/mirror that combines disks
705 				 * with files, report it as an error.
706 				 */
707 				if (!dontreport && type != NULL &&
708 				    strcmp(type, childtype) != 0) {
709 					if (ret != NULL)
710 						free(ret);
711 					ret = NULL;
712 					if (fatal)
713 						vdev_error(gettext(
714 						    "mismatched replication "
715 						    "level: %s contains both "
716 						    "files and devices\n"),
717 						    rep.zprl_type);
718 					else
719 						return (NULL);
720 					dontreport = B_TRUE;
721 				}
722 
723 				/*
724 				 * According to stat(2), the value of 'st_size'
725 				 * is undefined for block devices and character
726 				 * devices.  But there is no effective way to
727 				 * determine the real size in userland.
728 				 *
729 				 * Instead, we'll take advantage of an
730 				 * implementation detail of spec_size().  If the
731 				 * device is currently open, then we (should)
732 				 * return a valid size.
733 				 *
734 				 * If we still don't get a valid size (indicated
735 				 * by a size of 0 or MAXOFFSET_T), then ignore
736 				 * this device altogether.
737 				 */
738 				if ((fd = open(path, O_RDONLY)) >= 0) {
739 					err = fstat64(fd, &statbuf);
740 					(void) close(fd);
741 				} else {
742 					err = stat64(path, &statbuf);
743 				}
744 
745 				if (err != 0 ||
746 				    statbuf.st_size == 0 ||
747 				    statbuf.st_size == MAXOFFSET_T)
748 					continue;
749 
750 				size = statbuf.st_size;
751 
752 				/*
753 				 * Also make sure that devices and
754 				 * slices have a consistent size.  If
755 				 * they differ by a significant amount
756 				 * (~16MB) then report an error.
757 				 */
758 				if (!dontreport &&
759 				    (vdev_size != -1ULL &&
760 				    (labs(size - vdev_size) >
761 				    ZPOOL_FUZZ))) {
762 					if (ret != NULL)
763 						free(ret);
764 					ret = NULL;
765 					if (fatal)
766 						vdev_error(gettext(
767 						    "%s contains devices of "
768 						    "different sizes\n"),
769 						    rep.zprl_type);
770 					else
771 						return (NULL);
772 					dontreport = B_TRUE;
773 				}
774 
775 				type = childtype;
776 				vdev_size = size;
777 			}
778 		}
779 
780 		/*
781 		 * At this point, we have the replication of the last toplevel
782 		 * vdev in 'rep'.  Compare it to 'lastrep' to see if it is
783 		 * different.
784 		 */
785 		if (lastrep.zprl_type != NULL) {
786 			if (is_raidz_mirror(&lastrep, &rep, &raidz, &mirror) ||
787 			    is_raidz_mirror(&rep, &lastrep, &raidz, &mirror)) {
788 				/*
789 				 * Accepted raidz and mirror when they can
790 				 * handle the same number of disk failures.
791 				 */
792 				if (raidz->zprl_parity !=
793 				    mirror->zprl_children - 1) {
794 					if (ret != NULL)
795 						free(ret);
796 					ret = NULL;
797 					if (fatal)
798 						vdev_error(gettext(
799 						    "mismatched replication "
800 						    "level: "
801 						    "%s and %s vdevs with "
802 						    "different redundancy, "
803 						    "%llu vs. %llu (%llu-way) "
804 						    "are present\n"),
805 						    raidz->zprl_type,
806 						    mirror->zprl_type,
807 						    raidz->zprl_parity,
808 						    mirror->zprl_children - 1,
809 						    mirror->zprl_children);
810 					else
811 						return (NULL);
812 				}
813 			} else if (strcmp(lastrep.zprl_type, rep.zprl_type) !=
814 			    0) {
815 				if (ret != NULL)
816 					free(ret);
817 				ret = NULL;
818 				if (fatal)
819 					vdev_error(gettext(
820 					    "mismatched replication level: "
821 					    "both %s and %s vdevs are "
822 					    "present\n"),
823 					    lastrep.zprl_type, rep.zprl_type);
824 				else
825 					return (NULL);
826 			} else if (lastrep.zprl_parity != rep.zprl_parity) {
827 				if (ret)
828 					free(ret);
829 				ret = NULL;
830 				if (fatal)
831 					vdev_error(gettext(
832 					    "mismatched replication level: "
833 					    "both %llu and %llu device parity "
834 					    "%s vdevs are present\n"),
835 					    lastrep.zprl_parity,
836 					    rep.zprl_parity,
837 					    rep.zprl_type);
838 				else
839 					return (NULL);
840 			} else if (lastrep.zprl_children != rep.zprl_children) {
841 				if (ret)
842 					free(ret);
843 				ret = NULL;
844 				if (fatal)
845 					vdev_error(gettext(
846 					    "mismatched replication level: "
847 					    "both %llu-way and %llu-way %s "
848 					    "vdevs are present\n"),
849 					    lastrep.zprl_children,
850 					    rep.zprl_children,
851 					    rep.zprl_type);
852 				else
853 					return (NULL);
854 			}
855 		}
856 		lastrep = rep;
857 	}
858 
859 	if (ret != NULL)
860 		*ret = rep;
861 
862 	return (ret);
863 }
864 
865 /*
866  * Check the replication level of the vdev spec against the current pool.  Calls
867  * get_replication() to make sure the new spec is self-consistent.  If the pool
868  * has a consistent replication level, then we ignore any errors.  Otherwise,
869  * report any difference between the two.
870  */
871 static int
872 check_replication(nvlist_t *config, nvlist_t *newroot)
873 {
874 	nvlist_t **child;
875 	uint_t	children;
876 	replication_level_t *current = NULL, *new;
877 	replication_level_t *raidz, *mirror;
878 	int ret;
879 
880 	/*
881 	 * If we have a current pool configuration, check to see if it's
882 	 * self-consistent.  If not, simply return success.
883 	 */
884 	if (config != NULL) {
885 		nvlist_t *nvroot;
886 
887 		verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
888 		    &nvroot) == 0);
889 		if ((current = get_replication(nvroot, B_FALSE)) == NULL)
890 			return (0);
891 	}
892 	/*
893 	 * for spares there may be no children, and therefore no
894 	 * replication level to check
895 	 */
896 	if ((nvlist_lookup_nvlist_array(newroot, ZPOOL_CONFIG_CHILDREN,
897 	    &child, &children) != 0) || (children == 0)) {
898 		free(current);
899 		return (0);
900 	}
901 
902 	/*
903 	 * If all we have is logs then there's no replication level to check.
904 	 */
905 	if (num_logs(newroot) == children) {
906 		free(current);
907 		return (0);
908 	}
909 
910 	/*
911 	 * Get the replication level of the new vdev spec, reporting any
912 	 * inconsistencies found.
913 	 */
914 	if ((new = get_replication(newroot, B_TRUE)) == NULL) {
915 		free(current);
916 		return (-1);
917 	}
918 
919 	/*
920 	 * Check to see if the new vdev spec matches the replication level of
921 	 * the current pool.
922 	 */
923 	ret = 0;
924 	if (current != NULL) {
925 		if (is_raidz_mirror(current, new, &raidz, &mirror) ||
926 		    is_raidz_mirror(new, current, &raidz, &mirror)) {
927 			if (raidz->zprl_parity != mirror->zprl_children - 1) {
928 				vdev_error(gettext(
929 				    "mismatched replication level: pool and "
930 				    "new vdev with different redundancy, %s "
931 				    "and %s vdevs, %llu vs. %llu (%llu-way)\n"),
932 				    raidz->zprl_type,
933 				    mirror->zprl_type,
934 				    raidz->zprl_parity,
935 				    mirror->zprl_children - 1,
936 				    mirror->zprl_children);
937 				ret = -1;
938 			}
939 		} else if (strcmp(current->zprl_type, new->zprl_type) != 0) {
940 			vdev_error(gettext(
941 			    "mismatched replication level: pool uses %s "
942 			    "and new vdev is %s\n"),
943 			    current->zprl_type, new->zprl_type);
944 			ret = -1;
945 		} else if (current->zprl_parity != new->zprl_parity) {
946 			vdev_error(gettext(
947 			    "mismatched replication level: pool uses %llu "
948 			    "device parity and new vdev uses %llu\n"),
949 			    current->zprl_parity, new->zprl_parity);
950 			ret = -1;
951 		} else if (current->zprl_children != new->zprl_children) {
952 			vdev_error(gettext(
953 			    "mismatched replication level: pool uses %llu-way "
954 			    "%s and new vdev uses %llu-way %s\n"),
955 			    current->zprl_children, current->zprl_type,
956 			    new->zprl_children, new->zprl_type);
957 			ret = -1;
958 		}
959 	}
960 
961 	free(new);
962 	if (current != NULL)
963 		free(current);
964 
965 	return (ret);
966 }
967 
968 /*
969  * Go through and find any whole disks in the vdev specification, labelling them
970  * as appropriate.  When constructing the vdev spec, we were unable to open this
971  * device in order to provide a devid.  Now that we have labelled the disk and
972  * know the pool slice is valid, we can construct the devid now.
973  *
974  * If the disk was already labeled with an EFI label, we will have gotten the
975  * devid already (because we were able to open the whole disk).  Otherwise, we
976  * need to get the devid after we label the disk.
977  */
978 static int
979 make_disks(zpool_handle_t *zhp, nvlist_t *nv, zpool_boot_label_t boot_type,
980     uint64_t boot_size)
981 {
982 	nvlist_t **child;
983 	uint_t c, children;
984 	char *type, *path, *diskname;
985 	char buf[MAXPATHLEN];
986 	uint64_t wholedisk;
987 	int fd;
988 	int ret;
989 	int slice;
990 	ddi_devid_t devid;
991 	char *minor = NULL, *devid_str = NULL;
992 
993 	verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0);
994 
995 	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
996 	    &child, &children) != 0) {
997 
998 		if (strcmp(type, VDEV_TYPE_DISK) != 0)
999 			return (0);
1000 
1001 		/*
1002 		 * We have a disk device.  Get the path to the device
1003 		 * and see if it's a whole disk by appending the backup
1004 		 * slice and stat()ing the device.
1005 		 */
1006 		verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0);
1007 
1008 		diskname = strrchr(path, '/');
1009 		assert(diskname != NULL);
1010 		diskname++;
1011 
1012 		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
1013 		    &wholedisk) != 0 || !wholedisk) {
1014 			/*
1015 			 * This is not whole disk, return error if
1016 			 * boot partition creation was requested
1017 			 */
1018 			if (boot_type == ZPOOL_CREATE_BOOT_LABEL) {
1019 				(void) fprintf(stderr,
1020 				    gettext("creating boot partition is only "
1021 				    "supported on whole disk vdevs: %s\n"),
1022 				    diskname);
1023 				return (-1);
1024 			}
1025 			return (0);
1026 		}
1027 
1028 		ret = zpool_label_disk(g_zfs, zhp, diskname, boot_type,
1029 		    boot_size, &slice);
1030 		if (ret == -1)
1031 			return (ret);
1032 
1033 		/*
1034 		 * Fill in the devid, now that we've labeled the disk.
1035 		 */
1036 		(void) snprintf(buf, sizeof (buf), "%ss%d", path, slice);
1037 		if ((fd = open(buf, O_RDONLY)) < 0) {
1038 			(void) fprintf(stderr,
1039 			    gettext("cannot open '%s': %s\n"),
1040 			    buf, strerror(errno));
1041 			return (-1);
1042 		}
1043 
1044 		if (devid_get(fd, &devid) == 0) {
1045 			if (devid_get_minor_name(fd, &minor) == 0 &&
1046 			    (devid_str = devid_str_encode(devid, minor)) !=
1047 			    NULL) {
1048 				verify(nvlist_add_string(nv,
1049 				    ZPOOL_CONFIG_DEVID, devid_str) == 0);
1050 			}
1051 			if (devid_str != NULL)
1052 				devid_str_free(devid_str);
1053 			if (minor != NULL)
1054 				devid_str_free(minor);
1055 			devid_free(devid);
1056 		}
1057 
1058 		/*
1059 		 * Update the path to refer to the pool slice.  The presence of
1060 		 * the 'whole_disk' field indicates to the CLI that we should
1061 		 * chop off the slice number when displaying the device in
1062 		 * future output.
1063 		 */
1064 		verify(nvlist_add_string(nv, ZPOOL_CONFIG_PATH, buf) == 0);
1065 
1066 		(void) close(fd);
1067 
1068 		return (0);
1069 	}
1070 
1071 	/* illumos kernel does not support booting from multi-vdev pools. */
1072 	if ((boot_type == ZPOOL_CREATE_BOOT_LABEL)) {
1073 		if ((strcmp(type, VDEV_TYPE_ROOT) == 0) && children > 1) {
1074 			(void) fprintf(stderr, gettext("boot pool "
1075 			    "can not have more than one vdev\n"));
1076 			return (-1);
1077 		}
1078 	}
1079 
1080 	for (c = 0; c < children; c++) {
1081 		ret = make_disks(zhp, child[c], boot_type, boot_size);
1082 		if (ret != 0)
1083 			return (ret);
1084 	}
1085 
1086 	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES,
1087 	    &child, &children) == 0)
1088 		for (c = 0; c < children; c++) {
1089 			ret = make_disks(zhp, child[c], boot_type, boot_size);
1090 			if (ret != 0)
1091 				return (ret);
1092 		}
1093 
1094 	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE,
1095 	    &child, &children) == 0)
1096 		for (c = 0; c < children; c++) {
1097 			ret = make_disks(zhp, child[c], boot_type, boot_size);
1098 			if (ret != 0)
1099 				return (ret);
1100 		}
1101 
1102 	return (0);
1103 }
1104 
1105 /*
1106  * Determine if the given path is a hot spare within the given configuration.
1107  */
1108 static boolean_t
1109 is_spare(nvlist_t *config, const char *path)
1110 {
1111 	int fd;
1112 	pool_state_t state;
1113 	char *name = NULL;
1114 	nvlist_t *label;
1115 	uint64_t guid, spareguid;
1116 	nvlist_t *nvroot;
1117 	nvlist_t **spares;
1118 	uint_t i, nspares;
1119 	boolean_t inuse;
1120 
1121 	if ((fd = open(path, O_RDONLY)) < 0)
1122 		return (B_FALSE);
1123 
1124 	if (zpool_in_use(g_zfs, fd, &state, &name, &inuse) != 0 ||
1125 	    !inuse ||
1126 	    state != POOL_STATE_SPARE ||
1127 	    zpool_read_label(fd, &label) != 0) {
1128 		free(name);
1129 		(void) close(fd);
1130 		return (B_FALSE);
1131 	}
1132 	free(name);
1133 	(void) close(fd);
1134 
1135 	verify(nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) == 0);
1136 	nvlist_free(label);
1137 
1138 	verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
1139 	    &nvroot) == 0);
1140 	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
1141 	    &spares, &nspares) == 0) {
1142 		for (i = 0; i < nspares; i++) {
1143 			verify(nvlist_lookup_uint64(spares[i],
1144 			    ZPOOL_CONFIG_GUID, &spareguid) == 0);
1145 			if (spareguid == guid)
1146 				return (B_TRUE);
1147 		}
1148 	}
1149 
1150 	return (B_FALSE);
1151 }
1152 
1153 /*
1154  * Go through and find any devices that are in use.  We rely on libdiskmgt for
1155  * the majority of this task.
1156  */
1157 static boolean_t
1158 is_device_in_use(nvlist_t *config, nvlist_t *nv, boolean_t force,
1159     boolean_t replacing, boolean_t isspare)
1160 {
1161 	nvlist_t **child;
1162 	uint_t c, children;
1163 	char *type, *path;
1164 	int ret = 0;
1165 	char buf[MAXPATHLEN];
1166 	uint64_t wholedisk;
1167 	boolean_t anyinuse = B_FALSE;
1168 
1169 	verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0);
1170 
1171 	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
1172 	    &child, &children) != 0) {
1173 
1174 		verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0);
1175 
1176 		/*
1177 		 * As a generic check, we look to see if this is a replace of a
1178 		 * hot spare within the same pool.  If so, we allow it
1179 		 * regardless of what libdiskmgt or zpool_in_use() says.
1180 		 */
1181 		if (replacing) {
1182 			if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
1183 			    &wholedisk) == 0 && wholedisk)
1184 				(void) snprintf(buf, sizeof (buf), "%ss0",
1185 				    path);
1186 			else
1187 				(void) strlcpy(buf, path, sizeof (buf));
1188 
1189 			if (is_spare(config, buf))
1190 				return (B_FALSE);
1191 		}
1192 
1193 		if (strcmp(type, VDEV_TYPE_DISK) == 0)
1194 			ret = check_device(path, force, isspare);
1195 		else if (strcmp(type, VDEV_TYPE_FILE) == 0)
1196 			ret = check_file(path, force, isspare);
1197 
1198 		return (ret != 0);
1199 	}
1200 
1201 	for (c = 0; c < children; c++)
1202 		if (is_device_in_use(config, child[c], force, replacing,
1203 		    B_FALSE))
1204 			anyinuse = B_TRUE;
1205 
1206 	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES,
1207 	    &child, &children) == 0)
1208 		for (c = 0; c < children; c++)
1209 			if (is_device_in_use(config, child[c], force, replacing,
1210 			    B_TRUE))
1211 				anyinuse = B_TRUE;
1212 
1213 	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE,
1214 	    &child, &children) == 0)
1215 		for (c = 0; c < children; c++)
1216 			if (is_device_in_use(config, child[c], force, replacing,
1217 			    B_FALSE))
1218 				anyinuse = B_TRUE;
1219 
1220 	return (anyinuse);
1221 }
1222 
1223 static const char *
1224 is_grouping(const char *type, int *mindev, int *maxdev)
1225 {
1226 	if (strncmp(type, "raidz", 5) == 0) {
1227 		const char *p = type + 5;
1228 		char *end;
1229 		long nparity;
1230 
1231 		if (*p == '\0') {
1232 			nparity = 1;
1233 		} else if (*p == '0') {
1234 			return (NULL); /* no zero prefixes allowed */
1235 		} else {
1236 			errno = 0;
1237 			nparity = strtol(p, &end, 10);
1238 			if (errno != 0 || nparity < 1 || nparity >= 255 ||
1239 			    *end != '\0')
1240 				return (NULL);
1241 		}
1242 
1243 		if (mindev != NULL)
1244 			*mindev = nparity + 1;
1245 		if (maxdev != NULL)
1246 			*maxdev = 255;
1247 		return (VDEV_TYPE_RAIDZ);
1248 	}
1249 
1250 	if (maxdev != NULL)
1251 		*maxdev = INT_MAX;
1252 
1253 	if (strcmp(type, "mirror") == 0) {
1254 		if (mindev != NULL)
1255 			*mindev = 2;
1256 		return (VDEV_TYPE_MIRROR);
1257 	}
1258 
1259 	if (strcmp(type, "spare") == 0) {
1260 		if (mindev != NULL)
1261 			*mindev = 1;
1262 		return (VDEV_TYPE_SPARE);
1263 	}
1264 
1265 	if (strcmp(type, "log") == 0) {
1266 		if (mindev != NULL)
1267 			*mindev = 1;
1268 		return (VDEV_TYPE_LOG);
1269 	}
1270 
1271 	if (strcmp(type, VDEV_ALLOC_BIAS_SPECIAL) == 0 ||
1272 	    strcmp(type, VDEV_ALLOC_BIAS_DEDUP) == 0) {
1273 		if (mindev != NULL)
1274 			*mindev = 1;
1275 		return (type);
1276 	}
1277 
1278 	if (strcmp(type, "cache") == 0) {
1279 		if (mindev != NULL)
1280 			*mindev = 1;
1281 		return (VDEV_TYPE_L2CACHE);
1282 	}
1283 
1284 	return (NULL);
1285 }
1286 
1287 /*
1288  * Construct a syntactically valid vdev specification,
1289  * and ensure that all devices and files exist and can be opened.
1290  * Note: we don't bother freeing anything in the error paths
1291  * because the program is just going to exit anyway.
1292  */
1293 nvlist_t *
1294 construct_spec(nvlist_t *props, int argc, char **argv)
1295 {
1296 	nvlist_t *nvroot, *nv, **top, **spares, **l2cache;
1297 	int t, toplevels, mindev, maxdev, nspares, nlogs, nl2cache;
1298 	const char *type;
1299 	uint64_t is_log, is_special, is_dedup;
1300 	boolean_t seen_logs;
1301 
1302 	top = NULL;
1303 	toplevels = 0;
1304 	spares = NULL;
1305 	l2cache = NULL;
1306 	nspares = 0;
1307 	nlogs = 0;
1308 	nl2cache = 0;
1309 	is_log = is_special = is_dedup = B_FALSE;
1310 	seen_logs = B_FALSE;
1311 
1312 	while (argc > 0) {
1313 		nv = NULL;
1314 
1315 		/*
1316 		 * If it's a mirror or raidz, the subsequent arguments are
1317 		 * its leaves -- until we encounter the next mirror or raidz.
1318 		 */
1319 		if ((type = is_grouping(argv[0], &mindev, &maxdev)) != NULL) {
1320 			nvlist_t **child = NULL;
1321 			int c, children = 0;
1322 
1323 			if (strcmp(type, VDEV_TYPE_SPARE) == 0) {
1324 				if (spares != NULL) {
1325 					(void) fprintf(stderr,
1326 					    gettext("invalid vdev "
1327 					    "specification: 'spare' can be "
1328 					    "specified only once\n"));
1329 					return (NULL);
1330 				}
1331 				is_log = is_special = is_dedup = B_FALSE;
1332 			}
1333 
1334 			if (strcmp(type, VDEV_TYPE_LOG) == 0) {
1335 				if (seen_logs) {
1336 					(void) fprintf(stderr,
1337 					    gettext("invalid vdev "
1338 					    "specification: 'log' can be "
1339 					    "specified only once\n"));
1340 					return (NULL);
1341 				}
1342 				seen_logs = B_TRUE;
1343 				is_log = B_TRUE;
1344 				is_special = B_FALSE;
1345 				is_dedup = B_FALSE;
1346 				argc--;
1347 				argv++;
1348 				/*
1349 				 * A log is not a real grouping device.
1350 				 * We just set is_log and continue.
1351 				 */
1352 				continue;
1353 			}
1354 
1355 			if (strcmp(type, VDEV_ALLOC_BIAS_SPECIAL) == 0) {
1356 				is_special = B_TRUE;
1357 				is_log = B_FALSE;
1358 				is_dedup = B_FALSE;
1359 				argc--;
1360 				argv++;
1361 				continue;
1362 			}
1363 
1364 			if (strcmp(type, VDEV_ALLOC_BIAS_DEDUP) == 0) {
1365 				is_dedup = B_TRUE;
1366 				is_log = B_FALSE;
1367 				is_special = B_FALSE;
1368 				argc--;
1369 				argv++;
1370 				continue;
1371 			}
1372 
1373 			if (strcmp(type, VDEV_TYPE_L2CACHE) == 0) {
1374 				if (l2cache != NULL) {
1375 					(void) fprintf(stderr,
1376 					    gettext("invalid vdev "
1377 					    "specification: 'cache' can be "
1378 					    "specified only once\n"));
1379 					return (NULL);
1380 				}
1381 				is_log = is_special = is_dedup = B_FALSE;
1382 			}
1383 
1384 			if (is_log || is_special || is_dedup) {
1385 				if (strcmp(type, VDEV_TYPE_MIRROR) != 0) {
1386 					(void) fprintf(stderr,
1387 					    gettext("invalid vdev "
1388 					    "specification: unsupported '%s' "
1389 					    "device: %s\n"), is_log ? "log" :
1390 					    "special", type);
1391 					return (NULL);
1392 				}
1393 				nlogs++;
1394 			}
1395 
1396 			for (c = 1; c < argc; c++) {
1397 				if (is_grouping(argv[c], NULL, NULL) != NULL)
1398 					break;
1399 				children++;
1400 				child = realloc(child,
1401 				    children * sizeof (nvlist_t *));
1402 				if (child == NULL)
1403 					zpool_no_memory();
1404 				if ((nv = make_leaf_vdev(props, argv[c],
1405 				    B_FALSE)) == NULL)
1406 					return (NULL);
1407 				child[children - 1] = nv;
1408 			}
1409 
1410 			if (children < mindev) {
1411 				(void) fprintf(stderr, gettext("invalid vdev "
1412 				    "specification: %s requires at least %d "
1413 				    "devices\n"), argv[0], mindev);
1414 				return (NULL);
1415 			}
1416 
1417 			if (children > maxdev) {
1418 				(void) fprintf(stderr, gettext("invalid vdev "
1419 				    "specification: %s supports no more than "
1420 				    "%d devices\n"), argv[0], maxdev);
1421 				return (NULL);
1422 			}
1423 
1424 			argc -= c;
1425 			argv += c;
1426 
1427 			if (strcmp(type, VDEV_TYPE_SPARE) == 0) {
1428 				spares = child;
1429 				nspares = children;
1430 				continue;
1431 			} else if (strcmp(type, VDEV_TYPE_L2CACHE) == 0) {
1432 				l2cache = child;
1433 				nl2cache = children;
1434 				continue;
1435 			} else {
1436 				/* create a top-level vdev with children */
1437 				verify(nvlist_alloc(&nv, NV_UNIQUE_NAME,
1438 				    0) == 0);
1439 				verify(nvlist_add_string(nv, ZPOOL_CONFIG_TYPE,
1440 				    type) == 0);
1441 				verify(nvlist_add_uint64(nv,
1442 				    ZPOOL_CONFIG_IS_LOG, is_log) == 0);
1443 				if (is_log)
1444 					verify(nvlist_add_string(nv,
1445 					    ZPOOL_CONFIG_ALLOCATION_BIAS,
1446 					    VDEV_ALLOC_BIAS_LOG) == 0);
1447 				if (is_special) {
1448 					verify(nvlist_add_string(nv,
1449 					    ZPOOL_CONFIG_ALLOCATION_BIAS,
1450 					    VDEV_ALLOC_BIAS_SPECIAL) == 0);
1451 				}
1452 				if (is_dedup) {
1453 					verify(nvlist_add_string(nv,
1454 					    ZPOOL_CONFIG_ALLOCATION_BIAS,
1455 					    VDEV_ALLOC_BIAS_DEDUP) == 0);
1456 				}
1457 				if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) {
1458 					verify(nvlist_add_uint64(nv,
1459 					    ZPOOL_CONFIG_NPARITY,
1460 					    mindev - 1) == 0);
1461 				}
1462 				verify(nvlist_add_nvlist_array(nv,
1463 				    ZPOOL_CONFIG_CHILDREN, child,
1464 				    children) == 0);
1465 
1466 				for (c = 0; c < children; c++)
1467 					nvlist_free(child[c]);
1468 				free(child);
1469 			}
1470 		} else {
1471 			/*
1472 			 * We have a device.  Pass off to make_leaf_vdev() to
1473 			 * construct the appropriate nvlist describing the vdev.
1474 			 */
1475 			if ((nv = make_leaf_vdev(props, argv[0], is_log))
1476 			    == NULL)
1477 				return (NULL);
1478 			if (is_log)
1479 				nlogs++;
1480 			if (is_special) {
1481 				verify(nvlist_add_string(nv,
1482 				    ZPOOL_CONFIG_ALLOCATION_BIAS,
1483 				    VDEV_ALLOC_BIAS_SPECIAL) == 0);
1484 			}
1485 			if (is_dedup) {
1486 				verify(nvlist_add_string(nv,
1487 				    ZPOOL_CONFIG_ALLOCATION_BIAS,
1488 				    VDEV_ALLOC_BIAS_DEDUP) == 0);
1489 			}
1490 			argc--;
1491 			argv++;
1492 		}
1493 
1494 		toplevels++;
1495 		top = realloc(top, toplevels * sizeof (nvlist_t *));
1496 		if (top == NULL)
1497 			zpool_no_memory();
1498 		top[toplevels - 1] = nv;
1499 	}
1500 
1501 	if (toplevels == 0 && nspares == 0 && nl2cache == 0) {
1502 		(void) fprintf(stderr, gettext("invalid vdev "
1503 		    "specification: at least one toplevel vdev must be "
1504 		    "specified\n"));
1505 		return (NULL);
1506 	}
1507 
1508 	if (seen_logs && nlogs == 0) {
1509 		(void) fprintf(stderr, gettext("invalid vdev specification: "
1510 		    "log requires at least 1 device\n"));
1511 		return (NULL);
1512 	}
1513 
1514 	/*
1515 	 * Finally, create nvroot and add all top-level vdevs to it.
1516 	 */
1517 	verify(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) == 0);
1518 	verify(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,
1519 	    VDEV_TYPE_ROOT) == 0);
1520 	verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
1521 	    top, toplevels) == 0);
1522 	if (nspares != 0)
1523 		verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
1524 		    spares, nspares) == 0);
1525 	if (nl2cache != 0)
1526 		verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
1527 		    l2cache, nl2cache) == 0);
1528 
1529 	for (t = 0; t < toplevels; t++)
1530 		nvlist_free(top[t]);
1531 	for (t = 0; t < nspares; t++)
1532 		nvlist_free(spares[t]);
1533 	for (t = 0; t < nl2cache; t++)
1534 		nvlist_free(l2cache[t]);
1535 	if (spares)
1536 		free(spares);
1537 	if (l2cache)
1538 		free(l2cache);
1539 	free(top);
1540 
1541 	return (nvroot);
1542 }
1543 
1544 nvlist_t *
1545 split_mirror_vdev(zpool_handle_t *zhp, char *newname, nvlist_t *props,
1546     splitflags_t flags, int argc, char **argv)
1547 {
1548 	nvlist_t *newroot = NULL, **child;
1549 	uint_t c, children;
1550 	zpool_boot_label_t boot_type;
1551 
1552 	if (argc > 0) {
1553 		if ((newroot = construct_spec(props, argc, argv)) == NULL) {
1554 			(void) fprintf(stderr, gettext("Unable to build a "
1555 			    "pool from the specified devices\n"));
1556 			return (NULL);
1557 		}
1558 
1559 		if (zpool_is_bootable(zhp))
1560 			boot_type = ZPOOL_COPY_BOOT_LABEL;
1561 		else
1562 			boot_type = ZPOOL_NO_BOOT_LABEL;
1563 
1564 		if (!flags.dryrun &&
1565 		    make_disks(zhp, newroot, boot_type, 0) != 0) {
1566 			nvlist_free(newroot);
1567 			return (NULL);
1568 		}
1569 
1570 		/* avoid any tricks in the spec */
1571 		verify(nvlist_lookup_nvlist_array(newroot,
1572 		    ZPOOL_CONFIG_CHILDREN, &child, &children) == 0);
1573 		for (c = 0; c < children; c++) {
1574 			char *path;
1575 			const char *type;
1576 			int min, max;
1577 
1578 			verify(nvlist_lookup_string(child[c],
1579 			    ZPOOL_CONFIG_PATH, &path) == 0);
1580 			if ((type = is_grouping(path, &min, &max)) != NULL) {
1581 				(void) fprintf(stderr, gettext("Cannot use "
1582 				    "'%s' as a device for splitting\n"), type);
1583 				nvlist_free(newroot);
1584 				return (NULL);
1585 			}
1586 		}
1587 	}
1588 
1589 	if (zpool_vdev_split(zhp, newname, &newroot, props, flags) != 0) {
1590 		nvlist_free(newroot);
1591 		return (NULL);
1592 	}
1593 
1594 	return (newroot);
1595 }
1596 
1597 static int
1598 num_normal_vdevs(nvlist_t *nvroot)
1599 {
1600 	nvlist_t **top;
1601 	uint_t t, toplevels, normal = 0;
1602 
1603 	verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
1604 	    &top, &toplevels) == 0);
1605 
1606 	for (t = 0; t < toplevels; t++) {
1607 		uint64_t log = B_FALSE;
1608 
1609 		(void) nvlist_lookup_uint64(top[t], ZPOOL_CONFIG_IS_LOG, &log);
1610 		if (log)
1611 			continue;
1612 		if (nvlist_exists(top[t], ZPOOL_CONFIG_ALLOCATION_BIAS))
1613 			continue;
1614 
1615 		normal++;
1616 	}
1617 
1618 	return (normal);
1619 }
1620 
1621 /*
1622  * Get and validate the contents of the given vdev specification.  This ensures
1623  * that the nvlist returned is well-formed, that all the devices exist, and that
1624  * they are not currently in use by any other known consumer.  The 'poolconfig'
1625  * parameter is the current configuration of the pool when adding devices
1626  * existing pool, and is used to perform additional checks, such as changing the
1627  * replication level of the pool.  It can be 'NULL' to indicate that this is a
1628  * new pool.  The 'force' flag controls whether devices should be forcefully
1629  * added, even if they appear in use.
1630  */
1631 nvlist_t *
1632 make_root_vdev(zpool_handle_t *zhp, nvlist_t *props, int force, int check_rep,
1633     boolean_t replacing, boolean_t dryrun, zpool_boot_label_t boot_type,
1634     uint64_t boot_size, int argc, char **argv)
1635 {
1636 	nvlist_t *newroot;
1637 	nvlist_t *poolconfig = NULL;
1638 	is_force = force;
1639 
1640 	/*
1641 	 * Construct the vdev specification.  If this is successful, we know
1642 	 * that we have a valid specification, and that all devices can be
1643 	 * opened.
1644 	 */
1645 	if ((newroot = construct_spec(props, argc, argv)) == NULL)
1646 		return (NULL);
1647 
1648 	if (zhp && ((poolconfig = zpool_get_config(zhp, NULL)) == NULL))
1649 		return (NULL);
1650 
1651 	/*
1652 	 * Validate each device to make sure that its not shared with another
1653 	 * subsystem.  We do this even if 'force' is set, because there are some
1654 	 * uses (such as a dedicated dump device) that even '-f' cannot
1655 	 * override.
1656 	 */
1657 	if (is_device_in_use(poolconfig, newroot, force, replacing, B_FALSE)) {
1658 		nvlist_free(newroot);
1659 		return (NULL);
1660 	}
1661 
1662 	/*
1663 	 * Check the replication level of the given vdevs and report any errors
1664 	 * found.  We include the existing pool spec, if any, as we need to
1665 	 * catch changes against the existing replication level.
1666 	 */
1667 	if (check_rep && check_replication(poolconfig, newroot) != 0) {
1668 		nvlist_free(newroot);
1669 		return (NULL);
1670 	}
1671 
1672 	/*
1673 	 * On pool create the new vdev spec must have one normal vdev.
1674 	 */
1675 	if (poolconfig == NULL && num_normal_vdevs(newroot) == 0) {
1676 		vdev_error(gettext("at least one general top-level vdev must "
1677 		    "be specified\n"));
1678 		nvlist_free(newroot);
1679 		return (NULL);
1680 	}
1681 
1682 	/*
1683 	 * Run through the vdev specification and label any whole disks found.
1684 	 */
1685 	if (!dryrun && make_disks(zhp, newroot, boot_type, boot_size) != 0) {
1686 		nvlist_free(newroot);
1687 		return (NULL);
1688 	}
1689 
1690 	return (newroot);
1691 }
1692