xref: /illumos-gate/usr/src/cmd/zpool/zpool_vdev.c (revision 67d74cc3e7c9d9461311136a0b2069813a3fd927)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24  * Copyright (c) 2013, 2018 by Delphix. All rights reserved.
25  * Copyright (c) 2016, 2017 Intel Corporation.
26  * Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>.
27  */
28 
29 /*
30  * Functions to convert between a list of vdevs and an nvlist representing the
31  * configuration.  Each entry in the list can be one of:
32  *
33  * 	Device vdevs
34  * 		disk=(path=..., devid=...)
35  * 		file=(path=...)
36  *
37  * 	Group vdevs
38  * 		raidz[1|2]=(...)
39  * 		mirror=(...)
40  *
41  * 	Hot spares
42  *
43  * While the underlying implementation supports it, group vdevs cannot contain
44  * other group vdevs.  All userland verification of devices is contained within
45  * this file.  If successful, the nvlist returned can be passed directly to the
46  * kernel; we've done as much verification as possible in userland.
47  *
48  * Hot spares are a special case, and passed down as an array of disk vdevs, at
49  * the same level as the root of the vdev tree.
50  *
51  * The only function exported by this file is 'make_root_vdev'.  The
52  * function performs several passes:
53  *
54  * 	1. Construct the vdev specification.  Performs syntax validation and
55  *         makes sure each device is valid.
 * 	2. Check for devices in use.  Using libdiskmgt, makes sure that none
 *         of the devices is already in use.  Some conflicts can be
 *         overridden using the 'force' flag, others cannot.
 * 	3. Check for replication errors if the 'force' flag is not specified.
 *         Validates that the replication level is consistent across the
 *         entire pool.
62  * 	4. Call libzfs to label any whole disks with an EFI label.
63  */
64 
65 #include <assert.h>
66 #include <devid.h>
67 #include <errno.h>
68 #include <fcntl.h>
69 #include <libdiskmgt.h>
70 #include <libintl.h>
71 #include <libnvpair.h>
72 #include <limits.h>
73 #include <stdio.h>
74 #include <string.h>
75 #include <unistd.h>
76 #include <sys/efi_partition.h>
77 #include <sys/stat.h>
78 #include <sys/vtoc.h>
79 #include <sys/mntent.h>
80 
81 #include "zpool_util.h"
82 
/*
 * Slice suffix that is_whole_disk() appends to a disk name to build the
 * device path it tries to open.
 */
#define	BACKUP_SLICE	"s2"
84 
/*
 * For any given vdev specification, we can have multiple errors.  The
 * vdev_error() function keeps track of whether we have seen an error yet, and
 * prints out a header if it's the first error we've seen.
 *
 * error_seen is set to B_TRUE by vdev_error() once the header has been
 * printed.  is_force selects which hint vdev_error() prints under the header:
 * "use '-f' to override" when it is false, "must be manually repaired" when
 * it is true.
 */
boolean_t error_seen;
boolean_t is_force;
92 
93 /*PRINTFLIKE1*/
94 static void
95 vdev_error(const char *fmt, ...)
96 {
97 	va_list ap;
98 
99 	if (!error_seen) {
100 		(void) fprintf(stderr, gettext("invalid vdev specification\n"));
101 		if (!is_force)
102 			(void) fprintf(stderr, gettext("use '-f' to override "
103 			    "the following errors:\n"));
104 		else
105 			(void) fprintf(stderr, gettext("the following errors "
106 			    "must be manually repaired:\n"));
107 		error_seen = B_TRUE;
108 	}
109 
110 	va_start(ap, fmt);
111 	(void) vfprintf(stderr, fmt, ap);
112 	va_end(ap);
113 }
114 
/*
 * Warn on stderr that libdiskmgt's in-use checking failed with errno value
 * 'error'.  ENXIO and ENODEV are silently ignored.
 */
static void
libdiskmgt_error(int error)
{
	switch (error) {
	case ENXIO:
	case ENODEV:
		/*
		 * These are expected when the device doesn't live in
		 * /dev/dsk, so there is nothing worth telling the user.
		 */
		return;
	default:
		break;
	}

	(void) fprintf(stderr, gettext("warning: device in use checking "
	    "failed: %s\n"), strerror(error));
}
128 
129 /*
130  * Validate a device, passing the bulk of the work off to libdiskmgt.
131  */
132 static int
133 check_slice(const char *path, int force, boolean_t wholedisk, boolean_t isspare)
134 {
135 	char *msg;
136 	int error = 0;
137 	dm_who_type_t who;
138 
139 	if (force)
140 		who = DM_WHO_ZPOOL_FORCE;
141 	else if (isspare)
142 		who = DM_WHO_ZPOOL_SPARE;
143 	else
144 		who = DM_WHO_ZPOOL;
145 
146 	if (dm_inuse((char *)path, &msg, who, &error) || error) {
147 		if (error != 0) {
148 			libdiskmgt_error(error);
149 			return (0);
150 		} else {
151 			vdev_error("%s", msg);
152 			free(msg);
153 			return (-1);
154 		}
155 	}
156 
157 	/*
158 	 * If we're given a whole disk, ignore overlapping slices since we're
159 	 * about to label it anyway.
160 	 */
161 	error = 0;
162 	if (!wholedisk && !force &&
163 	    (dm_isoverlapping((char *)path, &msg, &error) || error)) {
164 		if (error == 0) {
165 			/* dm_isoverlapping returned -1 */
166 			vdev_error(gettext("%s overlaps with %s\n"), path, msg);
167 			free(msg);
168 			return (-1);
169 		} else if (error != ENODEV) {
170 			/* libdiskmgt's devcache only handles physical drives */
171 			libdiskmgt_error(error);
172 			return (0);
173 		}
174 	}
175 
176 	return (0);
177 }
178 
179 
180 /*
181  * Validate a whole disk.  Iterate over all slices on the disk and make sure
182  * that none is in use by calling check_slice().
183  */
184 static int
185 check_disk(const char *name, dm_descriptor_t disk, int force, int isspare)
186 {
187 	dm_descriptor_t *drive, *media, *slice;
188 	int err = 0;
189 	int i;
190 	int ret;
191 
192 	/*
193 	 * Get the drive associated with this disk.  This should never fail,
194 	 * because we already have an alias handle open for the device.
195 	 */
196 	if ((drive = dm_get_associated_descriptors(disk, DM_DRIVE,
197 	    &err)) == NULL || *drive == NULL) {
198 		if (err)
199 			libdiskmgt_error(err);
200 		return (0);
201 	}
202 
203 	if ((media = dm_get_associated_descriptors(*drive, DM_MEDIA,
204 	    &err)) == NULL) {
205 		dm_free_descriptors(drive);
206 		if (err)
207 			libdiskmgt_error(err);
208 		return (0);
209 	}
210 
211 	dm_free_descriptors(drive);
212 
213 	/*
214 	 * It is possible that the user has specified a removable media drive,
215 	 * and the media is not present.
216 	 */
217 	if (*media == NULL) {
218 		dm_free_descriptors(media);
219 		vdev_error(gettext("'%s' has no media in drive\n"), name);
220 		return (-1);
221 	}
222 
223 	if ((slice = dm_get_associated_descriptors(*media, DM_SLICE,
224 	    &err)) == NULL) {
225 		dm_free_descriptors(media);
226 		if (err)
227 			libdiskmgt_error(err);
228 		return (0);
229 	}
230 
231 	dm_free_descriptors(media);
232 
233 	ret = 0;
234 
235 	/*
236 	 * Iterate over all slices and report any errors.  We don't care about
237 	 * overlapping slices because we are using the whole disk.
238 	 */
239 	for (i = 0; slice[i] != NULL; i++) {
240 		char *name = dm_get_name(slice[i], &err);
241 
242 		if (check_slice(name, force, B_TRUE, isspare) != 0)
243 			ret = -1;
244 
245 		dm_free_name(name);
246 	}
247 
248 	dm_free_descriptors(slice);
249 	return (ret);
250 }
251 
252 /*
253  * Validate a device.
254  */
255 static int
256 check_device(const char *path, boolean_t force, boolean_t isspare)
257 {
258 	dm_descriptor_t desc;
259 	int err;
260 	char *dev;
261 
262 	/*
263 	 * For whole disks, libdiskmgt does not include the leading dev path.
264 	 */
265 	dev = strrchr(path, '/');
266 	assert(dev != NULL);
267 	dev++;
268 	if ((desc = dm_get_descriptor_by_name(DM_ALIAS, dev, &err)) != NULL) {
269 		err = check_disk(path, desc, force, isspare);
270 		dm_free_descriptor(desc);
271 		return (err);
272 	}
273 
274 	return (check_slice(path, force, B_FALSE, isspare));
275 }
276 
277 /*
278  * Check that a file is valid.  All we can do in this case is check that it's
279  * not in use by another pool, and not in use by swap.
280  */
281 static int
282 check_file(const char *file, boolean_t force, boolean_t isspare)
283 {
284 	char  *name;
285 	int fd;
286 	int ret = 0;
287 	int err;
288 	pool_state_t state;
289 	boolean_t inuse;
290 
291 	if (dm_inuse_swap(file, &err)) {
292 		if (err)
293 			libdiskmgt_error(err);
294 		else
295 			vdev_error(gettext("%s is currently used by swap. "
296 			    "Please see swap(1M).\n"), file);
297 		return (-1);
298 	}
299 
300 	if ((fd = open(file, O_RDONLY)) < 0)
301 		return (0);
302 
303 	if (zpool_in_use(g_zfs, fd, &state, &name, &inuse) == 0 && inuse) {
304 		const char *desc;
305 
306 		switch (state) {
307 		case POOL_STATE_ACTIVE:
308 			desc = gettext("active");
309 			break;
310 
311 		case POOL_STATE_EXPORTED:
312 			desc = gettext("exported");
313 			break;
314 
315 		case POOL_STATE_POTENTIALLY_ACTIVE:
316 			desc = gettext("potentially active");
317 			break;
318 
319 		default:
320 			desc = gettext("unknown");
321 			break;
322 		}
323 
324 		/*
325 		 * Allow hot spares to be shared between pools.
326 		 */
327 		if (state == POOL_STATE_SPARE && isspare)
328 			return (0);
329 
330 		if (state == POOL_STATE_ACTIVE ||
331 		    state == POOL_STATE_SPARE || !force) {
332 			switch (state) {
333 			case POOL_STATE_SPARE:
334 				vdev_error(gettext("%s is reserved as a hot "
335 				    "spare for pool %s\n"), file, name);
336 				break;
337 			default:
338 				vdev_error(gettext("%s is part of %s pool "
339 				    "'%s'\n"), file, desc, name);
340 				break;
341 			}
342 			ret = -1;
343 		}
344 
345 		free(name);
346 	}
347 
348 	(void) close(fd);
349 	return (ret);
350 }
351 
352 
353 /*
354  * By "whole disk" we mean an entire physical disk (something we can
355  * label, toggle the write cache on, etc.) as opposed to the full
356  * capacity of a pseudo-device such as lofi or did.  We act as if we
357  * are labeling the disk, which should be a pretty good test of whether
358  * it's a viable device or not.  Returns B_TRUE if it is and B_FALSE if
359  * it isn't.
360  */
361 static boolean_t
362 is_whole_disk(const char *arg)
363 {
364 	struct dk_gpt *label;
365 	int	fd;
366 	char	path[MAXPATHLEN];
367 
368 	(void) snprintf(path, sizeof (path), "%s%s%s",
369 	    ZFS_RDISK_ROOT, strrchr(arg, '/'), BACKUP_SLICE);
370 	if ((fd = open(path, O_RDWR | O_NDELAY)) < 0)
371 		return (B_FALSE);
372 	if (efi_alloc_and_init(fd, EFI_NUMPAR, &label) != 0) {
373 		(void) close(fd);
374 		return (B_FALSE);
375 	}
376 	efi_free(label);
377 	(void) close(fd);
378 	return (B_TRUE);
379 }
380 
381 /*
382  * Create a leaf vdev.  Determine if this is a file or a device.  If it's a
383  * device, fill in the device id to make a complete nvlist.  Valid forms for a
384  * leaf vdev are:
385  *
386  * 	/dev/dsk/xxx	Complete disk path
387  * 	/xxx		Full path to file
388  * 	xxx		Shorthand for /dev/dsk/xxx
389  */
390 static nvlist_t *
391 make_leaf_vdev(const char *arg, uint64_t is_log)
392 {
393 	char path[MAXPATHLEN];
394 	struct stat64 statbuf;
395 	nvlist_t *vdev = NULL;
396 	char *type = NULL;
397 	boolean_t wholedisk = B_FALSE;
398 
399 	/*
400 	 * Determine what type of vdev this is, and put the full path into
401 	 * 'path'.  We detect whether this is a device of file afterwards by
402 	 * checking the st_mode of the file.
403 	 */
404 	if (arg[0] == '/') {
405 		/*
406 		 * Complete device or file path.  Exact type is determined by
407 		 * examining the file descriptor afterwards.
408 		 */
409 		wholedisk = is_whole_disk(arg);
410 		if (!wholedisk && (stat64(arg, &statbuf) != 0)) {
411 			(void) fprintf(stderr,
412 			    gettext("cannot open '%s': %s\n"),
413 			    arg, strerror(errno));
414 			return (NULL);
415 		}
416 
417 		(void) strlcpy(path, arg, sizeof (path));
418 	} else {
419 		/*
420 		 * This may be a short path for a device, or it could be total
421 		 * gibberish.  Check to see if it's a known device in
422 		 * /dev/dsk/.  As part of this check, see if we've been given a
423 		 * an entire disk (minus the slice number).
424 		 */
425 		(void) snprintf(path, sizeof (path), "%s/%s", ZFS_DISK_ROOT,
426 		    arg);
427 		wholedisk = is_whole_disk(path);
428 		if (!wholedisk && (stat64(path, &statbuf) != 0)) {
429 			/*
430 			 * If we got ENOENT, then the user gave us
431 			 * gibberish, so try to direct them with a
432 			 * reasonable error message.  Otherwise,
433 			 * regurgitate strerror() since it's the best we
434 			 * can do.
435 			 */
436 			if (errno == ENOENT) {
437 				(void) fprintf(stderr,
438 				    gettext("cannot open '%s': no such "
439 				    "device in %s\n"), arg, ZFS_DISK_ROOT);
440 				(void) fprintf(stderr,
441 				    gettext("must be a full path or "
442 				    "shorthand device name\n"));
443 				return (NULL);
444 			} else {
445 				(void) fprintf(stderr,
446 				    gettext("cannot open '%s': %s\n"),
447 				    path, strerror(errno));
448 				return (NULL);
449 			}
450 		}
451 	}
452 
453 	/*
454 	 * Determine whether this is a device or a file.
455 	 */
456 	if (wholedisk || S_ISBLK(statbuf.st_mode)) {
457 		type = VDEV_TYPE_DISK;
458 	} else if (S_ISREG(statbuf.st_mode)) {
459 		type = VDEV_TYPE_FILE;
460 	} else {
461 		(void) fprintf(stderr, gettext("cannot use '%s': must be a "
462 		    "block device or regular file\n"), path);
463 		return (NULL);
464 	}
465 
466 	/*
467 	 * Finally, we have the complete device or file, and we know that it is
468 	 * acceptable to use.  Construct the nvlist to describe this vdev.  All
469 	 * vdevs have a 'path' element, and devices also have a 'devid' element.
470 	 */
471 	verify(nvlist_alloc(&vdev, NV_UNIQUE_NAME, 0) == 0);
472 	verify(nvlist_add_string(vdev, ZPOOL_CONFIG_PATH, path) == 0);
473 	verify(nvlist_add_string(vdev, ZPOOL_CONFIG_TYPE, type) == 0);
474 	verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_IS_LOG, is_log) == 0);
475 	if (is_log)
476 		verify(nvlist_add_string(vdev, ZPOOL_CONFIG_ALLOCATION_BIAS,
477 		    VDEV_ALLOC_BIAS_LOG) == 0);
478 	if (strcmp(type, VDEV_TYPE_DISK) == 0)
479 		verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK,
480 		    (uint64_t)wholedisk) == 0);
481 
482 	/*
483 	 * For a whole disk, defer getting its devid until after labeling it.
484 	 */
485 	if (S_ISBLK(statbuf.st_mode) && !wholedisk) {
486 		/*
487 		 * Get the devid for the device.
488 		 */
489 		int fd;
490 		ddi_devid_t devid;
491 		char *minor = NULL, *devid_str = NULL;
492 
493 		if ((fd = open(path, O_RDONLY)) < 0) {
494 			(void) fprintf(stderr, gettext("cannot open '%s': "
495 			    "%s\n"), path, strerror(errno));
496 			nvlist_free(vdev);
497 			return (NULL);
498 		}
499 
500 		if (devid_get(fd, &devid) == 0) {
501 			if (devid_get_minor_name(fd, &minor) == 0 &&
502 			    (devid_str = devid_str_encode(devid, minor)) !=
503 			    NULL) {
504 				verify(nvlist_add_string(vdev,
505 				    ZPOOL_CONFIG_DEVID, devid_str) == 0);
506 			}
507 			if (devid_str != NULL)
508 				devid_str_free(devid_str);
509 			if (minor != NULL)
510 				devid_str_free(minor);
511 			devid_free(devid);
512 		}
513 
514 		(void) close(fd);
515 	}
516 
517 	return (vdev);
518 }
519 
520 /*
521  * Go through and verify the replication level of the pool is consistent.
522  * Performs the following checks:
523  *
524  * 	For the new spec, verifies that devices in mirrors and raidz are the
525  * 	same size.
526  *
527  * 	If the current configuration already has inconsistent replication
528  * 	levels, ignore any other potential problems in the new spec.
529  *
530  * 	Otherwise, make sure that the current spec (if there is one) and the new
531  * 	spec have consistent replication levels.
532  *
533  *	If there is no current spec (create), make sure new spec has at least
534  *	one general purpose vdev.
535  */
/*
 * Summary of one top-level vdev's redundancy, as computed by
 * get_replication().
 */
typedef struct replication_level {
	/* vdev type string taken from the top-level vdev's config */
	char *zprl_type;
	/* number of children counted; 1 for a leaf (file or disk) vdev */
	uint64_t zprl_children;
	/* parity read from the config for raidz vdevs, 0 otherwise */
	uint64_t zprl_parity;
} replication_level_t;
541 
/*
 * Largest size difference, in bytes (16MB), that get_replication() tolerates
 * between successive children of a mirror or raidz vdev before reporting
 * that the vdev contains devices of different sizes.
 */
#define	ZPOOL_FUZZ	(16 * 1024 * 1024)
543 
544 static boolean_t
545 is_raidz_mirror(replication_level_t *a, replication_level_t *b,
546     replication_level_t **raidz, replication_level_t **mirror)
547 {
548 	if (strcmp(a->zprl_type, "raidz") == 0 &&
549 	    strcmp(b->zprl_type, "mirror") == 0) {
550 		*raidz = a;
551 		*mirror = b;
552 		return (B_TRUE);
553 	}
554 	return (B_FALSE);
555 }
556 
557 /*
558  * Given a list of toplevel vdevs, return the current replication level.  If
559  * the config is inconsistent, then NULL is returned.  If 'fatal' is set, then
560  * an error message will be displayed for each self-inconsistent vdev.
561  */
562 static replication_level_t *
563 get_replication(nvlist_t *nvroot, boolean_t fatal)
564 {
565 	nvlist_t **top;
566 	uint_t t, toplevels;
567 	nvlist_t **child;
568 	uint_t c, children;
569 	nvlist_t *nv;
570 	char *type;
571 	replication_level_t lastrep = {0};
572 	replication_level_t rep;
573 	replication_level_t *ret;
574 	replication_level_t *raidz, *mirror;
575 	boolean_t dontreport;
576 
577 	ret = safe_malloc(sizeof (replication_level_t));
578 
579 	verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
580 	    &top, &toplevels) == 0);
581 
582 	for (t = 0; t < toplevels; t++) {
583 		uint64_t is_log = B_FALSE;
584 
585 		nv = top[t];
586 
587 		/*
588 		 * For separate logs we ignore the top level vdev replication
589 		 * constraints.
590 		 */
591 		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &is_log);
592 		if (is_log)
593 			continue;
594 
595 		verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE,
596 		    &type) == 0);
597 		if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
598 		    &child, &children) != 0) {
599 			/*
600 			 * This is a 'file' or 'disk' vdev.
601 			 */
602 			rep.zprl_type = type;
603 			rep.zprl_children = 1;
604 			rep.zprl_parity = 0;
605 		} else {
606 			uint64_t vdev_size;
607 
608 			/*
609 			 * This is a mirror or RAID-Z vdev.  Go through and make
610 			 * sure the contents are all the same (files vs. disks),
611 			 * keeping track of the number of elements in the
612 			 * process.
613 			 *
614 			 * We also check that the size of each vdev (if it can
615 			 * be determined) is the same.
616 			 */
617 			rep.zprl_type = type;
618 			rep.zprl_children = 0;
619 
620 			if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) {
621 				verify(nvlist_lookup_uint64(nv,
622 				    ZPOOL_CONFIG_NPARITY,
623 				    &rep.zprl_parity) == 0);
624 				assert(rep.zprl_parity != 0);
625 			} else {
626 				rep.zprl_parity = 0;
627 			}
628 
629 			/*
630 			 * The 'dontreport' variable indicates that we've
631 			 * already reported an error for this spec, so don't
632 			 * bother doing it again.
633 			 */
634 			type = NULL;
635 			dontreport = 0;
636 			vdev_size = -1ULL;
637 			for (c = 0; c < children; c++) {
638 				nvlist_t *cnv = child[c];
639 				char *path;
640 				struct stat64 statbuf;
641 				uint64_t size = -1ULL;
642 				char *childtype;
643 				int fd, err;
644 
645 				rep.zprl_children++;
646 
647 				verify(nvlist_lookup_string(cnv,
648 				    ZPOOL_CONFIG_TYPE, &childtype) == 0);
649 
650 				/*
651 				 * If this is a replacing or spare vdev, then
652 				 * get the real first child of the vdev: do this
653 				 * in a loop because replacing and spare vdevs
654 				 * can be nested.
655 				 */
656 				while (strcmp(childtype,
657 				    VDEV_TYPE_REPLACING) == 0 ||
658 				    strcmp(childtype, VDEV_TYPE_SPARE) == 0) {
659 					nvlist_t **rchild;
660 					uint_t rchildren;
661 
662 					verify(nvlist_lookup_nvlist_array(cnv,
663 					    ZPOOL_CONFIG_CHILDREN, &rchild,
664 					    &rchildren) == 0);
665 					assert(rchildren == 2);
666 					cnv = rchild[0];
667 
668 					verify(nvlist_lookup_string(cnv,
669 					    ZPOOL_CONFIG_TYPE,
670 					    &childtype) == 0);
671 				}
672 
673 				verify(nvlist_lookup_string(cnv,
674 				    ZPOOL_CONFIG_PATH, &path) == 0);
675 
676 				/*
677 				 * If we have a raidz/mirror that combines disks
678 				 * with files, report it as an error.
679 				 */
680 				if (!dontreport && type != NULL &&
681 				    strcmp(type, childtype) != 0) {
682 					if (ret != NULL)
683 						free(ret);
684 					ret = NULL;
685 					if (fatal)
686 						vdev_error(gettext(
687 						    "mismatched replication "
688 						    "level: %s contains both "
689 						    "files and devices\n"),
690 						    rep.zprl_type);
691 					else
692 						return (NULL);
693 					dontreport = B_TRUE;
694 				}
695 
696 				/*
697 				 * According to stat(2), the value of 'st_size'
698 				 * is undefined for block devices and character
699 				 * devices.  But there is no effective way to
700 				 * determine the real size in userland.
701 				 *
702 				 * Instead, we'll take advantage of an
703 				 * implementation detail of spec_size().  If the
704 				 * device is currently open, then we (should)
705 				 * return a valid size.
706 				 *
707 				 * If we still don't get a valid size (indicated
708 				 * by a size of 0 or MAXOFFSET_T), then ignore
709 				 * this device altogether.
710 				 */
711 				if ((fd = open(path, O_RDONLY)) >= 0) {
712 					err = fstat64(fd, &statbuf);
713 					(void) close(fd);
714 				} else {
715 					err = stat64(path, &statbuf);
716 				}
717 
718 				if (err != 0 ||
719 				    statbuf.st_size == 0 ||
720 				    statbuf.st_size == MAXOFFSET_T)
721 					continue;
722 
723 				size = statbuf.st_size;
724 
725 				/*
726 				 * Also make sure that devices and
727 				 * slices have a consistent size.  If
728 				 * they differ by a significant amount
729 				 * (~16MB) then report an error.
730 				 */
731 				if (!dontreport &&
732 				    (vdev_size != -1ULL &&
733 				    (labs(size - vdev_size) >
734 				    ZPOOL_FUZZ))) {
735 					if (ret != NULL)
736 						free(ret);
737 					ret = NULL;
738 					if (fatal)
739 						vdev_error(gettext(
740 						    "%s contains devices of "
741 						    "different sizes\n"),
742 						    rep.zprl_type);
743 					else
744 						return (NULL);
745 					dontreport = B_TRUE;
746 				}
747 
748 				type = childtype;
749 				vdev_size = size;
750 			}
751 		}
752 
753 		/*
754 		 * At this point, we have the replication of the last toplevel
755 		 * vdev in 'rep'.  Compare it to 'lastrep' to see if it is
756 		 * different.
757 		 */
758 		if (lastrep.zprl_type != NULL) {
759 			if (is_raidz_mirror(&lastrep, &rep, &raidz, &mirror) ||
760 			    is_raidz_mirror(&rep, &lastrep, &raidz, &mirror)) {
761 				/*
762 				 * Accepted raidz and mirror when they can
763 				 * handle the same number of disk failures.
764 				 */
765 				if (raidz->zprl_parity !=
766 				    mirror->zprl_children - 1) {
767 					if (ret != NULL)
768 						free(ret);
769 					ret = NULL;
770 					if (fatal)
771 						vdev_error(gettext(
772 						    "mismatched replication "
773 						    "level: "
774 						    "%s and %s vdevs with "
775 						    "different redundancy, "
776 						    "%llu vs. %llu (%llu-way) "
777 						    "are present\n"),
778 						    raidz->zprl_type,
779 						    mirror->zprl_type,
780 						    raidz->zprl_parity,
781 						    mirror->zprl_children - 1,
782 						    mirror->zprl_children);
783 					else
784 						return (NULL);
785 				}
786 			} else if (strcmp(lastrep.zprl_type, rep.zprl_type) !=
787 			    0) {
788 				if (ret != NULL)
789 					free(ret);
790 				ret = NULL;
791 				if (fatal)
792 					vdev_error(gettext(
793 					    "mismatched replication level: "
794 					    "both %s and %s vdevs are "
795 					    "present\n"),
796 					    lastrep.zprl_type, rep.zprl_type);
797 				else
798 					return (NULL);
799 			} else if (lastrep.zprl_parity != rep.zprl_parity) {
800 				if (ret)
801 					free(ret);
802 				ret = NULL;
803 				if (fatal)
804 					vdev_error(gettext(
805 					    "mismatched replication level: "
806 					    "both %llu and %llu device parity "
807 					    "%s vdevs are present\n"),
808 					    lastrep.zprl_parity,
809 					    rep.zprl_parity,
810 					    rep.zprl_type);
811 				else
812 					return (NULL);
813 			} else if (lastrep.zprl_children != rep.zprl_children) {
814 				if (ret)
815 					free(ret);
816 				ret = NULL;
817 				if (fatal)
818 					vdev_error(gettext(
819 					    "mismatched replication level: "
820 					    "both %llu-way and %llu-way %s "
821 					    "vdevs are present\n"),
822 					    lastrep.zprl_children,
823 					    rep.zprl_children,
824 					    rep.zprl_type);
825 				else
826 					return (NULL);
827 			}
828 		}
829 		lastrep = rep;
830 	}
831 
832 	if (ret != NULL)
833 		*ret = rep;
834 
835 	return (ret);
836 }
837 
838 /*
839  * Check the replication level of the vdev spec against the current pool.  Calls
840  * get_replication() to make sure the new spec is self-consistent.  If the pool
841  * has a consistent replication level, then we ignore any errors.  Otherwise,
842  * report any difference between the two.
843  */
844 static int
845 check_replication(nvlist_t *config, nvlist_t *newroot)
846 {
847 	nvlist_t **child;
848 	uint_t	children;
849 	replication_level_t *current = NULL, *new;
850 	replication_level_t *raidz, *mirror;
851 	int ret;
852 
853 	/*
854 	 * If we have a current pool configuration, check to see if it's
855 	 * self-consistent.  If not, simply return success.
856 	 */
857 	if (config != NULL) {
858 		nvlist_t *nvroot;
859 
860 		verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
861 		    &nvroot) == 0);
862 		if ((current = get_replication(nvroot, B_FALSE)) == NULL)
863 			return (0);
864 	}
865 	/*
866 	 * for spares there may be no children, and therefore no
867 	 * replication level to check
868 	 */
869 	if ((nvlist_lookup_nvlist_array(newroot, ZPOOL_CONFIG_CHILDREN,
870 	    &child, &children) != 0) || (children == 0)) {
871 		free(current);
872 		return (0);
873 	}
874 
875 	/*
876 	 * If all we have is logs then there's no replication level to check.
877 	 */
878 	if (num_logs(newroot) == children) {
879 		free(current);
880 		return (0);
881 	}
882 
883 	/*
884 	 * Get the replication level of the new vdev spec, reporting any
885 	 * inconsistencies found.
886 	 */
887 	if ((new = get_replication(newroot, B_TRUE)) == NULL) {
888 		free(current);
889 		return (-1);
890 	}
891 
892 	/*
893 	 * Check to see if the new vdev spec matches the replication level of
894 	 * the current pool.
895 	 */
896 	ret = 0;
897 	if (current != NULL) {
898 		if (is_raidz_mirror(current, new, &raidz, &mirror) ||
899 		    is_raidz_mirror(new, current, &raidz, &mirror)) {
900 			if (raidz->zprl_parity != mirror->zprl_children - 1) {
901 				vdev_error(gettext(
902 				    "mismatched replication level: pool and "
903 				    "new vdev with different redundancy, %s "
904 				    "and %s vdevs, %llu vs. %llu (%llu-way)\n"),
905 				    raidz->zprl_type,
906 				    mirror->zprl_type,
907 				    raidz->zprl_parity,
908 				    mirror->zprl_children - 1,
909 				    mirror->zprl_children);
910 				ret = -1;
911 			}
912 		} else if (strcmp(current->zprl_type, new->zprl_type) != 0) {
913 			vdev_error(gettext(
914 			    "mismatched replication level: pool uses %s "
915 			    "and new vdev is %s\n"),
916 			    current->zprl_type, new->zprl_type);
917 			ret = -1;
918 		} else if (current->zprl_parity != new->zprl_parity) {
919 			vdev_error(gettext(
920 			    "mismatched replication level: pool uses %llu "
921 			    "device parity and new vdev uses %llu\n"),
922 			    current->zprl_parity, new->zprl_parity);
923 			ret = -1;
924 		} else if (current->zprl_children != new->zprl_children) {
925 			vdev_error(gettext(
926 			    "mismatched replication level: pool uses %llu-way "
927 			    "%s and new vdev uses %llu-way %s\n"),
928 			    current->zprl_children, current->zprl_type,
929 			    new->zprl_children, new->zprl_type);
930 			ret = -1;
931 		}
932 	}
933 
934 	free(new);
935 	if (current != NULL)
936 		free(current);
937 
938 	return (ret);
939 }
940 
941 /*
942  * Go through and find any whole disks in the vdev specification, labelling them
943  * as appropriate.  When constructing the vdev spec, we were unable to open this
944  * device in order to provide a devid.  Now that we have labelled the disk and
945  * know the pool slice is valid, we can construct the devid now.
946  *
947  * If the disk was already labeled with an EFI label, we will have gotten the
948  * devid already (because we were able to open the whole disk).  Otherwise, we
949  * need to get the devid after we label the disk.
950  */
951 static int
952 make_disks(zpool_handle_t *zhp, nvlist_t *nv, zpool_boot_label_t boot_type,
953     uint64_t boot_size)
954 {
955 	nvlist_t **child;
956 	uint_t c, children;
957 	char *type, *path, *diskname;
958 	char buf[MAXPATHLEN];
959 	uint64_t wholedisk;
960 	int fd;
961 	int ret;
962 	int slice;
963 	ddi_devid_t devid;
964 	char *minor = NULL, *devid_str = NULL;
965 
966 	verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0);
967 
968 	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
969 	    &child, &children) != 0) {
970 
971 		if (strcmp(type, VDEV_TYPE_DISK) != 0)
972 			return (0);
973 
974 		/*
975 		 * We have a disk device.  Get the path to the device
976 		 * and see if it's a whole disk by appending the backup
977 		 * slice and stat()ing the device.
978 		 */
979 		verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0);
980 
981 		diskname = strrchr(path, '/');
982 		assert(diskname != NULL);
983 		diskname++;
984 
985 		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
986 		    &wholedisk) != 0 || !wholedisk) {
987 			/*
988 			 * This is not whole disk, return error if
989 			 * boot partition creation was requested
990 			 */
991 			if (boot_type == ZPOOL_CREATE_BOOT_LABEL) {
992 				(void) fprintf(stderr,
993 				    gettext("creating boot partition is only "
994 				    "supported on whole disk vdevs: %s\n"),
995 				    diskname);
996 				return (-1);
997 			}
998 			return (0);
999 		}
1000 
1001 		ret = zpool_label_disk(g_zfs, zhp, diskname, boot_type,
1002 		    boot_size, &slice);
1003 		if (ret == -1)
1004 			return (ret);
1005 
1006 		/*
1007 		 * Fill in the devid, now that we've labeled the disk.
1008 		 */
1009 		(void) snprintf(buf, sizeof (buf), "%ss%d", path, slice);
1010 		if ((fd = open(buf, O_RDONLY)) < 0) {
1011 			(void) fprintf(stderr,
1012 			    gettext("cannot open '%s': %s\n"),
1013 			    buf, strerror(errno));
1014 			return (-1);
1015 		}
1016 
1017 		if (devid_get(fd, &devid) == 0) {
1018 			if (devid_get_minor_name(fd, &minor) == 0 &&
1019 			    (devid_str = devid_str_encode(devid, minor)) !=
1020 			    NULL) {
1021 				verify(nvlist_add_string(nv,
1022 				    ZPOOL_CONFIG_DEVID, devid_str) == 0);
1023 			}
1024 			if (devid_str != NULL)
1025 				devid_str_free(devid_str);
1026 			if (minor != NULL)
1027 				devid_str_free(minor);
1028 			devid_free(devid);
1029 		}
1030 
1031 		/*
1032 		 * Update the path to refer to the pool slice.  The presence of
1033 		 * the 'whole_disk' field indicates to the CLI that we should
1034 		 * chop off the slice number when displaying the device in
1035 		 * future output.
1036 		 */
1037 		verify(nvlist_add_string(nv, ZPOOL_CONFIG_PATH, buf) == 0);
1038 
1039 		(void) close(fd);
1040 
1041 		return (0);
1042 	}
1043 
1044 	/* illumos kernel does not support booting from multi-vdev pools. */
1045 	if ((boot_type == ZPOOL_CREATE_BOOT_LABEL)) {
1046 		if ((strcmp(type, VDEV_TYPE_ROOT) == 0) && children > 1) {
1047 			(void) fprintf(stderr, gettext("boot pool "
1048 			    "can not have more than one vdev\n"));
1049 			return (-1);
1050 		}
1051 	}
1052 
1053 	for (c = 0; c < children; c++) {
1054 		ret = make_disks(zhp, child[c], boot_type, boot_size);
1055 		if (ret != 0)
1056 			return (ret);
1057 	}
1058 
1059 	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES,
1060 	    &child, &children) == 0)
1061 		for (c = 0; c < children; c++) {
1062 			ret = make_disks(zhp, child[c], boot_type, boot_size);
1063 			if (ret != 0)
1064 				return (ret);
1065 		}
1066 
1067 	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE,
1068 	    &child, &children) == 0)
1069 		for (c = 0; c < children; c++) {
1070 			ret = make_disks(zhp, child[c], boot_type, boot_size);
1071 			if (ret != 0)
1072 				return (ret);
1073 		}
1074 
1075 	return (0);
1076 }
1077 
1078 /*
1079  * Determine if the given path is a hot spare within the given configuration.
1080  */
1081 static boolean_t
1082 is_spare(nvlist_t *config, const char *path)
1083 {
1084 	int fd;
1085 	pool_state_t state;
1086 	char *name = NULL;
1087 	nvlist_t *label;
1088 	uint64_t guid, spareguid;
1089 	nvlist_t *nvroot;
1090 	nvlist_t **spares;
1091 	uint_t i, nspares;
1092 	boolean_t inuse;
1093 
1094 	if ((fd = open(path, O_RDONLY)) < 0)
1095 		return (B_FALSE);
1096 
1097 	if (zpool_in_use(g_zfs, fd, &state, &name, &inuse) != 0 ||
1098 	    !inuse ||
1099 	    state != POOL_STATE_SPARE ||
1100 	    zpool_read_label(fd, &label) != 0) {
1101 		free(name);
1102 		(void) close(fd);
1103 		return (B_FALSE);
1104 	}
1105 	free(name);
1106 	(void) close(fd);
1107 
1108 	verify(nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) == 0);
1109 	nvlist_free(label);
1110 
1111 	verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
1112 	    &nvroot) == 0);
1113 	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
1114 	    &spares, &nspares) == 0) {
1115 		for (i = 0; i < nspares; i++) {
1116 			verify(nvlist_lookup_uint64(spares[i],
1117 			    ZPOOL_CONFIG_GUID, &spareguid) == 0);
1118 			if (spareguid == guid)
1119 				return (B_TRUE);
1120 		}
1121 	}
1122 
1123 	return (B_FALSE);
1124 }
1125 
1126 /*
1127  * Go through and find any devices that are in use.  We rely on libdiskmgt for
1128  * the majority of this task.
1129  */
1130 static boolean_t
1131 is_device_in_use(nvlist_t *config, nvlist_t *nv, boolean_t force,
1132     boolean_t replacing, boolean_t isspare)
1133 {
1134 	nvlist_t **child;
1135 	uint_t c, children;
1136 	char *type, *path;
1137 	int ret = 0;
1138 	char buf[MAXPATHLEN];
1139 	uint64_t wholedisk;
1140 	boolean_t anyinuse = B_FALSE;
1141 
1142 	verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0);
1143 
1144 	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
1145 	    &child, &children) != 0) {
1146 
1147 		verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0);
1148 
1149 		/*
1150 		 * As a generic check, we look to see if this is a replace of a
1151 		 * hot spare within the same pool.  If so, we allow it
1152 		 * regardless of what libdiskmgt or zpool_in_use() says.
1153 		 */
1154 		if (replacing) {
1155 			if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
1156 			    &wholedisk) == 0 && wholedisk)
1157 				(void) snprintf(buf, sizeof (buf), "%ss0",
1158 				    path);
1159 			else
1160 				(void) strlcpy(buf, path, sizeof (buf));
1161 
1162 			if (is_spare(config, buf))
1163 				return (B_FALSE);
1164 		}
1165 
1166 		if (strcmp(type, VDEV_TYPE_DISK) == 0)
1167 			ret = check_device(path, force, isspare);
1168 		else if (strcmp(type, VDEV_TYPE_FILE) == 0)
1169 			ret = check_file(path, force, isspare);
1170 
1171 		return (ret != 0);
1172 	}
1173 
1174 	for (c = 0; c < children; c++)
1175 		if (is_device_in_use(config, child[c], force, replacing,
1176 		    B_FALSE))
1177 			anyinuse = B_TRUE;
1178 
1179 	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES,
1180 	    &child, &children) == 0)
1181 		for (c = 0; c < children; c++)
1182 			if (is_device_in_use(config, child[c], force, replacing,
1183 			    B_TRUE))
1184 				anyinuse = B_TRUE;
1185 
1186 	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE,
1187 	    &child, &children) == 0)
1188 		for (c = 0; c < children; c++)
1189 			if (is_device_in_use(config, child[c], force, replacing,
1190 			    B_FALSE))
1191 				anyinuse = B_TRUE;
1192 
1193 	return (anyinuse);
1194 }
1195 
1196 static const char *
1197 is_grouping(const char *type, int *mindev, int *maxdev)
1198 {
1199 	if (strncmp(type, "raidz", 5) == 0) {
1200 		const char *p = type + 5;
1201 		char *end;
1202 		long nparity;
1203 
1204 		if (*p == '\0') {
1205 			nparity = 1;
1206 		} else if (*p == '0') {
1207 			return (NULL); /* no zero prefixes allowed */
1208 		} else {
1209 			errno = 0;
1210 			nparity = strtol(p, &end, 10);
1211 			if (errno != 0 || nparity < 1 || nparity >= 255 ||
1212 			    *end != '\0')
1213 				return (NULL);
1214 		}
1215 
1216 		if (mindev != NULL)
1217 			*mindev = nparity + 1;
1218 		if (maxdev != NULL)
1219 			*maxdev = 255;
1220 		return (VDEV_TYPE_RAIDZ);
1221 	}
1222 
1223 	if (maxdev != NULL)
1224 		*maxdev = INT_MAX;
1225 
1226 	if (strcmp(type, "mirror") == 0) {
1227 		if (mindev != NULL)
1228 			*mindev = 2;
1229 		return (VDEV_TYPE_MIRROR);
1230 	}
1231 
1232 	if (strcmp(type, "spare") == 0) {
1233 		if (mindev != NULL)
1234 			*mindev = 1;
1235 		return (VDEV_TYPE_SPARE);
1236 	}
1237 
1238 	if (strcmp(type, "log") == 0) {
1239 		if (mindev != NULL)
1240 			*mindev = 1;
1241 		return (VDEV_TYPE_LOG);
1242 	}
1243 
1244 	if (strcmp(type, VDEV_ALLOC_BIAS_SPECIAL) == 0 ||
1245 	    strcmp(type, VDEV_ALLOC_BIAS_DEDUP) == 0) {
1246 		if (mindev != NULL)
1247 			*mindev = 1;
1248 		return (type);
1249 	}
1250 
1251 	if (strcmp(type, "cache") == 0) {
1252 		if (mindev != NULL)
1253 			*mindev = 1;
1254 		return (VDEV_TYPE_L2CACHE);
1255 	}
1256 
1257 	return (NULL);
1258 }
1259 
1260 /*
1261  * Construct a syntactically valid vdev specification,
1262  * and ensure that all devices and files exist and can be opened.
1263  * Note: we don't bother freeing anything in the error paths
1264  * because the program is just going to exit anyway.
1265  */
1266 nvlist_t *
1267 construct_spec(int argc, char **argv)
1268 {
1269 	nvlist_t *nvroot, *nv, **top, **spares, **l2cache;
1270 	int t, toplevels, mindev, maxdev, nspares, nlogs, nl2cache;
1271 	const char *type;
1272 	uint64_t is_log, is_special, is_dedup;
1273 	boolean_t seen_logs;
1274 
1275 	top = NULL;
1276 	toplevels = 0;
1277 	spares = NULL;
1278 	l2cache = NULL;
1279 	nspares = 0;
1280 	nlogs = 0;
1281 	nl2cache = 0;
1282 	is_log = is_special = is_dedup = B_FALSE;
1283 	seen_logs = B_FALSE;
1284 
1285 	while (argc > 0) {
1286 		nv = NULL;
1287 
1288 		/*
1289 		 * If it's a mirror or raidz, the subsequent arguments are
1290 		 * its leaves -- until we encounter the next mirror or raidz.
1291 		 */
1292 		if ((type = is_grouping(argv[0], &mindev, &maxdev)) != NULL) {
1293 			nvlist_t **child = NULL;
1294 			int c, children = 0;
1295 
1296 			if (strcmp(type, VDEV_TYPE_SPARE) == 0) {
1297 				if (spares != NULL) {
1298 					(void) fprintf(stderr,
1299 					    gettext("invalid vdev "
1300 					    "specification: 'spare' can be "
1301 					    "specified only once\n"));
1302 					return (NULL);
1303 				}
1304 				is_log = is_special = is_dedup = B_FALSE;
1305 			}
1306 
1307 			if (strcmp(type, VDEV_TYPE_LOG) == 0) {
1308 				if (seen_logs) {
1309 					(void) fprintf(stderr,
1310 					    gettext("invalid vdev "
1311 					    "specification: 'log' can be "
1312 					    "specified only once\n"));
1313 					return (NULL);
1314 				}
1315 				seen_logs = B_TRUE;
1316 				is_log = B_TRUE;
1317 				is_special = B_FALSE;
1318 				is_dedup = B_FALSE;
1319 				argc--;
1320 				argv++;
1321 				/*
1322 				 * A log is not a real grouping device.
1323 				 * We just set is_log and continue.
1324 				 */
1325 				continue;
1326 			}
1327 
1328 			if (strcmp(type, VDEV_ALLOC_BIAS_SPECIAL) == 0) {
1329 				is_special = B_TRUE;
1330 				is_log = B_FALSE;
1331 				is_dedup = B_FALSE;
1332 				argc--;
1333 				argv++;
1334 				continue;
1335 			}
1336 
1337 			if (strcmp(type, VDEV_ALLOC_BIAS_DEDUP) == 0) {
1338 				is_dedup = B_TRUE;
1339 				is_log = B_FALSE;
1340 				is_special = B_FALSE;
1341 				argc--;
1342 				argv++;
1343 				continue;
1344 			}
1345 
1346 			if (strcmp(type, VDEV_TYPE_L2CACHE) == 0) {
1347 				if (l2cache != NULL) {
1348 					(void) fprintf(stderr,
1349 					    gettext("invalid vdev "
1350 					    "specification: 'cache' can be "
1351 					    "specified only once\n"));
1352 					return (NULL);
1353 				}
1354 				is_log = is_special = is_dedup = B_FALSE;
1355 			}
1356 
1357 			if (is_log || is_special || is_dedup) {
1358 				if (strcmp(type, VDEV_TYPE_MIRROR) != 0) {
1359 					(void) fprintf(stderr,
1360 					    gettext("invalid vdev "
1361 					    "specification: unsupported '%s' "
1362 					    "device: %s\n"), is_log ? "log" :
1363 					    "special", type);
1364 					return (NULL);
1365 				}
1366 				nlogs++;
1367 			}
1368 
1369 			for (c = 1; c < argc; c++) {
1370 				if (is_grouping(argv[c], NULL, NULL) != NULL)
1371 					break;
1372 				children++;
1373 				child = realloc(child,
1374 				    children * sizeof (nvlist_t *));
1375 				if (child == NULL)
1376 					zpool_no_memory();
1377 				if ((nv = make_leaf_vdev(argv[c], B_FALSE))
1378 				    == NULL)
1379 					return (NULL);
1380 				child[children - 1] = nv;
1381 			}
1382 
1383 			if (children < mindev) {
1384 				(void) fprintf(stderr, gettext("invalid vdev "
1385 				    "specification: %s requires at least %d "
1386 				    "devices\n"), argv[0], mindev);
1387 				return (NULL);
1388 			}
1389 
1390 			if (children > maxdev) {
1391 				(void) fprintf(stderr, gettext("invalid vdev "
1392 				    "specification: %s supports no more than "
1393 				    "%d devices\n"), argv[0], maxdev);
1394 				return (NULL);
1395 			}
1396 
1397 			argc -= c;
1398 			argv += c;
1399 
1400 			if (strcmp(type, VDEV_TYPE_SPARE) == 0) {
1401 				spares = child;
1402 				nspares = children;
1403 				continue;
1404 			} else if (strcmp(type, VDEV_TYPE_L2CACHE) == 0) {
1405 				l2cache = child;
1406 				nl2cache = children;
1407 				continue;
1408 			} else {
1409 				/* create a top-level vdev with children */
1410 				verify(nvlist_alloc(&nv, NV_UNIQUE_NAME,
1411 				    0) == 0);
1412 				verify(nvlist_add_string(nv, ZPOOL_CONFIG_TYPE,
1413 				    type) == 0);
1414 				verify(nvlist_add_uint64(nv,
1415 				    ZPOOL_CONFIG_IS_LOG, is_log) == 0);
1416 				if (is_log)
1417 					verify(nvlist_add_string(nv,
1418 					    ZPOOL_CONFIG_ALLOCATION_BIAS,
1419 					    VDEV_ALLOC_BIAS_LOG) == 0);
1420 				if (is_special) {
1421 					verify(nvlist_add_string(nv,
1422 					    ZPOOL_CONFIG_ALLOCATION_BIAS,
1423 					    VDEV_ALLOC_BIAS_SPECIAL) == 0);
1424 				}
1425 				if (is_dedup) {
1426 					verify(nvlist_add_string(nv,
1427 					    ZPOOL_CONFIG_ALLOCATION_BIAS,
1428 					    VDEV_ALLOC_BIAS_DEDUP) == 0);
1429 				}
1430 				if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) {
1431 					verify(nvlist_add_uint64(nv,
1432 					    ZPOOL_CONFIG_NPARITY,
1433 					    mindev - 1) == 0);
1434 				}
1435 				verify(nvlist_add_nvlist_array(nv,
1436 				    ZPOOL_CONFIG_CHILDREN, child,
1437 				    children) == 0);
1438 
1439 				for (c = 0; c < children; c++)
1440 					nvlist_free(child[c]);
1441 				free(child);
1442 			}
1443 		} else {
1444 			/*
1445 			 * We have a device.  Pass off to make_leaf_vdev() to
1446 			 * construct the appropriate nvlist describing the vdev.
1447 			 */
1448 			if ((nv = make_leaf_vdev(argv[0], is_log)) == NULL)
1449 				return (NULL);
1450 			if (is_log)
1451 				nlogs++;
1452 			if (is_special) {
1453 				verify(nvlist_add_string(nv,
1454 				    ZPOOL_CONFIG_ALLOCATION_BIAS,
1455 				    VDEV_ALLOC_BIAS_SPECIAL) == 0);
1456 			}
1457 			if (is_dedup) {
1458 				verify(nvlist_add_string(nv,
1459 				    ZPOOL_CONFIG_ALLOCATION_BIAS,
1460 				    VDEV_ALLOC_BIAS_DEDUP) == 0);
1461 			}
1462 			argc--;
1463 			argv++;
1464 		}
1465 
1466 		toplevels++;
1467 		top = realloc(top, toplevels * sizeof (nvlist_t *));
1468 		if (top == NULL)
1469 			zpool_no_memory();
1470 		top[toplevels - 1] = nv;
1471 	}
1472 
1473 	if (toplevels == 0 && nspares == 0 && nl2cache == 0) {
1474 		(void) fprintf(stderr, gettext("invalid vdev "
1475 		    "specification: at least one toplevel vdev must be "
1476 		    "specified\n"));
1477 		return (NULL);
1478 	}
1479 
1480 	if (seen_logs && nlogs == 0) {
1481 		(void) fprintf(stderr, gettext("invalid vdev specification: "
1482 		    "log requires at least 1 device\n"));
1483 		return (NULL);
1484 	}
1485 
1486 	/*
1487 	 * Finally, create nvroot and add all top-level vdevs to it.
1488 	 */
1489 	verify(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) == 0);
1490 	verify(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,
1491 	    VDEV_TYPE_ROOT) == 0);
1492 	verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
1493 	    top, toplevels) == 0);
1494 	if (nspares != 0)
1495 		verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
1496 		    spares, nspares) == 0);
1497 	if (nl2cache != 0)
1498 		verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
1499 		    l2cache, nl2cache) == 0);
1500 
1501 	for (t = 0; t < toplevels; t++)
1502 		nvlist_free(top[t]);
1503 	for (t = 0; t < nspares; t++)
1504 		nvlist_free(spares[t]);
1505 	for (t = 0; t < nl2cache; t++)
1506 		nvlist_free(l2cache[t]);
1507 	if (spares)
1508 		free(spares);
1509 	if (l2cache)
1510 		free(l2cache);
1511 	free(top);
1512 
1513 	return (nvroot);
1514 }
1515 
1516 nvlist_t *
1517 split_mirror_vdev(zpool_handle_t *zhp, char *newname, nvlist_t *props,
1518     splitflags_t flags, int argc, char **argv)
1519 {
1520 	nvlist_t *newroot = NULL, **child;
1521 	uint_t c, children;
1522 	zpool_boot_label_t boot_type;
1523 
1524 	if (argc > 0) {
1525 		if ((newroot = construct_spec(argc, argv)) == NULL) {
1526 			(void) fprintf(stderr, gettext("Unable to build a "
1527 			    "pool from the specified devices\n"));
1528 			return (NULL);
1529 		}
1530 
1531 		if (zpool_is_bootable(zhp))
1532 			boot_type = ZPOOL_COPY_BOOT_LABEL;
1533 		else
1534 			boot_type = ZPOOL_NO_BOOT_LABEL;
1535 
1536 		if (!flags.dryrun &&
1537 		    make_disks(zhp, newroot, boot_type, 0) != 0) {
1538 			nvlist_free(newroot);
1539 			return (NULL);
1540 		}
1541 
1542 		/* avoid any tricks in the spec */
1543 		verify(nvlist_lookup_nvlist_array(newroot,
1544 		    ZPOOL_CONFIG_CHILDREN, &child, &children) == 0);
1545 		for (c = 0; c < children; c++) {
1546 			char *path;
1547 			const char *type;
1548 			int min, max;
1549 
1550 			verify(nvlist_lookup_string(child[c],
1551 			    ZPOOL_CONFIG_PATH, &path) == 0);
1552 			if ((type = is_grouping(path, &min, &max)) != NULL) {
1553 				(void) fprintf(stderr, gettext("Cannot use "
1554 				    "'%s' as a device for splitting\n"), type);
1555 				nvlist_free(newroot);
1556 				return (NULL);
1557 			}
1558 		}
1559 	}
1560 
1561 	if (zpool_vdev_split(zhp, newname, &newroot, props, flags) != 0) {
1562 		nvlist_free(newroot);
1563 		return (NULL);
1564 	}
1565 
1566 	return (newroot);
1567 }
1568 
1569 static int
1570 num_normal_vdevs(nvlist_t *nvroot)
1571 {
1572 	nvlist_t **top;
1573 	uint_t t, toplevels, normal = 0;
1574 
1575 	verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
1576 	    &top, &toplevels) == 0);
1577 
1578 	for (t = 0; t < toplevels; t++) {
1579 		uint64_t log = B_FALSE;
1580 
1581 		(void) nvlist_lookup_uint64(top[t], ZPOOL_CONFIG_IS_LOG, &log);
1582 		if (log)
1583 			continue;
1584 		if (nvlist_exists(top[t], ZPOOL_CONFIG_ALLOCATION_BIAS))
1585 			continue;
1586 
1587 		normal++;
1588 	}
1589 
1590 	return (normal);
1591 }
1592 
1593 /*
1594  * Get and validate the contents of the given vdev specification.  This ensures
1595  * that the nvlist returned is well-formed, that all the devices exist, and that
1596  * they are not currently in use by any other known consumer.  The 'poolconfig'
1597  * parameter is the current configuration of the pool when adding devices
1598  * existing pool, and is used to perform additional checks, such as changing the
1599  * replication level of the pool.  It can be 'NULL' to indicate that this is a
1600  * new pool.  The 'force' flag controls whether devices should be forcefully
1601  * added, even if they appear in use.
1602  */
1603 nvlist_t *
1604 make_root_vdev(zpool_handle_t *zhp, int force, int check_rep,
1605     boolean_t replacing, boolean_t dryrun, zpool_boot_label_t boot_type,
1606     uint64_t boot_size, int argc, char **argv)
1607 {
1608 	nvlist_t *newroot;
1609 	nvlist_t *poolconfig = NULL;
1610 	is_force = force;
1611 
1612 	/*
1613 	 * Construct the vdev specification.  If this is successful, we know
1614 	 * that we have a valid specification, and that all devices can be
1615 	 * opened.
1616 	 */
1617 	if ((newroot = construct_spec(argc, argv)) == NULL)
1618 		return (NULL);
1619 
1620 	if (zhp && ((poolconfig = zpool_get_config(zhp, NULL)) == NULL))
1621 		return (NULL);
1622 
1623 	/*
1624 	 * Validate each device to make sure that its not shared with another
1625 	 * subsystem.  We do this even if 'force' is set, because there are some
1626 	 * uses (such as a dedicated dump device) that even '-f' cannot
1627 	 * override.
1628 	 */
1629 	if (is_device_in_use(poolconfig, newroot, force, replacing, B_FALSE)) {
1630 		nvlist_free(newroot);
1631 		return (NULL);
1632 	}
1633 
1634 	/*
1635 	 * Check the replication level of the given vdevs and report any errors
1636 	 * found.  We include the existing pool spec, if any, as we need to
1637 	 * catch changes against the existing replication level.
1638 	 */
1639 	if (check_rep && check_replication(poolconfig, newroot) != 0) {
1640 		nvlist_free(newroot);
1641 		return (NULL);
1642 	}
1643 
1644 	/*
1645 	 * On pool create the new vdev spec must have one normal vdev.
1646 	 */
1647 	if (poolconfig == NULL && num_normal_vdevs(newroot) == 0) {
1648 		vdev_error(gettext("at least one general top-level vdev must "
1649 		    "be specified\n"));
1650 		nvlist_free(newroot);
1651 		return (NULL);
1652 	}
1653 
1654 	/*
1655 	 * Run through the vdev specification and label any whole disks found.
1656 	 */
1657 	if (!dryrun && make_disks(zhp, newroot, boot_type, boot_size) != 0) {
1658 		nvlist_free(newroot);
1659 		return (NULL);
1660 	}
1661 
1662 	return (newroot);
1663 }
1664