xref: /freebsd/sys/contrib/openzfs/cmd/zpool/zpool_vdev.c (revision 6ba2210ee039f2f12878c217bcf058e9c8b26b29)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24  * Copyright (c) 2013, 2018 by Delphix. All rights reserved.
25  * Copyright (c) 2016, 2017 Intel Corporation.
26  * Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>.
27  */
28 
29 /*
30  * Functions to convert between a list of vdevs and an nvlist representing the
31  * configuration.  Each entry in the list can be one of:
32  *
33  * 	Device vdevs
34  * 		disk=(path=..., devid=...)
35  * 		file=(path=...)
36  *
37  * 	Group vdevs
38  * 		raidz[1|2]=(...)
39  * 		mirror=(...)
40  *
41  * 	Hot spares
42  *
43  * While the underlying implementation supports it, group vdevs cannot contain
44  * other group vdevs.  All userland verification of devices is contained within
45  * this file.  If successful, the nvlist returned can be passed directly to the
46  * kernel; we've done as much verification as possible in userland.
47  *
48  * Hot spares are a special case, and passed down as an array of disk vdevs, at
49  * the same level as the root of the vdev tree.
50  *
51  * The only function exported by this file is 'make_root_vdev'.  The
52  * function performs several passes:
53  *
54  * 	1. Construct the vdev specification.  Performs syntax validation and
55  *         makes sure each device is valid.
56  * 	2. Check for devices in use.  Using libblkid to make sure that no
57  *         devices are also in use.  Some can be overridden using the 'force'
58  *         flag, others cannot.
59  * 	3. Check for replication errors if the 'force' flag is not specified.
60  *         validates that the replication level is consistent across the
61  *         entire pool.
62  * 	4. Call libzfs to label any whole disks with an EFI label.
63  */
64 
65 #include <assert.h>
66 #include <ctype.h>
67 #include <errno.h>
68 #include <fcntl.h>
69 #include <libintl.h>
70 #include <libnvpair.h>
71 #include <libzutil.h>
72 #include <limits.h>
73 #include <sys/spa.h>
74 #include <stdio.h>
75 #include <string.h>
76 #include <unistd.h>
77 #include "zpool_util.h"
78 #include <sys/zfs_context.h>
79 #include <sys/stat.h>
80 
81 /*
82  * For any given vdev specification, we can have multiple errors.  The
83  * vdev_error() function keeps track of whether we have seen an error yet, and
84  * prints out a header if its the first error we've seen.
85  */
86 boolean_t error_seen;
87 boolean_t is_force;
88 
89 void
90 vdev_error(const char *fmt, ...)
91 {
92 	va_list ap;
93 
94 	if (!error_seen) {
95 		(void) fprintf(stderr, gettext("invalid vdev specification\n"));
96 		if (!is_force)
97 			(void) fprintf(stderr, gettext("use '-f' to override "
98 			    "the following errors:\n"));
99 		else
100 			(void) fprintf(stderr, gettext("the following errors "
101 			    "must be manually repaired:\n"));
102 		error_seen = B_TRUE;
103 	}
104 
105 	va_start(ap, fmt);
106 	(void) vfprintf(stderr, fmt, ap);
107 	va_end(ap);
108 }
109 
110 /*
111  * Check that a file is valid.  All we can do in this case is check that it's
112  * not in use by another pool, and not in use by swap.
113  */
114 int
115 check_file_generic(const char *file, boolean_t force, boolean_t isspare)
116 {
117 	char  *name;
118 	int fd;
119 	int ret = 0;
120 	pool_state_t state;
121 	boolean_t inuse;
122 
123 	if ((fd = open(file, O_RDONLY)) < 0)
124 		return (0);
125 
126 	if (zpool_in_use(g_zfs, fd, &state, &name, &inuse) == 0 && inuse) {
127 		const char *desc;
128 
129 		switch (state) {
130 		case POOL_STATE_ACTIVE:
131 			desc = gettext("active");
132 			break;
133 
134 		case POOL_STATE_EXPORTED:
135 			desc = gettext("exported");
136 			break;
137 
138 		case POOL_STATE_POTENTIALLY_ACTIVE:
139 			desc = gettext("potentially active");
140 			break;
141 
142 		default:
143 			desc = gettext("unknown");
144 			break;
145 		}
146 
147 		/*
148 		 * Allow hot spares to be shared between pools.
149 		 */
150 		if (state == POOL_STATE_SPARE && isspare) {
151 			free(name);
152 			(void) close(fd);
153 			return (0);
154 		}
155 
156 		if (state == POOL_STATE_ACTIVE ||
157 		    state == POOL_STATE_SPARE || !force) {
158 			switch (state) {
159 			case POOL_STATE_SPARE:
160 				vdev_error(gettext("%s is reserved as a hot "
161 				    "spare for pool %s\n"), file, name);
162 				break;
163 			default:
164 				vdev_error(gettext("%s is part of %s pool "
165 				    "'%s'\n"), file, desc, name);
166 				break;
167 			}
168 			ret = -1;
169 		}
170 
171 		free(name);
172 	}
173 
174 	(void) close(fd);
175 	return (ret);
176 }
177 
178 /*
179  * This may be a shorthand device path or it could be total gibberish.
180  * Check to see if it is a known device available in zfs_vdev_paths.
181  * As part of this check, see if we've been given an entire disk
182  * (minus the slice number).
183  */
184 static int
185 is_shorthand_path(const char *arg, char *path, size_t path_size,
186     struct stat64 *statbuf, boolean_t *wholedisk)
187 {
188 	int error;
189 
190 	error = zfs_resolve_shortname(arg, path, path_size);
191 	if (error == 0) {
192 		*wholedisk = zfs_dev_is_whole_disk(path);
193 		if (*wholedisk || (stat64(path, statbuf) == 0))
194 			return (0);
195 	}
196 
197 	strlcpy(path, arg, path_size);
198 	memset(statbuf, 0, sizeof (*statbuf));
199 	*wholedisk = B_FALSE;
200 
201 	return (error);
202 }
203 
204 /*
205  * Determine if the given path is a hot spare within the given configuration.
206  * If no configuration is given we rely solely on the label.
207  */
208 static boolean_t
209 is_spare(nvlist_t *config, const char *path)
210 {
211 	int fd;
212 	pool_state_t state;
213 	char *name = NULL;
214 	nvlist_t *label;
215 	uint64_t guid, spareguid;
216 	nvlist_t *nvroot;
217 	nvlist_t **spares;
218 	uint_t i, nspares;
219 	boolean_t inuse;
220 
221 	if (zpool_is_draid_spare(path))
222 		return (B_TRUE);
223 
224 	if ((fd = open(path, O_RDONLY|O_DIRECT)) < 0)
225 		return (B_FALSE);
226 
227 	if (zpool_in_use(g_zfs, fd, &state, &name, &inuse) != 0 ||
228 	    !inuse ||
229 	    state != POOL_STATE_SPARE ||
230 	    zpool_read_label(fd, &label, NULL) != 0) {
231 		free(name);
232 		(void) close(fd);
233 		return (B_FALSE);
234 	}
235 	free(name);
236 	(void) close(fd);
237 
238 	if (config == NULL) {
239 		nvlist_free(label);
240 		return (B_TRUE);
241 	}
242 
243 	verify(nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) == 0);
244 	nvlist_free(label);
245 
246 	verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
247 	    &nvroot) == 0);
248 	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
249 	    &spares, &nspares) == 0) {
250 		for (i = 0; i < nspares; i++) {
251 			verify(nvlist_lookup_uint64(spares[i],
252 			    ZPOOL_CONFIG_GUID, &spareguid) == 0);
253 			if (spareguid == guid)
254 				return (B_TRUE);
255 		}
256 	}
257 
258 	return (B_FALSE);
259 }
260 
261 /*
262  * Create a leaf vdev.  Determine if this is a file or a device.  If it's a
263  * device, fill in the device id to make a complete nvlist.  Valid forms for a
264  * leaf vdev are:
265  *
266  *	/dev/xxx	Complete disk path
267  *	/xxx		Full path to file
268  *	xxx		Shorthand for <zfs_vdev_paths>/xxx
269  *	draid*		Virtual dRAID spare
270  */
271 static nvlist_t *
272 make_leaf_vdev(nvlist_t *props, const char *arg, boolean_t is_primary)
273 {
274 	char path[MAXPATHLEN];
275 	struct stat64 statbuf;
276 	nvlist_t *vdev = NULL;
277 	char *type = NULL;
278 	boolean_t wholedisk = B_FALSE;
279 	uint64_t ashift = 0;
280 	int err;
281 
282 	/*
283 	 * Determine what type of vdev this is, and put the full path into
284 	 * 'path'.  We detect whether this is a device of file afterwards by
285 	 * checking the st_mode of the file.
286 	 */
287 	if (arg[0] == '/') {
288 		/*
289 		 * Complete device or file path.  Exact type is determined by
290 		 * examining the file descriptor afterwards.  Symbolic links
291 		 * are resolved to their real paths to determine whole disk
292 		 * and S_ISBLK/S_ISREG type checks.  However, we are careful
293 		 * to store the given path as ZPOOL_CONFIG_PATH to ensure we
294 		 * can leverage udev's persistent device labels.
295 		 */
296 		if (realpath(arg, path) == NULL) {
297 			(void) fprintf(stderr,
298 			    gettext("cannot resolve path '%s'\n"), arg);
299 			return (NULL);
300 		}
301 
302 		wholedisk = zfs_dev_is_whole_disk(path);
303 		if (!wholedisk && (stat64(path, &statbuf) != 0)) {
304 			(void) fprintf(stderr,
305 			    gettext("cannot open '%s': %s\n"),
306 			    path, strerror(errno));
307 			return (NULL);
308 		}
309 
310 		/* After whole disk check restore original passed path */
311 		strlcpy(path, arg, sizeof (path));
312 	} else if (zpool_is_draid_spare(arg)) {
313 		if (!is_primary) {
314 			(void) fprintf(stderr,
315 			    gettext("cannot open '%s': dRAID spares can only "
316 			    "be used to replace primary vdevs\n"), arg);
317 			return (NULL);
318 		}
319 
320 		wholedisk = B_TRUE;
321 		strlcpy(path, arg, sizeof (path));
322 		type = VDEV_TYPE_DRAID_SPARE;
323 	} else {
324 		err = is_shorthand_path(arg, path, sizeof (path),
325 		    &statbuf, &wholedisk);
326 		if (err != 0) {
327 			/*
328 			 * If we got ENOENT, then the user gave us
329 			 * gibberish, so try to direct them with a
330 			 * reasonable error message.  Otherwise,
331 			 * regurgitate strerror() since it's the best we
332 			 * can do.
333 			 */
334 			if (err == ENOENT) {
335 				(void) fprintf(stderr,
336 				    gettext("cannot open '%s': no such "
337 				    "device in %s\n"), arg, DISK_ROOT);
338 				(void) fprintf(stderr,
339 				    gettext("must be a full path or "
340 				    "shorthand device name\n"));
341 				return (NULL);
342 			} else {
343 				(void) fprintf(stderr,
344 				    gettext("cannot open '%s': %s\n"),
345 				    path, strerror(errno));
346 				return (NULL);
347 			}
348 		}
349 	}
350 
351 	if (type == NULL) {
352 		/*
353 		 * Determine whether this is a device or a file.
354 		 */
355 		if (wholedisk || S_ISBLK(statbuf.st_mode)) {
356 			type = VDEV_TYPE_DISK;
357 		} else if (S_ISREG(statbuf.st_mode)) {
358 			type = VDEV_TYPE_FILE;
359 		} else {
360 			fprintf(stderr, gettext("cannot use '%s': must "
361 			    "be a block device or regular file\n"), path);
362 			return (NULL);
363 		}
364 	}
365 
366 	/*
367 	 * Finally, we have the complete device or file, and we know that it is
368 	 * acceptable to use.  Construct the nvlist to describe this vdev.  All
369 	 * vdevs have a 'path' element, and devices also have a 'devid' element.
370 	 */
371 	verify(nvlist_alloc(&vdev, NV_UNIQUE_NAME, 0) == 0);
372 	verify(nvlist_add_string(vdev, ZPOOL_CONFIG_PATH, path) == 0);
373 	verify(nvlist_add_string(vdev, ZPOOL_CONFIG_TYPE, type) == 0);
374 
375 	if (strcmp(type, VDEV_TYPE_DISK) == 0)
376 		verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK,
377 		    (uint64_t)wholedisk) == 0);
378 
379 	/*
380 	 * Override defaults if custom properties are provided.
381 	 */
382 	if (props != NULL) {
383 		char *value = NULL;
384 
385 		if (nvlist_lookup_string(props,
386 		    zpool_prop_to_name(ZPOOL_PROP_ASHIFT), &value) == 0) {
387 			if (zfs_nicestrtonum(NULL, value, &ashift) != 0) {
388 				(void) fprintf(stderr,
389 				    gettext("ashift must be a number.\n"));
390 				return (NULL);
391 			}
392 			if (ashift != 0 &&
393 			    (ashift < ASHIFT_MIN || ashift > ASHIFT_MAX)) {
394 				(void) fprintf(stderr,
395 				    gettext("invalid 'ashift=%" PRIu64 "' "
396 				    "property: only values between %" PRId32 " "
397 				    "and %" PRId32 " are allowed.\n"),
398 				    ashift, ASHIFT_MIN, ASHIFT_MAX);
399 				return (NULL);
400 			}
401 		}
402 	}
403 
404 	/*
405 	 * If the device is known to incorrectly report its physical sector
406 	 * size explicitly provide the known correct value.
407 	 */
408 	if (ashift == 0) {
409 		int sector_size;
410 
411 		if (check_sector_size_database(path, &sector_size) == B_TRUE)
412 			ashift = highbit64(sector_size) - 1;
413 	}
414 
415 	if (ashift > 0)
416 		(void) nvlist_add_uint64(vdev, ZPOOL_CONFIG_ASHIFT, ashift);
417 
418 	return (vdev);
419 }
420 
/*
 * Replication checking walks the pool and verifies that its replication
 * level is consistent:
 *
 *	Within the new spec, the devices of each mirror or raidz must be the
 *	same size.
 *
 *	If the current configuration already has inconsistent replication
 *	levels, any other potential problems in the new spec are ignored.
 *
 *	Otherwise the current spec (if there is one) and the new spec must
 *	have consistent replication levels.
 *
 *	If there is no current spec (create), the new spec must have at
 *	least one general purpose vdev.
 *
 * A replication_level_t summarizes one top-level vdev for that comparison.
 */
typedef struct replication_level {
	/* vdev type string of the top-level vdev */
	char *zprl_type;
	/* number of children; 1 for a plain file or disk */
	uint64_t zprl_children;
	/* parity count for raidz/draid, 0 for everything else */
	uint64_t zprl_parity;
} replication_level_t;

/*
 * Devices in one group vdev may differ in size by up to this many bytes
 * (16MB) before they are reported as being of different sizes.
 */
#define	ZPOOL_FUZZ	(16 << 20)
444 
445 /*
446  * N.B. For the purposes of comparing replication levels dRAID can be
447  * considered functionally equivalent to raidz.
448  */
449 static boolean_t
450 is_raidz_mirror(replication_level_t *a, replication_level_t *b,
451     replication_level_t **raidz, replication_level_t **mirror)
452 {
453 	if ((strcmp(a->zprl_type, "raidz") == 0 ||
454 	    strcmp(a->zprl_type, "draid") == 0) &&
455 	    strcmp(b->zprl_type, "mirror") == 0) {
456 		*raidz = a;
457 		*mirror = b;
458 		return (B_TRUE);
459 	}
460 	return (B_FALSE);
461 }
462 
463 /*
464  * Comparison for determining if dRAID and raidz where passed in either order.
465  */
466 static boolean_t
467 is_raidz_draid(replication_level_t *a, replication_level_t *b)
468 {
469 	if ((strcmp(a->zprl_type, "raidz") == 0 ||
470 	    strcmp(a->zprl_type, "draid") == 0) &&
471 	    (strcmp(b->zprl_type, "raidz") == 0 ||
472 	    strcmp(b->zprl_type, "draid") == 0)) {
473 		return (B_TRUE);
474 	}
475 
476 	return (B_FALSE);
477 }
478 
479 /*
480  * Given a list of toplevel vdevs, return the current replication level.  If
481  * the config is inconsistent, then NULL is returned.  If 'fatal' is set, then
482  * an error message will be displayed for each self-inconsistent vdev.
483  */
484 static replication_level_t *
485 get_replication(nvlist_t *nvroot, boolean_t fatal)
486 {
487 	nvlist_t **top;
488 	uint_t t, toplevels;
489 	nvlist_t **child;
490 	uint_t c, children;
491 	nvlist_t *nv;
492 	char *type;
493 	replication_level_t lastrep = {0};
494 	replication_level_t rep;
495 	replication_level_t *ret;
496 	replication_level_t *raidz, *mirror;
497 	boolean_t dontreport;
498 
499 	ret = safe_malloc(sizeof (replication_level_t));
500 
501 	verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
502 	    &top, &toplevels) == 0);
503 
504 	for (t = 0; t < toplevels; t++) {
505 		uint64_t is_log = B_FALSE;
506 
507 		nv = top[t];
508 
509 		/*
510 		 * For separate logs we ignore the top level vdev replication
511 		 * constraints.
512 		 */
513 		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &is_log);
514 		if (is_log)
515 			continue;
516 
517 		/* Ignore holes introduced by removing aux devices */
518 		verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0);
519 		if (strcmp(type, VDEV_TYPE_HOLE) == 0)
520 			continue;
521 
522 		if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
523 		    &child, &children) != 0) {
524 			/*
525 			 * This is a 'file' or 'disk' vdev.
526 			 */
527 			rep.zprl_type = type;
528 			rep.zprl_children = 1;
529 			rep.zprl_parity = 0;
530 		} else {
531 			int64_t vdev_size;
532 
533 			/*
534 			 * This is a mirror or RAID-Z vdev.  Go through and make
535 			 * sure the contents are all the same (files vs. disks),
536 			 * keeping track of the number of elements in the
537 			 * process.
538 			 *
539 			 * We also check that the size of each vdev (if it can
540 			 * be determined) is the same.
541 			 */
542 			rep.zprl_type = type;
543 			rep.zprl_children = 0;
544 
545 			if (strcmp(type, VDEV_TYPE_RAIDZ) == 0 ||
546 			    strcmp(type, VDEV_TYPE_DRAID) == 0) {
547 				verify(nvlist_lookup_uint64(nv,
548 				    ZPOOL_CONFIG_NPARITY,
549 				    &rep.zprl_parity) == 0);
550 				assert(rep.zprl_parity != 0);
551 			} else {
552 				rep.zprl_parity = 0;
553 			}
554 
555 			/*
556 			 * The 'dontreport' variable indicates that we've
557 			 * already reported an error for this spec, so don't
558 			 * bother doing it again.
559 			 */
560 			type = NULL;
561 			dontreport = 0;
562 			vdev_size = -1LL;
563 			for (c = 0; c < children; c++) {
564 				nvlist_t *cnv = child[c];
565 				char *path;
566 				struct stat64 statbuf;
567 				int64_t size = -1LL;
568 				char *childtype;
569 				int fd, err;
570 
571 				rep.zprl_children++;
572 
573 				verify(nvlist_lookup_string(cnv,
574 				    ZPOOL_CONFIG_TYPE, &childtype) == 0);
575 
576 				/*
577 				 * If this is a replacing or spare vdev, then
578 				 * get the real first child of the vdev: do this
579 				 * in a loop because replacing and spare vdevs
580 				 * can be nested.
581 				 */
582 				while (strcmp(childtype,
583 				    VDEV_TYPE_REPLACING) == 0 ||
584 				    strcmp(childtype, VDEV_TYPE_SPARE) == 0) {
585 					nvlist_t **rchild;
586 					uint_t rchildren;
587 
588 					verify(nvlist_lookup_nvlist_array(cnv,
589 					    ZPOOL_CONFIG_CHILDREN, &rchild,
590 					    &rchildren) == 0);
591 					assert(rchildren == 2);
592 					cnv = rchild[0];
593 
594 					verify(nvlist_lookup_string(cnv,
595 					    ZPOOL_CONFIG_TYPE,
596 					    &childtype) == 0);
597 				}
598 
599 				verify(nvlist_lookup_string(cnv,
600 				    ZPOOL_CONFIG_PATH, &path) == 0);
601 
602 				/*
603 				 * If we have a raidz/mirror that combines disks
604 				 * with files, report it as an error.
605 				 */
606 				if (!dontreport && type != NULL &&
607 				    strcmp(type, childtype) != 0) {
608 					if (ret != NULL)
609 						free(ret);
610 					ret = NULL;
611 					if (fatal)
612 						vdev_error(gettext(
613 						    "mismatched replication "
614 						    "level: %s contains both "
615 						    "files and devices\n"),
616 						    rep.zprl_type);
617 					else
618 						return (NULL);
619 					dontreport = B_TRUE;
620 				}
621 
622 				/*
623 				 * According to stat(2), the value of 'st_size'
624 				 * is undefined for block devices and character
625 				 * devices.  But there is no effective way to
626 				 * determine the real size in userland.
627 				 *
628 				 * Instead, we'll take advantage of an
629 				 * implementation detail of spec_size().  If the
630 				 * device is currently open, then we (should)
631 				 * return a valid size.
632 				 *
633 				 * If we still don't get a valid size (indicated
634 				 * by a size of 0 or MAXOFFSET_T), then ignore
635 				 * this device altogether.
636 				 */
637 				if ((fd = open(path, O_RDONLY)) >= 0) {
638 					err = fstat64_blk(fd, &statbuf);
639 					(void) close(fd);
640 				} else {
641 					err = stat64(path, &statbuf);
642 				}
643 
644 				if (err != 0 ||
645 				    statbuf.st_size == 0 ||
646 				    statbuf.st_size == MAXOFFSET_T)
647 					continue;
648 
649 				size = statbuf.st_size;
650 
651 				/*
652 				 * Also make sure that devices and
653 				 * slices have a consistent size.  If
654 				 * they differ by a significant amount
655 				 * (~16MB) then report an error.
656 				 */
657 				if (!dontreport &&
658 				    (vdev_size != -1LL &&
659 				    (llabs(size - vdev_size) >
660 				    ZPOOL_FUZZ))) {
661 					if (ret != NULL)
662 						free(ret);
663 					ret = NULL;
664 					if (fatal)
665 						vdev_error(gettext(
666 						    "%s contains devices of "
667 						    "different sizes\n"),
668 						    rep.zprl_type);
669 					else
670 						return (NULL);
671 					dontreport = B_TRUE;
672 				}
673 
674 				type = childtype;
675 				vdev_size = size;
676 			}
677 		}
678 
679 		/*
680 		 * At this point, we have the replication of the last toplevel
681 		 * vdev in 'rep'.  Compare it to 'lastrep' to see if it is
682 		 * different.
683 		 */
684 		if (lastrep.zprl_type != NULL) {
685 			if (is_raidz_mirror(&lastrep, &rep, &raidz, &mirror) ||
686 			    is_raidz_mirror(&rep, &lastrep, &raidz, &mirror)) {
687 				/*
688 				 * Accepted raidz and mirror when they can
689 				 * handle the same number of disk failures.
690 				 */
691 				if (raidz->zprl_parity !=
692 				    mirror->zprl_children - 1) {
693 					if (ret != NULL)
694 						free(ret);
695 					ret = NULL;
696 					if (fatal)
697 						vdev_error(gettext(
698 						    "mismatched replication "
699 						    "level: "
700 						    "%s and %s vdevs with "
701 						    "different redundancy, "
702 						    "%llu vs. %llu (%llu-way) "
703 						    "are present\n"),
704 						    raidz->zprl_type,
705 						    mirror->zprl_type,
706 						    (u_longlong_t)
707 						    raidz->zprl_parity,
708 						    (u_longlong_t)
709 						    mirror->zprl_children - 1,
710 						    (u_longlong_t)
711 						    mirror->zprl_children);
712 					else
713 						return (NULL);
714 				}
715 			} else if (is_raidz_draid(&lastrep, &rep)) {
716 				/*
717 				 * Accepted raidz and draid when they can
718 				 * handle the same number of disk failures.
719 				 */
720 				if (lastrep.zprl_parity != rep.zprl_parity) {
721 					if (ret != NULL)
722 						free(ret);
723 					ret = NULL;
724 					if (fatal)
725 						vdev_error(gettext(
726 						    "mismatched replication "
727 						    "level: %s and %s vdevs "
728 						    "with different "
729 						    "redundancy, %llu vs. "
730 						    "%llu are present\n"),
731 						    lastrep.zprl_type,
732 						    rep.zprl_type,
733 						    (u_longlong_t)
734 						    lastrep.zprl_parity,
735 						    (u_longlong_t)
736 						    rep.zprl_parity);
737 					else
738 						return (NULL);
739 				}
740 			} else if (strcmp(lastrep.zprl_type, rep.zprl_type) !=
741 			    0) {
742 				if (ret != NULL)
743 					free(ret);
744 				ret = NULL;
745 				if (fatal)
746 					vdev_error(gettext(
747 					    "mismatched replication level: "
748 					    "both %s and %s vdevs are "
749 					    "present\n"),
750 					    lastrep.zprl_type, rep.zprl_type);
751 				else
752 					return (NULL);
753 			} else if (lastrep.zprl_parity != rep.zprl_parity) {
754 				if (ret)
755 					free(ret);
756 				ret = NULL;
757 				if (fatal)
758 					vdev_error(gettext(
759 					    "mismatched replication level: "
760 					    "both %llu and %llu device parity "
761 					    "%s vdevs are present\n"),
762 					    (u_longlong_t)
763 					    lastrep.zprl_parity,
764 					    (u_longlong_t)rep.zprl_parity,
765 					    rep.zprl_type);
766 				else
767 					return (NULL);
768 			} else if (lastrep.zprl_children != rep.zprl_children) {
769 				if (ret)
770 					free(ret);
771 				ret = NULL;
772 				if (fatal)
773 					vdev_error(gettext(
774 					    "mismatched replication level: "
775 					    "both %llu-way and %llu-way %s "
776 					    "vdevs are present\n"),
777 					    (u_longlong_t)
778 					    lastrep.zprl_children,
779 					    (u_longlong_t)
780 					    rep.zprl_children,
781 					    rep.zprl_type);
782 				else
783 					return (NULL);
784 			}
785 		}
786 		lastrep = rep;
787 	}
788 
789 	if (ret != NULL)
790 		*ret = rep;
791 
792 	return (ret);
793 }
794 
795 /*
796  * Check the replication level of the vdev spec against the current pool.  Calls
797  * get_replication() to make sure the new spec is self-consistent.  If the pool
798  * has a consistent replication level, then we ignore any errors.  Otherwise,
799  * report any difference between the two.
800  */
801 static int
802 check_replication(nvlist_t *config, nvlist_t *newroot)
803 {
804 	nvlist_t **child;
805 	uint_t	children;
806 	replication_level_t *current = NULL, *new;
807 	replication_level_t *raidz, *mirror;
808 	int ret;
809 
810 	/*
811 	 * If we have a current pool configuration, check to see if it's
812 	 * self-consistent.  If not, simply return success.
813 	 */
814 	if (config != NULL) {
815 		nvlist_t *nvroot;
816 
817 		verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
818 		    &nvroot) == 0);
819 		if ((current = get_replication(nvroot, B_FALSE)) == NULL)
820 			return (0);
821 	}
822 	/*
823 	 * for spares there may be no children, and therefore no
824 	 * replication level to check
825 	 */
826 	if ((nvlist_lookup_nvlist_array(newroot, ZPOOL_CONFIG_CHILDREN,
827 	    &child, &children) != 0) || (children == 0)) {
828 		free(current);
829 		return (0);
830 	}
831 
832 	/*
833 	 * If all we have is logs then there's no replication level to check.
834 	 */
835 	if (num_logs(newroot) == children) {
836 		free(current);
837 		return (0);
838 	}
839 
840 	/*
841 	 * Get the replication level of the new vdev spec, reporting any
842 	 * inconsistencies found.
843 	 */
844 	if ((new = get_replication(newroot, B_TRUE)) == NULL) {
845 		free(current);
846 		return (-1);
847 	}
848 
849 	/*
850 	 * Check to see if the new vdev spec matches the replication level of
851 	 * the current pool.
852 	 */
853 	ret = 0;
854 	if (current != NULL) {
855 		if (is_raidz_mirror(current, new, &raidz, &mirror) ||
856 		    is_raidz_mirror(new, current, &raidz, &mirror)) {
857 			if (raidz->zprl_parity != mirror->zprl_children - 1) {
858 				vdev_error(gettext(
859 				    "mismatched replication level: pool and "
860 				    "new vdev with different redundancy, %s "
861 				    "and %s vdevs, %llu vs. %llu (%llu-way)\n"),
862 				    raidz->zprl_type,
863 				    mirror->zprl_type,
864 				    (u_longlong_t)raidz->zprl_parity,
865 				    (u_longlong_t)mirror->zprl_children - 1,
866 				    (u_longlong_t)mirror->zprl_children);
867 				ret = -1;
868 			}
869 		} else if (strcmp(current->zprl_type, new->zprl_type) != 0) {
870 			vdev_error(gettext(
871 			    "mismatched replication level: pool uses %s "
872 			    "and new vdev is %s\n"),
873 			    current->zprl_type, new->zprl_type);
874 			ret = -1;
875 		} else if (current->zprl_parity != new->zprl_parity) {
876 			vdev_error(gettext(
877 			    "mismatched replication level: pool uses %llu "
878 			    "device parity and new vdev uses %llu\n"),
879 			    (u_longlong_t)current->zprl_parity,
880 			    (u_longlong_t)new->zprl_parity);
881 			ret = -1;
882 		} else if (current->zprl_children != new->zprl_children) {
883 			vdev_error(gettext(
884 			    "mismatched replication level: pool uses %llu-way "
885 			    "%s and new vdev uses %llu-way %s\n"),
886 			    (u_longlong_t)current->zprl_children,
887 			    current->zprl_type,
888 			    (u_longlong_t)new->zprl_children,
889 			    new->zprl_type);
890 			ret = -1;
891 		}
892 	}
893 
894 	free(new);
895 	if (current != NULL)
896 		free(current);
897 
898 	return (ret);
899 }
900 
/*
 * Overwrite the first 4096 bytes of 'path' with zeros, then sync and close
 * it.  The device is opened write-only with O_EXCL.
 *
 * Returns 0 on success.  Returns -1 after printing a message to stderr if
 * the open fails, the write fails, or fewer than 4096 bytes were written.
 */
static int
zero_label(char *path)
{
	char buf[4096];
	const int size = (int)sizeof (buf);
	ssize_t written;
	int write_errno;
	int fd;

	if ((fd = open(path, O_WRONLY|O_EXCL)) < 0) {
		(void) fprintf(stderr, gettext("cannot open '%s': %s\n"),
		    path, strerror(errno));
		return (-1);
	}

	memset(buf, 0, sizeof (buf));
	written = write(fd, buf, sizeof (buf));
	/*
	 * Remember why the write failed before fdatasync() and close() get
	 * a chance to overwrite errno.
	 */
	write_errno = errno;
	(void) fdatasync(fd);
	(void) close(fd);

	if (written == -1) {
		(void) fprintf(stderr, gettext("cannot zero first %d bytes "
		    "of '%s': %s\n"), size, path, strerror(write_errno));
		return (-1);
	}

	if (written != size) {
		(void) fprintf(stderr, gettext("could only zero %d/%d bytes "
		    "of '%s'\n"), (int)written, size, path);
		return (-1);
	}

	return (0);
}
933 
934 /*
935  * Go through and find any whole disks in the vdev specification, labelling them
936  * as appropriate.  When constructing the vdev spec, we were unable to open this
937  * device in order to provide a devid.  Now that we have labelled the disk and
938  * know that slice 0 is valid, we can construct the devid now.
939  *
940  * If the disk was already labeled with an EFI label, we will have gotten the
941  * devid already (because we were able to open the whole disk).  Otherwise, we
942  * need to get the devid after we label the disk.
943  */
944 static int
945 make_disks(zpool_handle_t *zhp, nvlist_t *nv)
946 {
947 	nvlist_t **child;
948 	uint_t c, children;
949 	char *type, *path;
950 	char devpath[MAXPATHLEN];
951 	char udevpath[MAXPATHLEN];
952 	uint64_t wholedisk;
953 	struct stat64 statbuf;
954 	int is_exclusive = 0;
955 	int fd;
956 	int ret;
957 
958 	verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0);
959 
960 	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
961 	    &child, &children) != 0) {
962 
963 		if (strcmp(type, VDEV_TYPE_DISK) != 0)
964 			return (0);
965 
966 		/*
967 		 * We have a disk device.  If this is a whole disk write
968 		 * out the efi partition table, otherwise write zero's to
969 		 * the first 4k of the partition.  This is to ensure that
970 		 * libblkid will not misidentify the partition due to a
971 		 * magic value left by the previous filesystem.
972 		 */
973 		verify(!nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path));
974 		verify(!nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
975 		    &wholedisk));
976 
977 		if (!wholedisk) {
978 			/*
979 			 * Update device id string for mpath nodes (Linux only)
980 			 */
981 			if (is_mpath_whole_disk(path))
982 				update_vdev_config_dev_strs(nv);
983 
984 			if (!is_spare(NULL, path))
985 				(void) zero_label(path);
986 			return (0);
987 		}
988 
989 		if (realpath(path, devpath) == NULL) {
990 			ret = errno;
991 			(void) fprintf(stderr,
992 			    gettext("cannot resolve path '%s'\n"), path);
993 			return (ret);
994 		}
995 
996 		/*
997 		 * Remove any previously existing symlink from a udev path to
998 		 * the device before labeling the disk.  This ensures that
999 		 * only newly created links are used.  Otherwise there is a
1000 		 * window between when udev deletes and recreates the link
1001 		 * during which access attempts will fail with ENOENT.
1002 		 */
1003 		strlcpy(udevpath, path, MAXPATHLEN);
1004 		(void) zfs_append_partition(udevpath, MAXPATHLEN);
1005 
1006 		fd = open(devpath, O_RDWR|O_EXCL);
1007 		if (fd == -1) {
1008 			if (errno == EBUSY)
1009 				is_exclusive = 1;
1010 #ifdef __FreeBSD__
1011 			if (errno == EPERM)
1012 				is_exclusive = 1;
1013 #endif
1014 		} else {
1015 			(void) close(fd);
1016 		}
1017 
1018 		/*
1019 		 * If the partition exists, contains a valid spare label,
1020 		 * and is opened exclusively there is no need to partition
1021 		 * it.  Hot spares have already been partitioned and are
1022 		 * held open exclusively by the kernel as a safety measure.
1023 		 *
1024 		 * If the provided path is for a /dev/disk/ device its
1025 		 * symbolic link will be removed, partition table created,
1026 		 * and then block until udev creates the new link.
1027 		 */
1028 		if (!is_exclusive && !is_spare(NULL, udevpath)) {
1029 			char *devnode = strrchr(devpath, '/') + 1;
1030 
1031 			ret = strncmp(udevpath, UDISK_ROOT, strlen(UDISK_ROOT));
1032 			if (ret == 0) {
1033 				ret = lstat64(udevpath, &statbuf);
1034 				if (ret == 0 && S_ISLNK(statbuf.st_mode))
1035 					(void) unlink(udevpath);
1036 			}
1037 
1038 			/*
1039 			 * When labeling a pool the raw device node name
1040 			 * is provided as it appears under /dev/.
1041 			 */
1042 			if (zpool_label_disk(g_zfs, zhp, devnode) == -1)
1043 				return (-1);
1044 
1045 			/*
1046 			 * Wait for udev to signal the device is available
1047 			 * by the provided path.
1048 			 */
1049 			ret = zpool_label_disk_wait(udevpath, DISK_LABEL_WAIT);
1050 			if (ret) {
1051 				(void) fprintf(stderr,
1052 				    gettext("missing link: %s was "
1053 				    "partitioned but %s is missing\n"),
1054 				    devnode, udevpath);
1055 				return (ret);
1056 			}
1057 
1058 			ret = zero_label(udevpath);
1059 			if (ret)
1060 				return (ret);
1061 		}
1062 
1063 		/*
1064 		 * Update the path to refer to the partition.  The presence of
1065 		 * the 'whole_disk' field indicates to the CLI that we should
1066 		 * chop off the partition number when displaying the device in
1067 		 * future output.
1068 		 */
1069 		verify(nvlist_add_string(nv, ZPOOL_CONFIG_PATH, udevpath) == 0);
1070 
1071 		/*
1072 		 * Update device id strings for whole disks (Linux only)
1073 		 */
1074 		update_vdev_config_dev_strs(nv);
1075 
1076 		return (0);
1077 	}
1078 
1079 	for (c = 0; c < children; c++)
1080 		if ((ret = make_disks(zhp, child[c])) != 0)
1081 			return (ret);
1082 
1083 	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES,
1084 	    &child, &children) == 0)
1085 		for (c = 0; c < children; c++)
1086 			if ((ret = make_disks(zhp, child[c])) != 0)
1087 				return (ret);
1088 
1089 	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE,
1090 	    &child, &children) == 0)
1091 		for (c = 0; c < children; c++)
1092 			if ((ret = make_disks(zhp, child[c])) != 0)
1093 				return (ret);
1094 
1095 	return (0);
1096 }
1097 
1098 /*
1099  * Go through and find any devices that are in use.  We rely on libdiskmgt for
1100  * the majority of this task.
1101  */
1102 static boolean_t
1103 is_device_in_use(nvlist_t *config, nvlist_t *nv, boolean_t force,
1104     boolean_t replacing, boolean_t isspare)
1105 {
1106 	nvlist_t **child;
1107 	uint_t c, children;
1108 	char *type, *path;
1109 	int ret = 0;
1110 	char buf[MAXPATHLEN];
1111 	uint64_t wholedisk = B_FALSE;
1112 	boolean_t anyinuse = B_FALSE;
1113 
1114 	verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0);
1115 
1116 	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
1117 	    &child, &children) != 0) {
1118 
1119 		verify(!nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path));
1120 		if (strcmp(type, VDEV_TYPE_DISK) == 0)
1121 			verify(!nvlist_lookup_uint64(nv,
1122 			    ZPOOL_CONFIG_WHOLE_DISK, &wholedisk));
1123 
1124 		/*
1125 		 * As a generic check, we look to see if this is a replace of a
1126 		 * hot spare within the same pool.  If so, we allow it
1127 		 * regardless of what libblkid or zpool_in_use() says.
1128 		 */
1129 		if (replacing) {
1130 			(void) strlcpy(buf, path, sizeof (buf));
1131 			if (wholedisk) {
1132 				ret = zfs_append_partition(buf,  sizeof (buf));
1133 				if (ret == -1)
1134 					return (-1);
1135 			}
1136 
1137 			if (is_spare(config, buf))
1138 				return (B_FALSE);
1139 		}
1140 
1141 		if (strcmp(type, VDEV_TYPE_DISK) == 0)
1142 			ret = check_device(path, force, isspare, wholedisk);
1143 
1144 		else if (strcmp(type, VDEV_TYPE_FILE) == 0)
1145 			ret = check_file(path, force, isspare);
1146 
1147 		return (ret != 0);
1148 	}
1149 
1150 	for (c = 0; c < children; c++)
1151 		if (is_device_in_use(config, child[c], force, replacing,
1152 		    B_FALSE))
1153 			anyinuse = B_TRUE;
1154 
1155 	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES,
1156 	    &child, &children) == 0)
1157 		for (c = 0; c < children; c++)
1158 			if (is_device_in_use(config, child[c], force, replacing,
1159 			    B_TRUE))
1160 				anyinuse = B_TRUE;
1161 
1162 	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE,
1163 	    &child, &children) == 0)
1164 		for (c = 0; c < children; c++)
1165 			if (is_device_in_use(config, child[c], force, replacing,
1166 			    B_FALSE))
1167 				anyinuse = B_TRUE;
1168 
1169 	return (anyinuse);
1170 }
1171 
1172 /*
1173  * Returns the parity level extracted from a raidz or draid type.
1174  * If the parity cannot be determined zero is returned.
1175  */
1176 static int
1177 get_parity(const char *type)
1178 {
1179 	long parity = 0;
1180 	const char *p;
1181 
1182 	if (strncmp(type, VDEV_TYPE_RAIDZ, strlen(VDEV_TYPE_RAIDZ)) == 0) {
1183 		p = type + strlen(VDEV_TYPE_RAIDZ);
1184 
1185 		if (*p == '\0') {
1186 			/* when unspecified default to single parity */
1187 			return (1);
1188 		} else if (*p == '0') {
1189 			/* no zero prefixes allowed */
1190 			return (0);
1191 		} else {
1192 			/* 0-3, no suffixes allowed */
1193 			char *end;
1194 			errno = 0;
1195 			parity = strtol(p, &end, 10);
1196 			if (errno != 0 || *end != '\0' ||
1197 			    parity < 1 || parity > VDEV_RAIDZ_MAXPARITY) {
1198 				return (0);
1199 			}
1200 		}
1201 	} else if (strncmp(type, VDEV_TYPE_DRAID,
1202 	    strlen(VDEV_TYPE_DRAID)) == 0) {
1203 		p = type + strlen(VDEV_TYPE_DRAID);
1204 
1205 		if (*p == '\0' || *p == ':') {
1206 			/* when unspecified default to single parity */
1207 			return (1);
1208 		} else if (*p == '0') {
1209 			/* no zero prefixes allowed */
1210 			return (0);
1211 		} else {
1212 			/* 0-3, allowed suffixes: '\0' or ':' */
1213 			char *end;
1214 			errno = 0;
1215 			parity = strtol(p, &end, 10);
1216 			if (errno != 0 ||
1217 			    parity < 1 || parity > VDEV_DRAID_MAXPARITY ||
1218 			    (*end != '\0' && *end != ':')) {
1219 				return (0);
1220 			}
1221 		}
1222 	}
1223 
1224 	return ((int)parity);
1225 }
1226 
1227 /*
1228  * Assign the minimum and maximum number of devices allowed for
1229  * the specified type.  On error NULL is returned, otherwise the
1230  * type prefix is returned (raidz, mirror, etc).
1231  */
1232 static const char *
1233 is_grouping(const char *type, int *mindev, int *maxdev)
1234 {
1235 	int nparity;
1236 
1237 	if (strncmp(type, VDEV_TYPE_RAIDZ, strlen(VDEV_TYPE_RAIDZ)) == 0 ||
1238 	    strncmp(type, VDEV_TYPE_DRAID, strlen(VDEV_TYPE_DRAID)) == 0) {
1239 		nparity = get_parity(type);
1240 		if (nparity == 0)
1241 			return (NULL);
1242 		if (mindev != NULL)
1243 			*mindev = nparity + 1;
1244 		if (maxdev != NULL)
1245 			*maxdev = 255;
1246 
1247 		if (strncmp(type, VDEV_TYPE_RAIDZ,
1248 		    strlen(VDEV_TYPE_RAIDZ)) == 0) {
1249 			return (VDEV_TYPE_RAIDZ);
1250 		} else {
1251 			return (VDEV_TYPE_DRAID);
1252 		}
1253 	}
1254 
1255 	if (maxdev != NULL)
1256 		*maxdev = INT_MAX;
1257 
1258 	if (strcmp(type, "mirror") == 0) {
1259 		if (mindev != NULL)
1260 			*mindev = 2;
1261 		return (VDEV_TYPE_MIRROR);
1262 	}
1263 
1264 	if (strcmp(type, "spare") == 0) {
1265 		if (mindev != NULL)
1266 			*mindev = 1;
1267 		return (VDEV_TYPE_SPARE);
1268 	}
1269 
1270 	if (strcmp(type, "log") == 0) {
1271 		if (mindev != NULL)
1272 			*mindev = 1;
1273 		return (VDEV_TYPE_LOG);
1274 	}
1275 
1276 	if (strcmp(type, VDEV_ALLOC_BIAS_SPECIAL) == 0 ||
1277 	    strcmp(type, VDEV_ALLOC_BIAS_DEDUP) == 0) {
1278 		if (mindev != NULL)
1279 			*mindev = 1;
1280 		return (type);
1281 	}
1282 
1283 	if (strcmp(type, "cache") == 0) {
1284 		if (mindev != NULL)
1285 			*mindev = 1;
1286 		return (VDEV_TYPE_L2CACHE);
1287 	}
1288 
1289 	return (NULL);
1290 }
1291 
1292 /*
1293  * Extract the configuration parameters encoded in the dRAID type and
1294  * use them to generate a dRAID configuration.  The expected format is:
1295  *
1296  * draid[<parity>][:<data><d|D>][:<children><c|C>][:<spares><s|S>]
1297  *
1298  * The intent is to be able to generate a good configuration when no
1299  * additional information is provided.  The only mandatory component
1300  * of the 'type' is the 'draid' prefix.  If a value is not provided
1301  * then reasonable defaults are used.  The optional components may
1302  * appear in any order but the d/s/c suffix is required.
1303  *
1304  * Valid inputs:
1305  * - data:     number of data devices per group (1-255)
1306  * - parity:   number of parity blocks per group (1-3)
1307  * - spares:   number of distributed spare (0-100)
1308  * - children: total number of devices (1-255)
1309  *
1310  * Examples:
1311  * - zpool create tank draid <devices...>
1312  * - zpool create tank draid2:8d:51c:2s <devices...>
1313  */
1314 static int
1315 draid_config_by_type(nvlist_t *nv, const char *type, uint64_t children)
1316 {
1317 	uint64_t nparity = 1;
1318 	uint64_t nspares = 0;
1319 	uint64_t ndata = UINT64_MAX;
1320 	uint64_t ngroups = 1;
1321 	long value;
1322 
1323 	if (strncmp(type, VDEV_TYPE_DRAID, strlen(VDEV_TYPE_DRAID)) != 0)
1324 		return (EINVAL);
1325 
1326 	nparity = (uint64_t)get_parity(type);
1327 	if (nparity == 0)
1328 		return (EINVAL);
1329 
1330 	char *p = (char *)type;
1331 	while ((p = strchr(p, ':')) != NULL) {
1332 		char *end;
1333 
1334 		p = p + 1;
1335 		errno = 0;
1336 
1337 		if (!isdigit(p[0])) {
1338 			(void) fprintf(stderr, gettext("invalid dRAID "
1339 			    "syntax; expected [:<number><c|d|s>] not '%s'\n"),
1340 			    type);
1341 			return (EINVAL);
1342 		}
1343 
1344 		/* Expected non-zero value with c/d/s suffix */
1345 		value = strtol(p, &end, 10);
1346 		char suffix = tolower(*end);
1347 		if (errno != 0 ||
1348 		    (suffix != 'c' && suffix != 'd' && suffix != 's')) {
1349 			(void) fprintf(stderr, gettext("invalid dRAID "
1350 			    "syntax; expected [:<number><c|d|s>] not '%s'\n"),
1351 			    type);
1352 			return (EINVAL);
1353 		}
1354 
1355 		if (suffix == 'c') {
1356 			if ((uint64_t)value != children) {
1357 				fprintf(stderr,
1358 				    gettext("invalid number of dRAID children; "
1359 				    "%llu required but %llu provided\n"),
1360 				    (u_longlong_t)value,
1361 				    (u_longlong_t)children);
1362 				return (EINVAL);
1363 			}
1364 		} else if (suffix == 'd') {
1365 			ndata = (uint64_t)value;
1366 		} else if (suffix == 's') {
1367 			nspares = (uint64_t)value;
1368 		} else {
1369 			verify(0); /* Unreachable */
1370 		}
1371 	}
1372 
1373 	/*
1374 	 * When a specific number of data disks is not provided limit a
1375 	 * redundancy group to 8 data disks.  This value was selected to
1376 	 * provide a reasonable tradeoff between capacity and performance.
1377 	 */
1378 	if (ndata == UINT64_MAX) {
1379 		if (children > nspares + nparity) {
1380 			ndata = MIN(children - nspares - nparity, 8);
1381 		} else {
1382 			fprintf(stderr, gettext("request number of "
1383 			    "distributed spares %llu and parity level %llu\n"
1384 			    "leaves no disks available for data\n"),
1385 			    (u_longlong_t)nspares, (u_longlong_t)nparity);
1386 			return (EINVAL);
1387 		}
1388 	}
1389 
1390 	/* Verify the maximum allowed group size is never exceeded. */
1391 	if (ndata == 0 || (ndata + nparity > children - nspares)) {
1392 		fprintf(stderr, gettext("requested number of dRAID data "
1393 		    "disks per group %llu is too high,\nat most %llu disks "
1394 		    "are available for data\n"), (u_longlong_t)ndata,
1395 		    (u_longlong_t)(children - nspares - nparity));
1396 		return (EINVAL);
1397 	}
1398 
1399 	if (nparity == 0 || nparity > VDEV_DRAID_MAXPARITY) {
1400 		fprintf(stderr,
1401 		    gettext("invalid dRAID parity level %llu; must be "
1402 		    "between 1 and %d\n"), (u_longlong_t)nparity,
1403 		    VDEV_DRAID_MAXPARITY);
1404 		return (EINVAL);
1405 	}
1406 
1407 	/*
1408 	 * Verify the requested number of spares can be satisfied.
1409 	 * An arbitrary limit of 100 distributed spares is applied.
1410 	 */
1411 	if (nspares > 100 || nspares > (children - (ndata + nparity))) {
1412 		fprintf(stderr,
1413 		    gettext("invalid number of dRAID spares %llu; additional "
1414 		    "disks would be required\n"), (u_longlong_t)nspares);
1415 		return (EINVAL);
1416 	}
1417 
1418 	/* Verify the requested number children is sufficient. */
1419 	if (children < (ndata + nparity + nspares)) {
1420 		fprintf(stderr, gettext("%llu disks were provided, but at "
1421 		    "least %llu disks are required for this config\n"),
1422 		    (u_longlong_t)children,
1423 		    (u_longlong_t)(ndata + nparity + nspares));
1424 	}
1425 
1426 	if (children > VDEV_DRAID_MAX_CHILDREN) {
1427 		fprintf(stderr, gettext("%llu disks were provided, but "
1428 		    "dRAID only supports up to %u disks"),
1429 		    (u_longlong_t)children, VDEV_DRAID_MAX_CHILDREN);
1430 	}
1431 
1432 	/*
1433 	 * Calculate the minimum number of groups required to fill a slice.
1434 	 * This is the LCM of the stripe width (ndata + nparity) and the
1435 	 * number of data drives (children - nspares).
1436 	 */
1437 	while (ngroups * (ndata + nparity) % (children - nspares) != 0)
1438 		ngroups++;
1439 
1440 	/* Store the basic dRAID configuration. */
1441 	fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, nparity);
1442 	fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NDATA, ndata);
1443 	fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NSPARES, nspares);
1444 	fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NGROUPS, ngroups);
1445 
1446 	return (0);
1447 }
1448 
1449 /*
1450  * Construct a syntactically valid vdev specification,
1451  * and ensure that all devices and files exist and can be opened.
1452  * Note: we don't bother freeing anything in the error paths
1453  * because the program is just going to exit anyway.
1454  */
1455 static nvlist_t *
1456 construct_spec(nvlist_t *props, int argc, char **argv)
1457 {
1458 	nvlist_t *nvroot, *nv, **top, **spares, **l2cache;
1459 	int t, toplevels, mindev, maxdev, nspares, nlogs, nl2cache;
1460 	const char *type, *fulltype;
1461 	boolean_t is_log, is_special, is_dedup, is_spare;
1462 	boolean_t seen_logs;
1463 
1464 	top = NULL;
1465 	toplevels = 0;
1466 	spares = NULL;
1467 	l2cache = NULL;
1468 	nspares = 0;
1469 	nlogs = 0;
1470 	nl2cache = 0;
1471 	is_log = is_special = is_dedup = is_spare = B_FALSE;
1472 	seen_logs = B_FALSE;
1473 	nvroot = NULL;
1474 
1475 	while (argc > 0) {
1476 		fulltype = argv[0];
1477 		nv = NULL;
1478 
1479 		/*
1480 		 * If it's a mirror, raidz, or draid the subsequent arguments
1481 		 * are its leaves -- until we encounter the next mirror,
1482 		 * raidz or draid.
1483 		 */
1484 		if ((type = is_grouping(fulltype, &mindev, &maxdev)) != NULL) {
1485 			nvlist_t **child = NULL;
1486 			int c, children = 0;
1487 
1488 			if (strcmp(type, VDEV_TYPE_SPARE) == 0) {
1489 				if (spares != NULL) {
1490 					(void) fprintf(stderr,
1491 					    gettext("invalid vdev "
1492 					    "specification: 'spare' can be "
1493 					    "specified only once\n"));
1494 					goto spec_out;
1495 				}
1496 				is_spare = B_TRUE;
1497 				is_log = is_special = is_dedup = B_FALSE;
1498 			}
1499 
1500 			if (strcmp(type, VDEV_TYPE_LOG) == 0) {
1501 				if (seen_logs) {
1502 					(void) fprintf(stderr,
1503 					    gettext("invalid vdev "
1504 					    "specification: 'log' can be "
1505 					    "specified only once\n"));
1506 					goto spec_out;
1507 				}
1508 				seen_logs = B_TRUE;
1509 				is_log = B_TRUE;
1510 				is_special = is_dedup = is_spare = B_FALSE;
1511 				argc--;
1512 				argv++;
1513 				/*
1514 				 * A log is not a real grouping device.
1515 				 * We just set is_log and continue.
1516 				 */
1517 				continue;
1518 			}
1519 
1520 			if (strcmp(type, VDEV_ALLOC_BIAS_SPECIAL) == 0) {
1521 				is_special = B_TRUE;
1522 				is_log = is_dedup = is_spare = B_FALSE;
1523 				argc--;
1524 				argv++;
1525 				continue;
1526 			}
1527 
1528 			if (strcmp(type, VDEV_ALLOC_BIAS_DEDUP) == 0) {
1529 				is_dedup = B_TRUE;
1530 				is_log = is_special = is_spare = B_FALSE;
1531 				argc--;
1532 				argv++;
1533 				continue;
1534 			}
1535 
1536 			if (strcmp(type, VDEV_TYPE_L2CACHE) == 0) {
1537 				if (l2cache != NULL) {
1538 					(void) fprintf(stderr,
1539 					    gettext("invalid vdev "
1540 					    "specification: 'cache' can be "
1541 					    "specified only once\n"));
1542 					goto spec_out;
1543 				}
1544 				is_log = is_special = B_FALSE;
1545 				is_dedup = is_spare = B_FALSE;
1546 			}
1547 
1548 			if (is_log || is_special || is_dedup) {
1549 				if (strcmp(type, VDEV_TYPE_MIRROR) != 0) {
1550 					(void) fprintf(stderr,
1551 					    gettext("invalid vdev "
1552 					    "specification: unsupported '%s' "
1553 					    "device: %s\n"), is_log ? "log" :
1554 					    "special", type);
1555 					goto spec_out;
1556 				}
1557 				nlogs++;
1558 			}
1559 
1560 			for (c = 1; c < argc; c++) {
1561 				if (is_grouping(argv[c], NULL, NULL) != NULL)
1562 					break;
1563 
1564 				children++;
1565 				child = realloc(child,
1566 				    children * sizeof (nvlist_t *));
1567 				if (child == NULL)
1568 					zpool_no_memory();
1569 				if ((nv = make_leaf_vdev(props, argv[c],
1570 				    !(is_log || is_special || is_dedup ||
1571 				    is_spare))) == NULL) {
1572 					for (c = 0; c < children - 1; c++)
1573 						nvlist_free(child[c]);
1574 					free(child);
1575 					goto spec_out;
1576 				}
1577 
1578 				child[children - 1] = nv;
1579 			}
1580 
1581 			if (children < mindev) {
1582 				(void) fprintf(stderr, gettext("invalid vdev "
1583 				    "specification: %s requires at least %d "
1584 				    "devices\n"), argv[0], mindev);
1585 				for (c = 0; c < children; c++)
1586 					nvlist_free(child[c]);
1587 				free(child);
1588 				goto spec_out;
1589 			}
1590 
1591 			if (children > maxdev) {
1592 				(void) fprintf(stderr, gettext("invalid vdev "
1593 				    "specification: %s supports no more than "
1594 				    "%d devices\n"), argv[0], maxdev);
1595 				for (c = 0; c < children; c++)
1596 					nvlist_free(child[c]);
1597 				free(child);
1598 				goto spec_out;
1599 			}
1600 
1601 			argc -= c;
1602 			argv += c;
1603 
1604 			if (strcmp(type, VDEV_TYPE_SPARE) == 0) {
1605 				spares = child;
1606 				nspares = children;
1607 				continue;
1608 			} else if (strcmp(type, VDEV_TYPE_L2CACHE) == 0) {
1609 				l2cache = child;
1610 				nl2cache = children;
1611 				continue;
1612 			} else {
1613 				/* create a top-level vdev with children */
1614 				verify(nvlist_alloc(&nv, NV_UNIQUE_NAME,
1615 				    0) == 0);
1616 				verify(nvlist_add_string(nv, ZPOOL_CONFIG_TYPE,
1617 				    type) == 0);
1618 				verify(nvlist_add_uint64(nv,
1619 				    ZPOOL_CONFIG_IS_LOG, is_log) == 0);
1620 				if (is_log) {
1621 					verify(nvlist_add_string(nv,
1622 					    ZPOOL_CONFIG_ALLOCATION_BIAS,
1623 					    VDEV_ALLOC_BIAS_LOG) == 0);
1624 				}
1625 				if (is_special) {
1626 					verify(nvlist_add_string(nv,
1627 					    ZPOOL_CONFIG_ALLOCATION_BIAS,
1628 					    VDEV_ALLOC_BIAS_SPECIAL) == 0);
1629 				}
1630 				if (is_dedup) {
1631 					verify(nvlist_add_string(nv,
1632 					    ZPOOL_CONFIG_ALLOCATION_BIAS,
1633 					    VDEV_ALLOC_BIAS_DEDUP) == 0);
1634 				}
1635 				if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) {
1636 					verify(nvlist_add_uint64(nv,
1637 					    ZPOOL_CONFIG_NPARITY,
1638 					    mindev - 1) == 0);
1639 				}
1640 				if (strcmp(type, VDEV_TYPE_DRAID) == 0) {
1641 					if (draid_config_by_type(nv,
1642 					    fulltype, children) != 0) {
1643 						for (c = 0; c < children; c++)
1644 							nvlist_free(child[c]);
1645 						free(child);
1646 						goto spec_out;
1647 					}
1648 				}
1649 				verify(nvlist_add_nvlist_array(nv,
1650 				    ZPOOL_CONFIG_CHILDREN, child,
1651 				    children) == 0);
1652 
1653 				for (c = 0; c < children; c++)
1654 					nvlist_free(child[c]);
1655 				free(child);
1656 			}
1657 		} else {
1658 			/*
1659 			 * We have a device.  Pass off to make_leaf_vdev() to
1660 			 * construct the appropriate nvlist describing the vdev.
1661 			 */
1662 			if ((nv = make_leaf_vdev(props, argv[0], !(is_log ||
1663 			    is_special || is_dedup || is_spare))) == NULL)
1664 				goto spec_out;
1665 
1666 			verify(nvlist_add_uint64(nv,
1667 			    ZPOOL_CONFIG_IS_LOG, is_log) == 0);
1668 			if (is_log) {
1669 				verify(nvlist_add_string(nv,
1670 				    ZPOOL_CONFIG_ALLOCATION_BIAS,
1671 				    VDEV_ALLOC_BIAS_LOG) == 0);
1672 				nlogs++;
1673 			}
1674 
1675 			if (is_special) {
1676 				verify(nvlist_add_string(nv,
1677 				    ZPOOL_CONFIG_ALLOCATION_BIAS,
1678 				    VDEV_ALLOC_BIAS_SPECIAL) == 0);
1679 			}
1680 			if (is_dedup) {
1681 				verify(nvlist_add_string(nv,
1682 				    ZPOOL_CONFIG_ALLOCATION_BIAS,
1683 				    VDEV_ALLOC_BIAS_DEDUP) == 0);
1684 			}
1685 			argc--;
1686 			argv++;
1687 		}
1688 
1689 		toplevels++;
1690 		top = realloc(top, toplevels * sizeof (nvlist_t *));
1691 		if (top == NULL)
1692 			zpool_no_memory();
1693 		top[toplevels - 1] = nv;
1694 	}
1695 
1696 	if (toplevels == 0 && nspares == 0 && nl2cache == 0) {
1697 		(void) fprintf(stderr, gettext("invalid vdev "
1698 		    "specification: at least one toplevel vdev must be "
1699 		    "specified\n"));
1700 		goto spec_out;
1701 	}
1702 
1703 	if (seen_logs && nlogs == 0) {
1704 		(void) fprintf(stderr, gettext("invalid vdev specification: "
1705 		    "log requires at least 1 device\n"));
1706 		goto spec_out;
1707 	}
1708 
1709 	/*
1710 	 * Finally, create nvroot and add all top-level vdevs to it.
1711 	 */
1712 	verify(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) == 0);
1713 	verify(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,
1714 	    VDEV_TYPE_ROOT) == 0);
1715 	verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
1716 	    top, toplevels) == 0);
1717 	if (nspares != 0)
1718 		verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
1719 		    spares, nspares) == 0);
1720 	if (nl2cache != 0)
1721 		verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
1722 		    l2cache, nl2cache) == 0);
1723 
1724 spec_out:
1725 	for (t = 0; t < toplevels; t++)
1726 		nvlist_free(top[t]);
1727 	for (t = 0; t < nspares; t++)
1728 		nvlist_free(spares[t]);
1729 	for (t = 0; t < nl2cache; t++)
1730 		nvlist_free(l2cache[t]);
1731 
1732 	free(spares);
1733 	free(l2cache);
1734 	free(top);
1735 
1736 	return (nvroot);
1737 }
1738 
1739 nvlist_t *
1740 split_mirror_vdev(zpool_handle_t *zhp, char *newname, nvlist_t *props,
1741     splitflags_t flags, int argc, char **argv)
1742 {
1743 	nvlist_t *newroot = NULL, **child;
1744 	uint_t c, children;
1745 
1746 	if (argc > 0) {
1747 		if ((newroot = construct_spec(props, argc, argv)) == NULL) {
1748 			(void) fprintf(stderr, gettext("Unable to build a "
1749 			    "pool from the specified devices\n"));
1750 			return (NULL);
1751 		}
1752 
1753 		if (!flags.dryrun && make_disks(zhp, newroot) != 0) {
1754 			nvlist_free(newroot);
1755 			return (NULL);
1756 		}
1757 
1758 		/* avoid any tricks in the spec */
1759 		verify(nvlist_lookup_nvlist_array(newroot,
1760 		    ZPOOL_CONFIG_CHILDREN, &child, &children) == 0);
1761 		for (c = 0; c < children; c++) {
1762 			char *path;
1763 			const char *type;
1764 			int min, max;
1765 
1766 			verify(nvlist_lookup_string(child[c],
1767 			    ZPOOL_CONFIG_PATH, &path) == 0);
1768 			if ((type = is_grouping(path, &min, &max)) != NULL) {
1769 				(void) fprintf(stderr, gettext("Cannot use "
1770 				    "'%s' as a device for splitting\n"), type);
1771 				nvlist_free(newroot);
1772 				return (NULL);
1773 			}
1774 		}
1775 	}
1776 
1777 	if (zpool_vdev_split(zhp, newname, &newroot, props, flags) != 0) {
1778 		nvlist_free(newroot);
1779 		return (NULL);
1780 	}
1781 
1782 	return (newroot);
1783 }
1784 
1785 static int
1786 num_normal_vdevs(nvlist_t *nvroot)
1787 {
1788 	nvlist_t **top;
1789 	uint_t t, toplevels, normal = 0;
1790 
1791 	verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
1792 	    &top, &toplevels) == 0);
1793 
1794 	for (t = 0; t < toplevels; t++) {
1795 		uint64_t log = B_FALSE;
1796 
1797 		(void) nvlist_lookup_uint64(top[t], ZPOOL_CONFIG_IS_LOG, &log);
1798 		if (log)
1799 			continue;
1800 		if (nvlist_exists(top[t], ZPOOL_CONFIG_ALLOCATION_BIAS))
1801 			continue;
1802 
1803 		normal++;
1804 	}
1805 
1806 	return (normal);
1807 }
1808 
1809 /*
1810  * Get and validate the contents of the given vdev specification.  This ensures
1811  * that the nvlist returned is well-formed, that all the devices exist, and that
1812  * they are not currently in use by any other known consumer.  The 'poolconfig'
1813  * parameter is the current configuration of the pool when adding devices
1814  * existing pool, and is used to perform additional checks, such as changing the
1815  * replication level of the pool.  It can be 'NULL' to indicate that this is a
1816  * new pool.  The 'force' flag controls whether devices should be forcefully
1817  * added, even if they appear in use.
1818  */
1819 nvlist_t *
1820 make_root_vdev(zpool_handle_t *zhp, nvlist_t *props, int force, int check_rep,
1821     boolean_t replacing, boolean_t dryrun, int argc, char **argv)
1822 {
1823 	nvlist_t *newroot;
1824 	nvlist_t *poolconfig = NULL;
1825 	is_force = force;
1826 
1827 	/*
1828 	 * Construct the vdev specification.  If this is successful, we know
1829 	 * that we have a valid specification, and that all devices can be
1830 	 * opened.
1831 	 */
1832 	if ((newroot = construct_spec(props, argc, argv)) == NULL)
1833 		return (NULL);
1834 
1835 	if (zhp && ((poolconfig = zpool_get_config(zhp, NULL)) == NULL)) {
1836 		nvlist_free(newroot);
1837 		return (NULL);
1838 	}
1839 
1840 	/*
1841 	 * Validate each device to make sure that it's not shared with another
1842 	 * subsystem.  We do this even if 'force' is set, because there are some
1843 	 * uses (such as a dedicated dump device) that even '-f' cannot
1844 	 * override.
1845 	 */
1846 	if (is_device_in_use(poolconfig, newroot, force, replacing, B_FALSE)) {
1847 		nvlist_free(newroot);
1848 		return (NULL);
1849 	}
1850 
1851 	/*
1852 	 * Check the replication level of the given vdevs and report any errors
1853 	 * found.  We include the existing pool spec, if any, as we need to
1854 	 * catch changes against the existing replication level.
1855 	 */
1856 	if (check_rep && check_replication(poolconfig, newroot) != 0) {
1857 		nvlist_free(newroot);
1858 		return (NULL);
1859 	}
1860 
1861 	/*
1862 	 * On pool create the new vdev spec must have one normal vdev.
1863 	 */
1864 	if (poolconfig == NULL && num_normal_vdevs(newroot) == 0) {
1865 		vdev_error(gettext("at least one general top-level vdev must "
1866 		    "be specified\n"));
1867 		nvlist_free(newroot);
1868 		return (NULL);
1869 	}
1870 
1871 	/*
1872 	 * Run through the vdev specification and label any whole disks found.
1873 	 */
1874 	if (!dryrun && make_disks(zhp, newroot) != 0) {
1875 		nvlist_free(newroot);
1876 		return (NULL);
1877 	}
1878 
1879 	return (newroot);
1880 }
1881