xref: /titanic_50/usr/src/cmd/zpool/zpool_vdev.c (revision 26d97b1b85350d431fb75bb4f40b71c5c03a19e9)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 /*
30  * Functions to convert between a list of vdevs and an nvlist representing the
31  * configuration.  Each entry in the list can be one of:
32  *
33  * 	Device vdevs
34  * 		disk=(path=..., devid=...)
35  * 		file=(path=...)
36  *
37  * 	Group vdevs
38  * 		raidz[1|2]=(...)
39  * 		mirror=(...)
40  *
41  * 	Hot spares
42  *
43  * While the underlying implementation supports it, group vdevs cannot contain
44  * other group vdevs.  All userland verification of devices is contained within
45  * this file.  If successful, the nvlist returned can be passed directly to the
46  * kernel; we've done as much verification as possible in userland.
47  *
48  * Hot spares are a special case, and passed down as an array of disk vdevs, at
49  * the same level as the root of the vdev tree.
50  *
51  * The only function exported by this file is 'get_vdev_spec'.  The function
52  * performs several passes:
53  *
54  * 	1. Construct the vdev specification.  Performs syntax validation and
55  *         makes sure each device is valid.
56  * 	2. Check for devices in use.  Using libdiskmgt, makes sure that no
57  *         devices are also in use.  Some can be overridden using the 'force'
58  *         flag, others cannot.
59  * 	3. Check for replication errors if the 'force' flag is not specified.
60  *         Validates that the replication level is consistent across the
61  *         entire pool.
62  * 	4. Label any whole disks with an EFI label.
63  */
64 
65 #include <assert.h>
66 #include <devid.h>
67 #include <errno.h>
68 #include <fcntl.h>
69 #include <libdiskmgt.h>
70 #include <libintl.h>
71 #include <libnvpair.h>
72 #include <stdio.h>
73 #include <string.h>
74 #include <unistd.h>
75 #include <sys/efi_partition.h>
76 #include <sys/stat.h>
77 #include <sys/vtoc.h>
78 #include <sys/mntent.h>
79 
80 #include <libzfs.h>
81 
82 #include "zpool_util.h"
83 
84 #define	DISK_ROOT	"/dev/dsk"
85 #define	RDISK_ROOT	"/dev/rdsk"
86 #define	BACKUP_SLICE	"s2"
87 
88 /*
89  * For any given vdev specification, we can have multiple errors.  The
90  * vdev_error() function keeps track of whether we have seen an error yet, and
91  * prints out a header if its the first error we've seen.
92  */
93 boolean_t error_seen;
94 boolean_t is_force;
95 
96 /*PRINTFLIKE1*/
97 static void
98 vdev_error(const char *fmt, ...)
99 {
100 	va_list ap;
101 
102 	if (!error_seen) {
103 		(void) fprintf(stderr, gettext("invalid vdev specification\n"));
104 		if (!is_force)
105 			(void) fprintf(stderr, gettext("use '-f' to override "
106 			    "the following errors:\n"));
107 		else
108 			(void) fprintf(stderr, gettext("the following errors "
109 			    "must be manually repaired:\n"));
110 		error_seen = B_TRUE;
111 	}
112 
113 	va_start(ap, fmt);
114 	(void) vfprintf(stderr, fmt, ap);
115 	va_end(ap);
116 }
117 
/*
 * Report a failure of the libdiskmgt in-use check as a warning on stderr.
 * 'error' is an errno value and is rendered with strerror().
 */
static void
libdiskmgt_error(int error)
{
	switch (error) {
	case ENXIO:
	case ENODEV:
		/*
		 * Both are valid results when the device doesn't live in
		 * /dev/dsk, so there is nothing worth reporting.
		 */
		return;
	default:
		break;
	}

	(void) fprintf(stderr, gettext("warning: device in use checking "
	    "failed: %s\n"), strerror(error));
}
131 
132 /*
133  * Validate a device, passing the bulk of the work off to libdiskmgt.
134  */
135 int
136 check_slice(const char *path, int force, boolean_t wholedisk, boolean_t isspare)
137 {
138 	char *msg;
139 	int error = 0;
140 
141 	if (dm_inuse((char *)path, &msg, isspare ? DM_WHO_ZPOOL_SPARE :
142 	    (force ? DM_WHO_ZPOOL_FORCE : DM_WHO_ZPOOL), &error) || error) {
143 		if (error != 0) {
144 			libdiskmgt_error(error);
145 			return (0);
146 		} else {
147 			vdev_error("%s", msg);
148 			free(msg);
149 			return (-1);
150 		}
151 	}
152 
153 	/*
154 	 * If we're given a whole disk, ignore overlapping slices since we're
155 	 * about to label it anyway.
156 	 */
157 	error = 0;
158 	if (!wholedisk && !force &&
159 	    (dm_isoverlapping((char *)path, &msg, &error) || error)) {
160 		if (error == 0) {
161 			/* dm_isoverlapping returned -1 */
162 			vdev_error(gettext("%s overlaps with %s\n"), path, msg);
163 			free(msg);
164 			return (-1);
165 		} else if (error != ENODEV) {
166 			/* libdiskmgt's devcache only handles physical drives */
167 			libdiskmgt_error(error);
168 			return (0);
169 		}
170 	}
171 
172 	return (0);
173 }
174 
175 /*
176  * Validate a whole disk.  Iterate over all slices on the disk and make sure
177  * that none is in use by calling check_slice().
178  */
179 /* ARGSUSED */
180 int
181 check_disk(const char *name, dm_descriptor_t disk, int force, int isspare)
182 {
183 	dm_descriptor_t *drive, *media, *slice;
184 	int err = 0;
185 	int i;
186 	int ret;
187 
188 	/*
189 	 * Get the drive associated with this disk.  This should never fail,
190 	 * because we already have an alias handle open for the device.
191 	 */
192 	if ((drive = dm_get_associated_descriptors(disk, DM_DRIVE,
193 	    &err)) == NULL || *drive == NULL) {
194 		if (err)
195 			libdiskmgt_error(err);
196 		return (0);
197 	}
198 
199 	if ((media = dm_get_associated_descriptors(*drive, DM_MEDIA,
200 	    &err)) == NULL) {
201 		dm_free_descriptors(drive);
202 		if (err)
203 			libdiskmgt_error(err);
204 		return (0);
205 	}
206 
207 	dm_free_descriptors(drive);
208 
209 	/*
210 	 * It is possible that the user has specified a removable media drive,
211 	 * and the media is not present.
212 	 */
213 	if (*media == NULL) {
214 		dm_free_descriptors(media);
215 		vdev_error(gettext("'%s' has no media in drive\n"), name);
216 		return (-1);
217 	}
218 
219 	if ((slice = dm_get_associated_descriptors(*media, DM_SLICE,
220 	    &err)) == NULL) {
221 		dm_free_descriptors(media);
222 		if (err)
223 			libdiskmgt_error(err);
224 		return (0);
225 	}
226 
227 	dm_free_descriptors(media);
228 
229 	ret = 0;
230 
231 	/*
232 	 * Iterate over all slices and report any errors.  We don't care about
233 	 * overlapping slices because we are using the whole disk.
234 	 */
235 	for (i = 0; slice[i] != NULL; i++) {
236 		char *name = dm_get_name(slice[i], &err);
237 
238 		if (check_slice(name, force, B_TRUE, isspare) != 0)
239 			ret = -1;
240 
241 		dm_free_name(name);
242 	}
243 
244 	dm_free_descriptors(slice);
245 	return (ret);
246 }
247 
248 /*
249  * Validate a device.
250  */
251 int
252 check_device(const char *path, boolean_t force, boolean_t isspare)
253 {
254 	dm_descriptor_t desc;
255 	int err;
256 	char *dev;
257 
258 	/*
259 	 * For whole disks, libdiskmgt does not include the leading dev path.
260 	 */
261 	dev = strrchr(path, '/');
262 	assert(dev != NULL);
263 	dev++;
264 	if ((desc = dm_get_descriptor_by_name(DM_ALIAS, dev, &err)) != NULL) {
265 		err = check_disk(path, desc, force, isspare);
266 		dm_free_descriptor(desc);
267 		return (err);
268 	}
269 
270 	return (check_slice(path, force, B_FALSE, isspare));
271 }
272 
273 /*
274  * Check that a file is valid.  All we can do in this case is check that it's
275  * not in use by another pool, and not in use by swap.
276  */
277 int
278 check_file(const char *file, boolean_t force, boolean_t isspare)
279 {
280 	char  *name;
281 	int fd;
282 	int ret = 0;
283 	int err;
284 	pool_state_t state;
285 	boolean_t inuse;
286 
287 	if (dm_inuse_swap(file, &err)) {
288 		if (err)
289 			libdiskmgt_error(err);
290 		else
291 			vdev_error(gettext("%s is currently used by swap. "
292 			    "Please see swap(1M).\n"), file);
293 		return (-1);
294 	}
295 
296 	if ((fd = open(file, O_RDONLY)) < 0)
297 		return (0);
298 
299 	if (zpool_in_use(g_zfs, fd, &state, &name, &inuse) == 0 && inuse) {
300 		const char *desc;
301 
302 		switch (state) {
303 		case POOL_STATE_ACTIVE:
304 			desc = gettext("active");
305 			break;
306 
307 		case POOL_STATE_EXPORTED:
308 			desc = gettext("exported");
309 			break;
310 
311 		case POOL_STATE_POTENTIALLY_ACTIVE:
312 			desc = gettext("potentially active");
313 			break;
314 
315 		default:
316 			desc = gettext("unknown");
317 			break;
318 		}
319 
320 		/*
321 		 * Allow hot spares to be shared between pools.
322 		 */
323 		if (state == POOL_STATE_SPARE && isspare)
324 			return (0);
325 
326 		if (state == POOL_STATE_ACTIVE ||
327 		    state == POOL_STATE_SPARE || !force) {
328 			switch (state) {
329 			case POOL_STATE_SPARE:
330 				vdev_error(gettext("%s is reserved as a hot "
331 				    "spare for pool %s\n"), file, name);
332 				break;
333 			default:
334 				vdev_error(gettext("%s is part of %s pool "
335 				    "'%s'\n"), file, desc, name);
336 				break;
337 			}
338 			ret = -1;
339 		}
340 
341 		free(name);
342 	}
343 
344 	(void) close(fd);
345 	return (ret);
346 }
347 
348 static boolean_t
349 is_whole_disk(const char *arg, struct stat64 *statbuf)
350 {
351 	char path[MAXPATHLEN];
352 
353 	(void) snprintf(path, sizeof (path), "%s%s", arg, BACKUP_SLICE);
354 	if (stat64(path, statbuf) == 0)
355 		return (B_TRUE);
356 
357 	return (B_FALSE);
358 }
359 
360 /*
361  * Create a leaf vdev.  Determine if this is a file or a device.  If it's a
362  * device, fill in the device id to make a complete nvlist.  Valid forms for a
363  * leaf vdev are:
364  *
365  * 	/dev/dsk/xxx	Complete disk path
366  * 	/xxx		Full path to file
367  * 	xxx		Shorthand for /dev/dsk/xxx
368  */
369 nvlist_t *
370 make_leaf_vdev(const char *arg)
371 {
372 	char path[MAXPATHLEN];
373 	struct stat64 statbuf;
374 	nvlist_t *vdev = NULL;
375 	char *type = NULL;
376 	boolean_t wholedisk = B_FALSE;
377 
378 	/*
379 	 * Determine what type of vdev this is, and put the full path into
380 	 * 'path'.  We detect whether this is a device of file afterwards by
381 	 * checking the st_mode of the file.
382 	 */
383 	if (arg[0] == '/') {
384 		/*
385 		 * Complete device or file path.  Exact type is determined by
386 		 * examining the file descriptor afterwards.
387 		 */
388 		if (is_whole_disk(arg, &statbuf)) {
389 			wholedisk = B_TRUE;
390 		} else if (stat64(arg, &statbuf) != 0) {
391 			(void) fprintf(stderr,
392 			    gettext("cannot open '%s': %s\n"),
393 			    arg, strerror(errno));
394 			return (NULL);
395 		}
396 
397 		(void) strlcpy(path, arg, sizeof (path));
398 	} else {
399 		/*
400 		 * This may be a short path for a device, or it could be total
401 		 * gibberish.  Check to see if it's a known device in
402 		 * /dev/dsk/.  As part of this check, see if we've been given a
403 		 * an entire disk (minus the slice number).
404 		 */
405 		(void) snprintf(path, sizeof (path), "%s/%s", DISK_ROOT,
406 		    arg);
407 		if (is_whole_disk(path, &statbuf)) {
408 			wholedisk = B_TRUE;
409 		} else if (stat64(path, &statbuf) != 0) {
410 			/*
411 			 * If we got ENOENT, then the user gave us
412 			 * gibberish, so try to direct them with a
413 			 * reasonable error message.  Otherwise,
414 			 * regurgitate strerror() since it's the best we
415 			 * can do.
416 			 */
417 			if (errno == ENOENT) {
418 				(void) fprintf(stderr,
419 				    gettext("cannot open '%s': no such "
420 				    "device in %s\n"), arg, DISK_ROOT);
421 				(void) fprintf(stderr,
422 				    gettext("must be a full path or "
423 				    "shorthand device name\n"));
424 				return (NULL);
425 			} else {
426 				(void) fprintf(stderr,
427 				    gettext("cannot open '%s': %s\n"),
428 				    path, strerror(errno));
429 				return (NULL);
430 			}
431 		}
432 	}
433 
434 	/*
435 	 * Determine whether this is a device or a file.
436 	 */
437 	if (S_ISBLK(statbuf.st_mode)) {
438 		type = VDEV_TYPE_DISK;
439 	} else if (S_ISREG(statbuf.st_mode)) {
440 		type = VDEV_TYPE_FILE;
441 	} else {
442 		(void) fprintf(stderr, gettext("cannot use '%s': must be a "
443 		    "block device or regular file\n"), path);
444 		return (NULL);
445 	}
446 
447 	/*
448 	 * Finally, we have the complete device or file, and we know that it is
449 	 * acceptable to use.  Construct the nvlist to describe this vdev.  All
450 	 * vdevs have a 'path' element, and devices also have a 'devid' element.
451 	 */
452 	verify(nvlist_alloc(&vdev, NV_UNIQUE_NAME, 0) == 0);
453 	verify(nvlist_add_string(vdev, ZPOOL_CONFIG_PATH, path) == 0);
454 	verify(nvlist_add_string(vdev, ZPOOL_CONFIG_TYPE, type) == 0);
455 	if (strcmp(type, VDEV_TYPE_DISK) == 0)
456 		verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK,
457 		    (uint64_t)wholedisk) == 0);
458 
459 	/*
460 	 * For a whole disk, defer getting its devid until after labeling it.
461 	 */
462 	if (S_ISBLK(statbuf.st_mode) && !wholedisk) {
463 		/*
464 		 * Get the devid for the device.
465 		 */
466 		int fd;
467 		ddi_devid_t devid;
468 		char *minor = NULL, *devid_str = NULL;
469 
470 		if ((fd = open(path, O_RDONLY)) < 0) {
471 			(void) fprintf(stderr, gettext("cannot open '%s': "
472 			    "%s\n"), path, strerror(errno));
473 			nvlist_free(vdev);
474 			return (NULL);
475 		}
476 
477 		if (devid_get(fd, &devid) == 0) {
478 			if (devid_get_minor_name(fd, &minor) == 0 &&
479 			    (devid_str = devid_str_encode(devid, minor)) !=
480 			    NULL) {
481 				verify(nvlist_add_string(vdev,
482 				    ZPOOL_CONFIG_DEVID, devid_str) == 0);
483 			}
484 			if (devid_str != NULL)
485 				devid_str_free(devid_str);
486 			if (minor != NULL)
487 				devid_str_free(minor);
488 			devid_free(devid);
489 		}
490 
491 		(void) close(fd);
492 	}
493 
494 	return (vdev);
495 }
496 
/*
 * Go through and verify the replication level of the pool is consistent.
 * Performs the following checks:
 *
 * 	For the new spec, verifies that devices in mirrors and raidz are the
 * 	same size.
 *
 * 	If the current configuration already has inconsistent replication
 * 	levels, ignore any other potential problems in the new spec.
 *
 * 	Otherwise, make sure that the current spec (if there is one) and the new
 * 	spec have consistent replication levels.
 *
 * A replication level is summarized by the structure below.
 */
struct replication_level {
	char *zprl_type;	/* vdev type string of the toplevel vdev */
	uint64_t zprl_children;	/* number of children; 1 for a leaf vdev */
	uint64_t zprl_parity;	/* raidz parity; 0 for anything else */
};

typedef struct replication_level replication_level_t;
515 
516 /*
517  * Given a list of toplevel vdevs, return the current replication level.  If
518  * the config is inconsistent, then NULL is returned.  If 'fatal' is set, then
519  * an error message will be displayed for each self-inconsistent vdev.
520  */
521 replication_level_t *
522 get_replication(nvlist_t *nvroot, boolean_t fatal)
523 {
524 	nvlist_t **top;
525 	uint_t t, toplevels;
526 	nvlist_t **child;
527 	uint_t c, children;
528 	nvlist_t *nv;
529 	char *type;
530 	replication_level_t lastrep, rep, *ret;
531 	boolean_t dontreport;
532 
533 	ret = safe_malloc(sizeof (replication_level_t));
534 
535 	verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
536 	    &top, &toplevels) == 0);
537 
538 	lastrep.zprl_type = NULL;
539 	for (t = 0; t < toplevels; t++) {
540 		nv = top[t];
541 
542 		verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0);
543 
544 		if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
545 		    &child, &children) != 0) {
546 			/*
547 			 * This is a 'file' or 'disk' vdev.
548 			 */
549 			rep.zprl_type = type;
550 			rep.zprl_children = 1;
551 			rep.zprl_parity = 0;
552 		} else {
553 			uint64_t vdev_size;
554 
555 			/*
556 			 * This is a mirror or RAID-Z vdev.  Go through and make
557 			 * sure the contents are all the same (files vs. disks),
558 			 * keeping track of the number of elements in the
559 			 * process.
560 			 *
561 			 * We also check that the size of each vdev (if it can
562 			 * be determined) is the same.
563 			 */
564 			rep.zprl_type = type;
565 			rep.zprl_children = 0;
566 
567 			if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) {
568 				verify(nvlist_lookup_uint64(nv,
569 				    ZPOOL_CONFIG_NPARITY,
570 				    &rep.zprl_parity) == 0);
571 				assert(rep.zprl_parity != 0);
572 			} else {
573 				rep.zprl_parity = 0;
574 			}
575 
576 			/*
577 			 * The 'dontreport' variable indicatest that we've
578 			 * already reported an error for this spec, so don't
579 			 * bother doing it again.
580 			 */
581 			type = NULL;
582 			dontreport = 0;
583 			vdev_size = -1ULL;
584 			for (c = 0; c < children; c++) {
585 				nvlist_t *cnv = child[c];
586 				char *path;
587 				struct stat64 statbuf;
588 				uint64_t size = -1ULL;
589 				char *childtype;
590 				int fd, err;
591 
592 				rep.zprl_children++;
593 
594 				verify(nvlist_lookup_string(cnv,
595 				    ZPOOL_CONFIG_TYPE, &childtype) == 0);
596 
597 				/*
598 				 * If this is a a replacing or spare vdev, then
599 				 * get the real first child of the vdev.
600 				 */
601 				if (strcmp(childtype,
602 				    VDEV_TYPE_REPLACING) == 0 ||
603 				    strcmp(childtype, VDEV_TYPE_SPARE) == 0) {
604 					nvlist_t **rchild;
605 					uint_t rchildren;
606 
607 					verify(nvlist_lookup_nvlist_array(cnv,
608 					    ZPOOL_CONFIG_CHILDREN, &rchild,
609 					    &rchildren) == 0);
610 					assert(rchildren == 2);
611 					cnv = rchild[0];
612 
613 					verify(nvlist_lookup_string(cnv,
614 					    ZPOOL_CONFIG_TYPE,
615 					    &childtype) == 0);
616 				}
617 
618 				verify(nvlist_lookup_string(cnv,
619 				    ZPOOL_CONFIG_PATH, &path) == 0);
620 
621 				/*
622 				 * If we have a raidz/mirror that combines disks
623 				 * with files, report it as an error.
624 				 */
625 				if (!dontreport && type != NULL &&
626 				    strcmp(type, childtype) != 0) {
627 					if (ret != NULL)
628 						free(ret);
629 					ret = NULL;
630 					if (fatal)
631 						vdev_error(gettext(
632 						    "mismatched replication "
633 						    "level: %s contains both "
634 						    "files and devices\n"),
635 						    rep.zprl_type);
636 					else
637 						return (NULL);
638 					dontreport = B_TRUE;
639 				}
640 
641 				/*
642 				 * According to stat(2), the value of 'st_size'
643 				 * is undefined for block devices and character
644 				 * devices.  But there is no effective way to
645 				 * determine the real size in userland.
646 				 *
647 				 * Instead, we'll take advantage of an
648 				 * implementation detail of spec_size().  If the
649 				 * device is currently open, then we (should)
650 				 * return a valid size.
651 				 *
652 				 * If we still don't get a valid size (indicated
653 				 * by a size of 0 or MAXOFFSET_T), then ignore
654 				 * this device altogether.
655 				 */
656 				if ((fd = open(path, O_RDONLY)) >= 0) {
657 					err = fstat64(fd, &statbuf);
658 					(void) close(fd);
659 				} else {
660 					err = stat64(path, &statbuf);
661 				}
662 
663 				if (err != 0 ||
664 				    statbuf.st_size == 0 ||
665 				    statbuf.st_size == MAXOFFSET_T)
666 					continue;
667 
668 				size = statbuf.st_size;
669 
670 				/*
671 				 * Also check the size of each device.  If they
672 				 * differ, then report an error.
673 				 */
674 				if (!dontreport && vdev_size != -1ULL &&
675 				    size != vdev_size) {
676 					if (ret != NULL)
677 						free(ret);
678 					ret = NULL;
679 					if (fatal)
680 						vdev_error(gettext(
681 						    "%s contains devices of "
682 						    "different sizes\n"),
683 						    rep.zprl_type);
684 					else
685 						return (NULL);
686 					dontreport = B_TRUE;
687 				}
688 
689 				type = childtype;
690 				vdev_size = size;
691 			}
692 		}
693 
694 		/*
695 		 * At this point, we have the replication of the last toplevel
696 		 * vdev in 'rep'.  Compare it to 'lastrep' to see if its
697 		 * different.
698 		 */
699 		if (lastrep.zprl_type != NULL) {
700 			if (strcmp(lastrep.zprl_type, rep.zprl_type) != 0) {
701 				if (ret != NULL)
702 					free(ret);
703 				ret = NULL;
704 				if (fatal)
705 					vdev_error(gettext(
706 					    "mismatched replication level: "
707 					    "both %s and %s vdevs are "
708 					    "present\n"),
709 					    lastrep.zprl_type, rep.zprl_type);
710 				else
711 					return (NULL);
712 			} else if (lastrep.zprl_parity != rep.zprl_parity) {
713 				if (ret)
714 					free(ret);
715 				ret = NULL;
716 				if (fatal)
717 					vdev_error(gettext(
718 					    "mismatched replication level: "
719 					    "both %llu and %llu device parity "
720 					    "%s vdevs are present\n"),
721 					    lastrep.zprl_parity,
722 					    rep.zprl_parity,
723 					    rep.zprl_type);
724 				else
725 					return (NULL);
726 			} else if (lastrep.zprl_children != rep.zprl_children) {
727 				if (ret)
728 					free(ret);
729 				ret = NULL;
730 				if (fatal)
731 					vdev_error(gettext(
732 					    "mismatched replication level: "
733 					    "both %llu-way and %llu-way %s "
734 					    "vdevs are present\n"),
735 					    lastrep.zprl_children,
736 					    rep.zprl_children,
737 					    rep.zprl_type);
738 				else
739 					return (NULL);
740 			}
741 		}
742 		lastrep = rep;
743 	}
744 
745 	if (ret != NULL)
746 		*ret = rep;
747 
748 	return (ret);
749 }
750 
751 /*
752  * Check the replication level of the vdev spec against the current pool.  Calls
753  * get_replication() to make sure the new spec is self-consistent.  If the pool
754  * has a consistent replication level, then we ignore any errors.  Otherwise,
755  * report any difference between the two.
756  */
757 int
758 check_replication(nvlist_t *config, nvlist_t *newroot)
759 {
760 	replication_level_t *current = NULL, *new;
761 	int ret;
762 
763 	/*
764 	 * If we have a current pool configuration, check to see if it's
765 	 * self-consistent.  If not, simply return success.
766 	 */
767 	if (config != NULL) {
768 		nvlist_t *nvroot;
769 
770 		verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
771 		    &nvroot) == 0);
772 		if ((current = get_replication(nvroot, B_FALSE)) == NULL)
773 			return (0);
774 	}
775 
776 	/*
777 	 * Get the replication level of the new vdev spec, reporting any
778 	 * inconsistencies found.
779 	 */
780 	if ((new = get_replication(newroot, B_TRUE)) == NULL) {
781 		free(current);
782 		return (-1);
783 	}
784 
785 	/*
786 	 * Check to see if the new vdev spec matches the replication level of
787 	 * the current pool.
788 	 */
789 	ret = 0;
790 	if (current != NULL) {
791 		if (strcmp(current->zprl_type, new->zprl_type) != 0) {
792 			vdev_error(gettext(
793 			    "mismatched replication level: pool uses %s "
794 			    "and new vdev is %s\n"),
795 			    current->zprl_type, new->zprl_type);
796 			ret = -1;
797 		} else if (current->zprl_parity != new->zprl_parity) {
798 			vdev_error(gettext(
799 			    "mismatched replication level: pool uses %llu "
800 			    "device parity and new vdev uses %llu\n"),
801 			    current->zprl_parity, new->zprl_parity);
802 			ret = -1;
803 		} else if (current->zprl_children != new->zprl_children) {
804 			vdev_error(gettext(
805 			    "mismatched replication level: pool uses %llu-way "
806 			    "%s and new vdev uses %llu-way %s\n"),
807 			    current->zprl_children, current->zprl_type,
808 			    new->zprl_children, new->zprl_type);
809 			ret = -1;
810 		}
811 	}
812 
813 	free(new);
814 	if (current != NULL)
815 		free(current);
816 
817 	return (ret);
818 }
819 
820 /*
821  * Label an individual disk.  The name provided is the short name, stripped of
822  * any leading /dev path.
823  */
824 int
825 label_disk(char *name)
826 {
827 	char path[MAXPATHLEN];
828 	struct dk_gpt *vtoc;
829 	int fd;
830 	size_t resv = 16384;
831 
832 	(void) snprintf(path, sizeof (path), "%s/%s%s", RDISK_ROOT, name,
833 	    BACKUP_SLICE);
834 
835 	if ((fd = open(path, O_RDWR | O_NDELAY)) < 0) {
836 		/*
837 		 * This shouldn't happen.  We've long since verified that this
838 		 * is a valid device.
839 		 */
840 		(void) fprintf(stderr, gettext("cannot open '%s': %s\n"),
841 		    path, strerror(errno));
842 		return (-1);
843 	}
844 
845 
846 	if (efi_alloc_and_init(fd, 9, &vtoc) != 0) {
847 		/*
848 		 * The only way this can fail is if we run out of memory, or we
849 		 * were unable to read the disk geometry.
850 		 */
851 		if (errno == ENOMEM)
852 			zpool_no_memory();
853 
854 		(void) fprintf(stderr, gettext("cannot label '%s': unable to "
855 		    "read disk geometry\n"), name);
856 		(void) close(fd);
857 		return (-1);
858 	}
859 
860 	vtoc->efi_parts[0].p_start = vtoc->efi_first_u_lba;
861 	vtoc->efi_parts[0].p_size = vtoc->efi_last_u_lba + 1 -
862 	    vtoc->efi_first_u_lba - resv;
863 
864 	/*
865 	 * Why we use V_USR: V_BACKUP confuses users, and is considered
866 	 * disposable by some EFI utilities (since EFI doesn't have a backup
867 	 * slice).  V_UNASSIGNED is supposed to be used only for zero size
868 	 * partitions, and efi_write() will fail if we use it.  V_ROOT, V_BOOT,
869 	 * etc. were all pretty specific.  V_USR is as close to reality as we
870 	 * can get, in the absence of V_OTHER.
871 	 */
872 	vtoc->efi_parts[0].p_tag = V_USR;
873 	(void) strcpy(vtoc->efi_parts[0].p_name, "zfs");
874 
875 	vtoc->efi_parts[8].p_start = vtoc->efi_last_u_lba + 1 - resv;
876 	vtoc->efi_parts[8].p_size = resv;
877 	vtoc->efi_parts[8].p_tag = V_RESERVED;
878 
879 	if (efi_write(fd, vtoc) != 0) {
880 		/*
881 		 * Currently, EFI labels are not supported for IDE disks, and it
882 		 * is likely that they will not be supported on other drives for
883 		 * some time.  Print out a helpful error message directing the
884 		 * user to manually label the disk and give a specific slice.
885 		 */
886 		(void) fprintf(stderr, gettext("cannot label '%s': failed to "
887 		    "write EFI label\n"), name);
888 		(void) fprintf(stderr, gettext("use fdisk(1M) to partition "
889 		    "the disk, and provide a specific slice\n"));
890 		(void) close(fd);
891 		efi_free(vtoc);
892 		return (-1);
893 	}
894 
895 	(void) close(fd);
896 	efi_free(vtoc);
897 	return (0);
898 }
899 
900 /*
901  * Go through and find any whole disks in the vdev specification, labelling them
902  * as appropriate.  When constructing the vdev spec, we were unable to open this
903  * device in order to provide a devid.  Now that we have labelled the disk and
904  * know that slice 0 is valid, we can construct the devid now.
905  *
906  * If the disk was already labelled with an EFI label, we will have gotten the
907  * devid already (because we were able to open the whole disk).  Otherwise, we
908  * need to get the devid after we label the disk.
909  */
910 int
911 make_disks(nvlist_t *nv)
912 {
913 	nvlist_t **child;
914 	uint_t c, children;
915 	char *type, *path, *diskname;
916 	char buf[MAXPATHLEN];
917 	uint64_t wholedisk;
918 	int fd;
919 	int ret;
920 	ddi_devid_t devid;
921 	char *minor = NULL, *devid_str = NULL;
922 
923 	verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0);
924 
925 	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
926 	    &child, &children) != 0) {
927 
928 		if (strcmp(type, VDEV_TYPE_DISK) != 0)
929 			return (0);
930 
931 		/*
932 		 * We have a disk device.  Get the path to the device
933 		 * and see if its a whole disk by appending the backup
934 		 * slice and stat()ing the device.
935 		 */
936 		verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0);
937 
938 		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
939 		    &wholedisk) != 0 || !wholedisk)
940 			return (0);
941 
942 		diskname = strrchr(path, '/');
943 		assert(diskname != NULL);
944 		diskname++;
945 		if (label_disk(diskname) != 0)
946 			return (-1);
947 
948 		/*
949 		 * Fill in the devid, now that we've labeled the disk.
950 		 */
951 		(void) snprintf(buf, sizeof (buf), "%ss0", path);
952 		if ((fd = open(buf, O_RDONLY)) < 0) {
953 			(void) fprintf(stderr,
954 			    gettext("cannot open '%s': %s\n"),
955 			    buf, strerror(errno));
956 			return (-1);
957 		}
958 
959 		if (devid_get(fd, &devid) == 0) {
960 			if (devid_get_minor_name(fd, &minor) == 0 &&
961 			    (devid_str = devid_str_encode(devid, minor)) !=
962 			    NULL) {
963 				verify(nvlist_add_string(nv,
964 				    ZPOOL_CONFIG_DEVID, devid_str) == 0);
965 			}
966 			if (devid_str != NULL)
967 				devid_str_free(devid_str);
968 			if (minor != NULL)
969 				devid_str_free(minor);
970 			devid_free(devid);
971 		}
972 
973 		/*
974 		 * Update the path to refer to the 's0' slice.  The presence of
975 		 * the 'whole_disk' field indicates to the CLI that we should
976 		 * chop off the slice number when displaying the device in
977 		 * future output.
978 		 */
979 		verify(nvlist_add_string(nv, ZPOOL_CONFIG_PATH, buf) == 0);
980 
981 		(void) close(fd);
982 
983 		return (0);
984 	}
985 
986 	for (c = 0; c < children; c++)
987 		if ((ret = make_disks(child[c])) != 0)
988 			return (ret);
989 
990 	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES,
991 	    &child, &children) == 0)
992 		for (c = 0; c < children; c++)
993 			if ((ret = make_disks(child[c])) != 0)
994 				return (ret);
995 
996 	return (0);
997 }
998 
999 /*
1000  * Determine if the given path is a hot spare within the given configuration.
1001  */
1002 static boolean_t
1003 is_spare(nvlist_t *config, const char *path)
1004 {
1005 	int fd;
1006 	pool_state_t state;
1007 	char *name = NULL;
1008 	nvlist_t *label;
1009 	uint64_t guid, spareguid;
1010 	nvlist_t *nvroot;
1011 	nvlist_t **spares;
1012 	uint_t i, nspares;
1013 	boolean_t inuse;
1014 
1015 	if ((fd = open(path, O_RDONLY)) < 0)
1016 		return (B_FALSE);
1017 
1018 	if (zpool_in_use(g_zfs, fd, &state, &name, &inuse) != 0 ||
1019 	    !inuse ||
1020 	    state != POOL_STATE_SPARE ||
1021 	    zpool_read_label(fd, &label) != 0) {
1022 		free(name);
1023 		(void) close(fd);
1024 		return (B_FALSE);
1025 	}
1026 	free(name);
1027 
1028 	(void) close(fd);
1029 	verify(nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) == 0);
1030 	nvlist_free(label);
1031 
1032 	verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
1033 	    &nvroot) == 0);
1034 	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
1035 	    &spares, &nspares) == 0) {
1036 		for (i = 0; i < nspares; i++) {
1037 			verify(nvlist_lookup_uint64(spares[i],
1038 			    ZPOOL_CONFIG_GUID, &spareguid) == 0);
1039 			if (spareguid == guid)
1040 				return (B_TRUE);
1041 		}
1042 	}
1043 
1044 	return (B_FALSE);
1045 }
1046 
1047 /*
1048  * Go through and find any devices that are in use.  We rely on libdiskmgt for
1049  * the majority of this task.
1050  */
1051 int
1052 check_in_use(nvlist_t *config, nvlist_t *nv, int force, int isreplacing,
1053     int isspare)
1054 {
1055 	nvlist_t **child;
1056 	uint_t c, children;
1057 	char *type, *path;
1058 	int ret;
1059 	char buf[MAXPATHLEN];
1060 	uint64_t wholedisk;
1061 
1062 	verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0);
1063 
1064 	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
1065 	    &child, &children) != 0) {
1066 
1067 		verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0);
1068 
1069 		/*
1070 		 * As a generic check, we look to see if this is a replace of a
1071 		 * hot spare within the same pool.  If so, we allow it
1072 		 * regardless of what libdiskmgt or zpool_in_use() says.
1073 		 */
1074 		if (isreplacing) {
1075 			if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
1076 			    &wholedisk) == 0 && wholedisk)
1077 				(void) snprintf(buf, sizeof (buf), "%ss0",
1078 				    path);
1079 			else
1080 				(void) strlcpy(buf, path, sizeof (buf));
1081 			if (is_spare(config, buf))
1082 				return (0);
1083 		}
1084 
1085 		if (strcmp(type, VDEV_TYPE_DISK) == 0)
1086 			ret = check_device(path, force, isspare);
1087 
1088 		if (strcmp(type, VDEV_TYPE_FILE) == 0)
1089 			ret = check_file(path, force, isspare);
1090 
1091 		return (ret);
1092 	}
1093 
1094 	for (c = 0; c < children; c++)
1095 		if ((ret = check_in_use(config, child[c], force,
1096 		    isreplacing, B_FALSE)) != 0)
1097 			return (ret);
1098 
1099 	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES,
1100 	    &child, &children) == 0)
1101 		for (c = 0; c < children; c++)
1102 			if ((ret = check_in_use(config, child[c], force,
1103 			    isreplacing, B_TRUE)) != 0)
1104 				return (ret);
1105 
1106 	return (0);
1107 }
1108 
1109 const char *
1110 is_grouping(const char *type, int *mindev)
1111 {
1112 	if (strcmp(type, "raidz") == 0 || strcmp(type, "raidz1") == 0) {
1113 		if (mindev != NULL)
1114 			*mindev = 2;
1115 		return (VDEV_TYPE_RAIDZ);
1116 	}
1117 
1118 	if (strcmp(type, "raidz2") == 0) {
1119 		if (mindev != NULL)
1120 			*mindev = 3;
1121 		return (VDEV_TYPE_RAIDZ);
1122 	}
1123 
1124 	if (strcmp(type, "mirror") == 0) {
1125 		if (mindev != NULL)
1126 			*mindev = 2;
1127 		return (VDEV_TYPE_MIRROR);
1128 	}
1129 
1130 	if (strcmp(type, "spare") == 0) {
1131 		if (mindev != NULL)
1132 			*mindev = 1;
1133 		return (VDEV_TYPE_SPARE);
1134 	}
1135 
1136 	return (NULL);
1137 }
1138 
1139 /*
1140  * Construct a syntactically valid vdev specification,
1141  * and ensure that all devices and files exist and can be opened.
1142  * Note: we don't bother freeing anything in the error paths
1143  * because the program is just going to exit anyway.
1144  */
1145 nvlist_t *
1146 construct_spec(int argc, char **argv)
1147 {
1148 	nvlist_t *nvroot, *nv, **top, **spares;
1149 	int t, toplevels, mindev, nspares;
1150 	const char *type;
1151 
1152 	top = NULL;
1153 	toplevels = 0;
1154 	spares = NULL;
1155 	nspares = 0;
1156 
1157 	while (argc > 0) {
1158 		nv = NULL;
1159 
1160 		/*
1161 		 * If it's a mirror or raidz, the subsequent arguments are
1162 		 * its leaves -- until we encounter the next mirror or raidz.
1163 		 */
1164 		if ((type = is_grouping(argv[0], &mindev)) != NULL) {
1165 			nvlist_t **child = NULL;
1166 			int c, children = 0;
1167 
1168 			if (strcmp(type, VDEV_TYPE_SPARE) == 0 &&
1169 			    spares != NULL) {
1170 				(void) fprintf(stderr, gettext("invalid vdev "
1171 				    "specification: 'spare' can be "
1172 				    "specified only once\n"));
1173 				return (NULL);
1174 			}
1175 
1176 			for (c = 1; c < argc; c++) {
1177 				if (is_grouping(argv[c], NULL) != NULL)
1178 					break;
1179 				children++;
1180 				child = realloc(child,
1181 				    children * sizeof (nvlist_t *));
1182 				if (child == NULL)
1183 					zpool_no_memory();
1184 				if ((nv = make_leaf_vdev(argv[c])) == NULL)
1185 					return (NULL);
1186 				child[children - 1] = nv;
1187 			}
1188 
1189 			if (children < mindev) {
1190 				(void) fprintf(stderr, gettext("invalid vdev "
1191 				    "specification: %s requires at least %d "
1192 				    "devices\n"), argv[0], mindev);
1193 				return (NULL);
1194 			}
1195 
1196 			argc -= c;
1197 			argv += c;
1198 
1199 			if (strcmp(type, VDEV_TYPE_SPARE) == 0) {
1200 				spares = child;
1201 				nspares = children;
1202 				continue;
1203 			} else {
1204 				verify(nvlist_alloc(&nv, NV_UNIQUE_NAME,
1205 				    0) == 0);
1206 				verify(nvlist_add_string(nv, ZPOOL_CONFIG_TYPE,
1207 				    type) == 0);
1208 				if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) {
1209 					verify(nvlist_add_uint64(nv,
1210 					    ZPOOL_CONFIG_NPARITY,
1211 					    mindev - 1) == 0);
1212 				}
1213 				verify(nvlist_add_nvlist_array(nv,
1214 				    ZPOOL_CONFIG_CHILDREN, child,
1215 				    children) == 0);
1216 
1217 				for (c = 0; c < children; c++)
1218 					nvlist_free(child[c]);
1219 				free(child);
1220 			}
1221 		} else {
1222 			/*
1223 			 * We have a device.  Pass off to make_leaf_vdev() to
1224 			 * construct the appropriate nvlist describing the vdev.
1225 			 */
1226 			if ((nv = make_leaf_vdev(argv[0])) == NULL)
1227 				return (NULL);
1228 			argc--;
1229 			argv++;
1230 		}
1231 
1232 		toplevels++;
1233 		top = realloc(top, toplevels * sizeof (nvlist_t *));
1234 		if (top == NULL)
1235 			zpool_no_memory();
1236 		top[toplevels - 1] = nv;
1237 	}
1238 
1239 	if (toplevels == 0 && nspares == 0) {
1240 		(void) fprintf(stderr, gettext("invalid vdev "
1241 		    "specification: at least one toplevel vdev must be "
1242 		    "specified\n"));
1243 		return (NULL);
1244 	}
1245 
1246 	/*
1247 	 * Finally, create nvroot and add all top-level vdevs to it.
1248 	 */
1249 	verify(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) == 0);
1250 	verify(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,
1251 	    VDEV_TYPE_ROOT) == 0);
1252 	verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
1253 	    top, toplevels) == 0);
1254 	if (nspares != 0)
1255 		verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
1256 		    spares, nspares) == 0);
1257 
1258 	for (t = 0; t < toplevels; t++)
1259 		nvlist_free(top[t]);
1260 	for (t = 0; t < nspares; t++)
1261 		nvlist_free(spares[t]);
1262 	if (spares)
1263 		free(spares);
1264 	free(top);
1265 
1266 	return (nvroot);
1267 }
1268 
1269 /*
1270  * Get and validate the contents of the given vdev specification.  This ensures
1271  * that the nvlist returned is well-formed, that all the devices exist, and that
1272  * they are not currently in use by any other known consumer.  The 'poolconfig'
1273  * parameter is the current configuration of the pool when adding devices
1274  * existing pool, and is used to perform additional checks, such as changing the
1275  * replication level of the pool.  It can be 'NULL' to indicate that this is a
1276  * new pool.  The 'force' flag controls whether devices should be forcefully
1277  * added, even if they appear in use.
1278  */
1279 nvlist_t *
1280 make_root_vdev(nvlist_t *poolconfig, int force, int check_rep,
1281     boolean_t isreplacing, int argc, char **argv)
1282 {
1283 	nvlist_t *newroot;
1284 
1285 	is_force = force;
1286 
1287 	/*
1288 	 * Construct the vdev specification.  If this is successful, we know
1289 	 * that we have a valid specification, and that all devices can be
1290 	 * opened.
1291 	 */
1292 	if ((newroot = construct_spec(argc, argv)) == NULL)
1293 		return (NULL);
1294 
1295 	/*
1296 	 * Validate each device to make sure that its not shared with another
1297 	 * subsystem.  We do this even if 'force' is set, because there are some
1298 	 * uses (such as a dedicated dump device) that even '-f' cannot
1299 	 * override.
1300 	 */
1301 	if (check_in_use(poolconfig, newroot, force, isreplacing,
1302 	    B_FALSE) != 0) {
1303 		nvlist_free(newroot);
1304 		return (NULL);
1305 	}
1306 
1307 	/*
1308 	 * Check the replication level of the given vdevs and report any errors
1309 	 * found.  We include the existing pool spec, if any, as we need to
1310 	 * catch changes against the existing replication level.
1311 	 */
1312 	if (check_rep && check_replication(poolconfig, newroot) != 0) {
1313 		nvlist_free(newroot);
1314 		return (NULL);
1315 	}
1316 
1317 	/*
1318 	 * Run through the vdev specification and label any whole disks found.
1319 	 */
1320 	if (make_disks(newroot) != 0) {
1321 		nvlist_free(newroot);
1322 		return (NULL);
1323 	}
1324 
1325 	return (newroot);
1326 }
1327