1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 24 * Copyright (c) 2013, 2018 by Delphix. All rights reserved. 25 * Copyright (c) 2016, 2017 Intel Corporation. 26 * Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>. 27 */ 28 29 /* 30 * Functions to convert between a list of vdevs and an nvlist representing the 31 * configuration. Each entry in the list can be one of: 32 * 33 * Device vdevs 34 * disk=(path=..., devid=...) 35 * file=(path=...) 36 * 37 * Group vdevs 38 * raidz[1|2]=(...) 39 * mirror=(...) 40 * 41 * Hot spares 42 * 43 * While the underlying implementation supports it, group vdevs cannot contain 44 * other group vdevs. All userland verification of devices is contained within 45 * this file. If successful, the nvlist returned can be passed directly to the 46 * kernel; we've done as much verification as possible in userland. 47 * 48 * Hot spares are a special case, and passed down as an array of disk vdevs, at 49 * the same level as the root of the vdev tree. 50 * 51 * The only function exported by this file is 'make_root_vdev'. 
The 52 * function performs several passes: 53 * 54 * 1. Construct the vdev specification. Performs syntax validation and 55 * makes sure each device is valid. 56 * 2. Check for devices in use. Using libdiskmgt, makes sure that no 57 * devices are also in use. Some can be overridden using the 'force' 58 * flag, others cannot. 59 * 3. Check for replication errors if the 'force' flag is not specified. 60 * validates that the replication level is consistent across the 61 * entire pool. 62 * 4. Call libzfs to label any whole disks with an EFI label. 63 */ 64 65 #include <assert.h> 66 #include <devid.h> 67 #include <errno.h> 68 #include <fcntl.h> 69 #include <libdiskmgt.h> 70 #include <libintl.h> 71 #include <libnvpair.h> 72 #include <libzutil.h> 73 #include <limits.h> 74 #include <sys/spa.h> 75 #include <stdio.h> 76 #include <string.h> 77 #include <unistd.h> 78 #include <sys/efi_partition.h> 79 #include <sys/stat.h> 80 #include <sys/vtoc.h> 81 #include <sys/mntent.h> 82 83 #include "zpool_util.h" 84 85 #define BACKUP_SLICE "s2" 86 87 /* 88 * For any given vdev specification, we can have multiple errors. The 89 * vdev_error() function keeps track of whether we have seen an error yet, and 90 * prints out a header if its the first error we've seen. 91 */ 92 boolean_t error_seen; 93 boolean_t is_force; 94 95 /*PRINTFLIKE1*/ 96 static void 97 vdev_error(const char *fmt, ...) 98 { 99 va_list ap; 100 101 if (!error_seen) { 102 (void) fprintf(stderr, gettext("invalid vdev specification\n")); 103 if (!is_force) 104 (void) fprintf(stderr, gettext("use '-f' to override " 105 "the following errors:\n")); 106 else 107 (void) fprintf(stderr, gettext("the following errors " 108 "must be manually repaired:\n")); 109 error_seen = B_TRUE; 110 } 111 112 va_start(ap, fmt); 113 (void) vfprintf(stderr, fmt, ap); 114 va_end(ap); 115 } 116 117 static void 118 libdiskmgt_error(int error) 119 { 120 /* 121 * ENXIO/ENODEV is a valid error message if the device doesn't live in 122 * /dev/dsk. 
Don't bother printing an error message in this case. 123 */ 124 if (error == ENXIO || error == ENODEV) 125 return; 126 127 (void) fprintf(stderr, gettext("warning: device in use checking " 128 "failed: %s\n"), strerror(error)); 129 } 130 131 /* 132 * Validate a device, passing the bulk of the work off to libdiskmgt. 133 */ 134 static int 135 check_slice(const char *path, int force, boolean_t wholedisk, boolean_t isspare) 136 { 137 char *msg; 138 int error = 0; 139 dm_who_type_t who; 140 141 if (force) 142 who = DM_WHO_ZPOOL_FORCE; 143 else if (isspare) 144 who = DM_WHO_ZPOOL_SPARE; 145 else 146 who = DM_WHO_ZPOOL; 147 148 if (dm_inuse((char *)path, &msg, who, &error) || error) { 149 if (error != 0) { 150 libdiskmgt_error(error); 151 return (0); 152 } else { 153 vdev_error("%s", msg); 154 free(msg); 155 return (-1); 156 } 157 } 158 159 /* 160 * If we're given a whole disk, ignore overlapping slices since we're 161 * about to label it anyway. 162 */ 163 error = 0; 164 if (!wholedisk && !force && 165 (dm_isoverlapping((char *)path, &msg, &error) || error)) { 166 if (error == 0) { 167 /* dm_isoverlapping returned -1 */ 168 vdev_error(gettext("%s overlaps with %s\n"), path, msg); 169 free(msg); 170 return (-1); 171 } else if (error != ENODEV) { 172 /* libdiskmgt's devcache only handles physical drives */ 173 libdiskmgt_error(error); 174 return (0); 175 } 176 } 177 178 return (0); 179 } 180 181 182 /* 183 * Validate a whole disk. Iterate over all slices on the disk and make sure 184 * that none is in use by calling check_slice(). 185 */ 186 static int 187 check_disk(const char *name, dm_descriptor_t disk, int force, int isspare) 188 { 189 dm_descriptor_t *drive, *media, *slice; 190 int err = 0; 191 int i; 192 int ret; 193 194 /* 195 * Get the drive associated with this disk. This should never fail, 196 * because we already have an alias handle open for the device. 
197 */ 198 if ((drive = dm_get_associated_descriptors(disk, DM_DRIVE, 199 &err)) == NULL || *drive == 0) { 200 if (err) 201 libdiskmgt_error(err); 202 return (0); 203 } 204 205 if ((media = dm_get_associated_descriptors(*drive, DM_MEDIA, 206 &err)) == NULL) { 207 dm_free_descriptors(drive); 208 if (err) 209 libdiskmgt_error(err); 210 return (0); 211 } 212 213 dm_free_descriptors(drive); 214 215 /* 216 * It is possible that the user has specified a removable media drive, 217 * and the media is not present. 218 */ 219 if (*media == 0) { 220 dm_free_descriptors(media); 221 vdev_error(gettext("'%s' has no media in drive\n"), name); 222 return (-1); 223 } 224 225 if ((slice = dm_get_associated_descriptors(*media, DM_SLICE, 226 &err)) == NULL) { 227 dm_free_descriptors(media); 228 if (err) 229 libdiskmgt_error(err); 230 return (0); 231 } 232 233 dm_free_descriptors(media); 234 235 ret = 0; 236 237 /* 238 * Iterate over all slices and report any errors. We don't care about 239 * overlapping slices because we are using the whole disk. 240 */ 241 for (i = 0; slice[i] != 0; i++) { 242 char *name = dm_get_name(slice[i], &err); 243 244 if (check_slice(name, force, B_TRUE, isspare) != 0) 245 ret = -1; 246 247 dm_free_name(name); 248 } 249 250 dm_free_descriptors(slice); 251 return (ret); 252 } 253 254 /* 255 * Validate a device. 256 */ 257 static int 258 check_device(const char *path, boolean_t force, boolean_t isspare) 259 { 260 dm_descriptor_t desc; 261 int err; 262 char *dev; 263 264 /* 265 * For whole disks, libdiskmgt does not include the leading dev path. 266 */ 267 dev = strrchr(path, '/'); 268 assert(dev != NULL); 269 dev++; 270 if ((desc = dm_get_descriptor_by_name(DM_ALIAS, dev, &err)) != 0) { 271 err = check_disk(path, desc, force, isspare); 272 dm_free_descriptor(desc); 273 return (err); 274 } 275 276 return (check_slice(path, force, B_FALSE, isspare)); 277 } 278 279 /* 280 * Check that a file is valid. 
All we can do in this case is check that it's 281 * not in use by another pool, and not in use by swap. 282 */ 283 static int 284 check_file(const char *file, boolean_t force, boolean_t isspare) 285 { 286 char *name; 287 int fd; 288 int ret = 0; 289 int err; 290 pool_state_t state; 291 boolean_t inuse; 292 293 if (dm_inuse_swap(file, &err)) { 294 if (err) 295 libdiskmgt_error(err); 296 else 297 vdev_error(gettext("%s is currently used by swap. " 298 "Please see swap(8).\n"), file); 299 return (-1); 300 } 301 302 if ((fd = open(file, O_RDONLY)) < 0) 303 return (0); 304 305 if (zpool_in_use(g_zfs, fd, &state, &name, &inuse) == 0 && inuse) { 306 const char *desc; 307 308 switch (state) { 309 case POOL_STATE_ACTIVE: 310 desc = gettext("active"); 311 break; 312 313 case POOL_STATE_EXPORTED: 314 desc = gettext("exported"); 315 break; 316 317 case POOL_STATE_POTENTIALLY_ACTIVE: 318 desc = gettext("potentially active"); 319 break; 320 321 default: 322 desc = gettext("unknown"); 323 break; 324 } 325 326 /* 327 * Allow hot spares to be shared between pools. 328 */ 329 if (state == POOL_STATE_SPARE && isspare) 330 return (0); 331 332 if (state == POOL_STATE_ACTIVE || 333 state == POOL_STATE_SPARE || !force) { 334 switch (state) { 335 case POOL_STATE_SPARE: 336 vdev_error(gettext("%s is reserved as a hot " 337 "spare for pool %s\n"), file, name); 338 break; 339 default: 340 vdev_error(gettext("%s is part of %s pool " 341 "'%s'\n"), file, desc, name); 342 break; 343 } 344 ret = -1; 345 } 346 347 free(name); 348 } 349 350 (void) close(fd); 351 return (ret); 352 } 353 354 355 /* 356 * By "whole disk" we mean an entire physical disk (something we can 357 * label, toggle the write cache on, etc.) as opposed to the full 358 * capacity of a pseudo-device such as lofi or did. We act as if we 359 * are labeling the disk, which should be a pretty good test of whether 360 * it's a viable device or not. Returns B_TRUE if it is and B_FALSE if 361 * it isn't. 
362 */ 363 static boolean_t 364 is_whole_disk(const char *arg) 365 { 366 struct dk_gpt *label; 367 int fd; 368 char path[MAXPATHLEN]; 369 370 (void) snprintf(path, sizeof (path), "%s%s%s", 371 ZFS_RDISK_ROOT, strrchr(arg, '/'), BACKUP_SLICE); 372 if ((fd = open(path, O_RDWR | O_NDELAY)) < 0) 373 return (B_FALSE); 374 if (efi_alloc_and_init(fd, EFI_NUMPAR, &label) != 0) { 375 (void) close(fd); 376 return (B_FALSE); 377 } 378 efi_free(label); 379 (void) close(fd); 380 return (B_TRUE); 381 } 382 383 /* 384 * Create a leaf vdev. Determine if this is a file or a device. If it's a 385 * device, fill in the device id to make a complete nvlist. Valid forms for a 386 * leaf vdev are: 387 * 388 * /dev/dsk/xxx Complete disk path 389 * /xxx Full path to file 390 * xxx Shorthand for /dev/dsk/xxx 391 */ 392 static nvlist_t * 393 make_leaf_vdev(nvlist_t *props, const char *arg, uint64_t is_log) 394 { 395 char path[MAXPATHLEN]; 396 struct stat64 statbuf; 397 nvlist_t *vdev = NULL; 398 char *type = NULL; 399 boolean_t wholedisk = B_FALSE; 400 uint64_t ashift = 0; 401 402 /* 403 * Determine what type of vdev this is, and put the full path into 404 * 'path'. We detect whether this is a device of file afterwards by 405 * checking the st_mode of the file. 406 */ 407 if (arg[0] == '/') { 408 /* 409 * Complete device or file path. Exact type is determined by 410 * examining the file descriptor afterwards. 411 */ 412 wholedisk = is_whole_disk(arg); 413 if (!wholedisk && (stat64(arg, &statbuf) != 0)) { 414 (void) fprintf(stderr, 415 gettext("cannot open '%s': %s\n"), 416 arg, strerror(errno)); 417 return (NULL); 418 } 419 420 (void) strlcpy(path, arg, sizeof (path)); 421 } else { 422 /* 423 * This may be a short path for a device, or it could be total 424 * gibberish. Check to see if it's a known device in 425 * /dev/dsk/. As part of this check, see if we've been given a 426 * an entire disk (minus the slice number). 
427 */ 428 (void) snprintf(path, sizeof (path), "%s/%s", ZFS_DISK_ROOT, 429 arg); 430 wholedisk = is_whole_disk(path); 431 if (!wholedisk && (stat64(path, &statbuf) != 0)) { 432 /* 433 * If we got ENOENT, then the user gave us 434 * gibberish, so try to direct them with a 435 * reasonable error message. Otherwise, 436 * regurgitate strerror() since it's the best we 437 * can do. 438 */ 439 if (errno == ENOENT) { 440 (void) fprintf(stderr, 441 gettext("cannot open '%s': no such " 442 "device in %s\n"), arg, ZFS_DISK_ROOT); 443 (void) fprintf(stderr, 444 gettext("must be a full path or " 445 "shorthand device name\n")); 446 return (NULL); 447 } else { 448 (void) fprintf(stderr, 449 gettext("cannot open '%s': %s\n"), 450 path, strerror(errno)); 451 return (NULL); 452 } 453 } 454 } 455 456 /* 457 * Determine whether this is a device or a file. 458 */ 459 if (wholedisk || S_ISBLK(statbuf.st_mode)) { 460 type = VDEV_TYPE_DISK; 461 } else if (S_ISREG(statbuf.st_mode)) { 462 type = VDEV_TYPE_FILE; 463 } else { 464 (void) fprintf(stderr, gettext("cannot use '%s': must be a " 465 "block device or regular file\n"), path); 466 return (NULL); 467 } 468 469 /* 470 * Finally, we have the complete device or file, and we know that it is 471 * acceptable to use. Construct the nvlist to describe this vdev. All 472 * vdevs have a 'path' element, and devices also have a 'devid' element. 
473 */ 474 verify(nvlist_alloc(&vdev, NV_UNIQUE_NAME, 0) == 0); 475 verify(nvlist_add_string(vdev, ZPOOL_CONFIG_PATH, path) == 0); 476 verify(nvlist_add_string(vdev, ZPOOL_CONFIG_TYPE, type) == 0); 477 verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_IS_LOG, is_log) == 0); 478 if (is_log) 479 verify(nvlist_add_string(vdev, ZPOOL_CONFIG_ALLOCATION_BIAS, 480 VDEV_ALLOC_BIAS_LOG) == 0); 481 if (strcmp(type, VDEV_TYPE_DISK) == 0) 482 verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK, 483 (uint64_t)wholedisk) == 0); 484 485 if (props != NULL) { 486 char *value = NULL; 487 488 if (nvlist_lookup_string(props, 489 zpool_prop_to_name(ZPOOL_PROP_ASHIFT), &value) == 0) { 490 if (zfs_nicestrtonum(NULL, value, &ashift) != 0) { 491 (void) fprintf(stderr, 492 gettext("ashift must be a number.\n")); 493 return (NULL); 494 } 495 if (ashift != 0 && 496 (ashift < ASHIFT_MIN || ashift > ASHIFT_MAX)) { 497 (void) fprintf(stderr, 498 gettext("invalid 'ashift=%" PRIu64 "' " 499 "property: only values between %" PRId32 " " 500 "and %" PRId32 " are allowed.\n"), 501 ashift, ASHIFT_MIN, ASHIFT_MAX); 502 return (NULL); 503 } 504 } 505 } 506 507 /* 508 * For a whole disk, defer getting its devid until after labeling it. 509 */ 510 if (S_ISBLK(statbuf.st_mode) && !wholedisk) { 511 /* 512 * Get the devid for the device. 
513 */ 514 int fd; 515 ddi_devid_t devid; 516 char *minor = NULL, *devid_str = NULL; 517 518 if ((fd = open(path, O_RDONLY)) < 0) { 519 (void) fprintf(stderr, gettext("cannot open '%s': " 520 "%s\n"), path, strerror(errno)); 521 nvlist_free(vdev); 522 return (NULL); 523 } 524 525 if (devid_get(fd, &devid) == 0) { 526 if (devid_get_minor_name(fd, &minor) == 0 && 527 (devid_str = devid_str_encode(devid, minor)) != 528 NULL) { 529 verify(nvlist_add_string(vdev, 530 ZPOOL_CONFIG_DEVID, devid_str) == 0); 531 } 532 if (devid_str != NULL) 533 devid_str_free(devid_str); 534 if (minor != NULL) 535 devid_str_free(minor); 536 devid_free(devid); 537 } 538 539 (void) close(fd); 540 } 541 542 if (ashift > 0) 543 (void) nvlist_add_uint64(vdev, ZPOOL_CONFIG_ASHIFT, ashift); 544 545 return (vdev); 546 } 547 548 /* 549 * Go through and verify the replication level of the pool is consistent. 550 * Performs the following checks: 551 * 552 * For the new spec, verifies that devices in mirrors and raidz are the 553 * same size. 554 * 555 * If the current configuration already has inconsistent replication 556 * levels, ignore any other potential problems in the new spec. 557 * 558 * Otherwise, make sure that the current spec (if there is one) and the new 559 * spec have consistent replication levels. 560 * 561 * If there is no current spec (create), make sure new spec has at least 562 * one general purpose vdev. 
563 */ 564 typedef struct replication_level { 565 char *zprl_type; 566 uint64_t zprl_children; 567 uint64_t zprl_parity; 568 } replication_level_t; 569 570 #define ZPOOL_FUZZ (16 * 1024 * 1024) 571 572 static boolean_t 573 is_raidz_mirror(replication_level_t *a, replication_level_t *b, 574 replication_level_t **raidz, replication_level_t **mirror) 575 { 576 if (strcmp(a->zprl_type, "raidz") == 0 && 577 strcmp(b->zprl_type, "mirror") == 0) { 578 *raidz = a; 579 *mirror = b; 580 return (B_TRUE); 581 } 582 return (B_FALSE); 583 } 584 585 /* 586 * Given a list of toplevel vdevs, return the current replication level. If 587 * the config is inconsistent, then NULL is returned. If 'fatal' is set, then 588 * an error message will be displayed for each self-inconsistent vdev. 589 */ 590 static replication_level_t * 591 get_replication(nvlist_t *nvroot, boolean_t fatal) 592 { 593 nvlist_t **top; 594 uint_t t, toplevels; 595 nvlist_t **child; 596 uint_t c, children; 597 nvlist_t *nv; 598 char *type; 599 replication_level_t lastrep = {0}; 600 replication_level_t rep; 601 replication_level_t *ret; 602 replication_level_t *raidz, *mirror; 603 boolean_t dontreport; 604 605 ret = safe_malloc(sizeof (replication_level_t)); 606 607 verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 608 &top, &toplevels) == 0); 609 610 for (t = 0; t < toplevels; t++) { 611 uint64_t is_log = B_FALSE; 612 613 nv = top[t]; 614 615 /* 616 * For separate logs we ignore the top level vdev replication 617 * constraints. 618 */ 619 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &is_log); 620 if (is_log) 621 continue; 622 623 verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, 624 &type) == 0); 625 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 626 &child, &children) != 0) { 627 /* 628 * This is a 'file' or 'disk' vdev. 
629 */ 630 rep.zprl_type = type; 631 rep.zprl_children = 1; 632 rep.zprl_parity = 0; 633 } else { 634 uint64_t vdev_size; 635 636 /* 637 * This is a mirror or RAID-Z vdev. Go through and make 638 * sure the contents are all the same (files vs. disks), 639 * keeping track of the number of elements in the 640 * process. 641 * 642 * We also check that the size of each vdev (if it can 643 * be determined) is the same. 644 */ 645 rep.zprl_type = type; 646 rep.zprl_children = 0; 647 648 if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) { 649 verify(nvlist_lookup_uint64(nv, 650 ZPOOL_CONFIG_NPARITY, 651 &rep.zprl_parity) == 0); 652 assert(rep.zprl_parity != 0); 653 } else { 654 rep.zprl_parity = 0; 655 } 656 657 /* 658 * The 'dontreport' variable indicates that we've 659 * already reported an error for this spec, so don't 660 * bother doing it again. 661 */ 662 type = NULL; 663 dontreport = 0; 664 vdev_size = -1ULL; 665 for (c = 0; c < children; c++) { 666 nvlist_t *cnv = child[c]; 667 char *path; 668 struct stat64 statbuf; 669 uint64_t size = -1ULL; 670 char *childtype; 671 int fd, err; 672 673 rep.zprl_children++; 674 675 verify(nvlist_lookup_string(cnv, 676 ZPOOL_CONFIG_TYPE, &childtype) == 0); 677 678 /* 679 * If this is a replacing or spare vdev, then 680 * get the real first child of the vdev: do this 681 * in a loop because replacing and spare vdevs 682 * can be nested. 683 */ 684 while (strcmp(childtype, 685 VDEV_TYPE_REPLACING) == 0 || 686 strcmp(childtype, VDEV_TYPE_SPARE) == 0) { 687 nvlist_t **rchild; 688 uint_t rchildren; 689 690 verify(nvlist_lookup_nvlist_array(cnv, 691 ZPOOL_CONFIG_CHILDREN, &rchild, 692 &rchildren) == 0); 693 assert(rchildren == 2); 694 cnv = rchild[0]; 695 696 verify(nvlist_lookup_string(cnv, 697 ZPOOL_CONFIG_TYPE, 698 &childtype) == 0); 699 } 700 701 verify(nvlist_lookup_string(cnv, 702 ZPOOL_CONFIG_PATH, &path) == 0); 703 704 /* 705 * If we have a raidz/mirror that combines disks 706 * with files, report it as an error. 
707 */ 708 if (!dontreport && type != NULL && 709 strcmp(type, childtype) != 0) { 710 if (ret != NULL) 711 free(ret); 712 ret = NULL; 713 if (fatal) 714 vdev_error(gettext( 715 "mismatched replication " 716 "level: %s contains both " 717 "files and devices\n"), 718 rep.zprl_type); 719 else 720 return (NULL); 721 dontreport = B_TRUE; 722 } 723 724 /* 725 * According to stat(2), the value of 'st_size' 726 * is undefined for block devices and character 727 * devices. But there is no effective way to 728 * determine the real size in userland. 729 * 730 * Instead, we'll take advantage of an 731 * implementation detail of spec_size(). If the 732 * device is currently open, then we (should) 733 * return a valid size. 734 * 735 * If we still don't get a valid size (indicated 736 * by a size of 0 or MAXOFFSET_T), then ignore 737 * this device altogether. 738 */ 739 if ((fd = open(path, O_RDONLY)) >= 0) { 740 err = fstat64(fd, &statbuf); 741 (void) close(fd); 742 } else { 743 err = stat64(path, &statbuf); 744 } 745 746 if (err != 0 || 747 statbuf.st_size == 0 || 748 statbuf.st_size == MAXOFFSET_T) 749 continue; 750 751 size = statbuf.st_size; 752 753 /* 754 * Also make sure that devices and 755 * slices have a consistent size. If 756 * they differ by a significant amount 757 * (~16MB) then report an error. 758 */ 759 if (!dontreport && 760 (vdev_size != -1ULL && 761 (labs(size - vdev_size) > 762 ZPOOL_FUZZ))) { 763 if (ret != NULL) 764 free(ret); 765 ret = NULL; 766 if (fatal) 767 vdev_error(gettext( 768 "%s contains devices of " 769 "different sizes\n"), 770 rep.zprl_type); 771 else 772 return (NULL); 773 dontreport = B_TRUE; 774 } 775 776 type = childtype; 777 vdev_size = size; 778 } 779 } 780 781 /* 782 * At this point, we have the replication of the last toplevel 783 * vdev in 'rep'. Compare it to 'lastrep' to see if it is 784 * different. 
785 */ 786 if (lastrep.zprl_type != NULL) { 787 if (is_raidz_mirror(&lastrep, &rep, &raidz, &mirror) || 788 is_raidz_mirror(&rep, &lastrep, &raidz, &mirror)) { 789 /* 790 * Accepted raidz and mirror when they can 791 * handle the same number of disk failures. 792 */ 793 if (raidz->zprl_parity != 794 mirror->zprl_children - 1) { 795 if (ret != NULL) 796 free(ret); 797 ret = NULL; 798 if (fatal) 799 vdev_error(gettext( 800 "mismatched replication " 801 "level: " 802 "%s and %s vdevs with " 803 "different redundancy, " 804 "%llu vs. %llu (%llu-way) " 805 "are present\n"), 806 raidz->zprl_type, 807 mirror->zprl_type, 808 raidz->zprl_parity, 809 mirror->zprl_children - 1, 810 mirror->zprl_children); 811 else 812 return (NULL); 813 } 814 } else if (strcmp(lastrep.zprl_type, rep.zprl_type) != 815 0) { 816 if (ret != NULL) 817 free(ret); 818 ret = NULL; 819 if (fatal) 820 vdev_error(gettext( 821 "mismatched replication level: " 822 "both %s and %s vdevs are " 823 "present\n"), 824 lastrep.zprl_type, rep.zprl_type); 825 else 826 return (NULL); 827 } else if (lastrep.zprl_parity != rep.zprl_parity) { 828 if (ret) 829 free(ret); 830 ret = NULL; 831 if (fatal) 832 vdev_error(gettext( 833 "mismatched replication level: " 834 "both %llu and %llu device parity " 835 "%s vdevs are present\n"), 836 lastrep.zprl_parity, 837 rep.zprl_parity, 838 rep.zprl_type); 839 else 840 return (NULL); 841 } else if (lastrep.zprl_children != rep.zprl_children) { 842 if (ret) 843 free(ret); 844 ret = NULL; 845 if (fatal) 846 vdev_error(gettext( 847 "mismatched replication level: " 848 "both %llu-way and %llu-way %s " 849 "vdevs are present\n"), 850 lastrep.zprl_children, 851 rep.zprl_children, 852 rep.zprl_type); 853 else 854 return (NULL); 855 } 856 } 857 lastrep = rep; 858 } 859 860 if (ret != NULL) 861 *ret = rep; 862 863 return (ret); 864 } 865 866 /* 867 * Check the replication level of the vdev spec against the current pool. 
Calls 868 * get_replication() to make sure the new spec is self-consistent. If the pool 869 * has a consistent replication level, then we ignore any errors. Otherwise, 870 * report any difference between the two. 871 */ 872 static int 873 check_replication(nvlist_t *config, nvlist_t *newroot) 874 { 875 nvlist_t **child; 876 uint_t children; 877 replication_level_t *current = NULL, *new; 878 replication_level_t *raidz, *mirror; 879 int ret; 880 881 /* 882 * If we have a current pool configuration, check to see if it's 883 * self-consistent. If not, simply return success. 884 */ 885 if (config != NULL) { 886 nvlist_t *nvroot; 887 888 verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 889 &nvroot) == 0); 890 if ((current = get_replication(nvroot, B_FALSE)) == NULL) 891 return (0); 892 } 893 /* 894 * for spares there may be no children, and therefore no 895 * replication level to check 896 */ 897 if ((nvlist_lookup_nvlist_array(newroot, ZPOOL_CONFIG_CHILDREN, 898 &child, &children) != 0) || (children == 0)) { 899 free(current); 900 return (0); 901 } 902 903 /* 904 * If all we have is logs then there's no replication level to check. 905 */ 906 if (num_logs(newroot) == children) { 907 free(current); 908 return (0); 909 } 910 911 /* 912 * Get the replication level of the new vdev spec, reporting any 913 * inconsistencies found. 914 */ 915 if ((new = get_replication(newroot, B_TRUE)) == NULL) { 916 free(current); 917 return (-1); 918 } 919 920 /* 921 * Check to see if the new vdev spec matches the replication level of 922 * the current pool. 923 */ 924 ret = 0; 925 if (current != NULL) { 926 if (is_raidz_mirror(current, new, &raidz, &mirror) || 927 is_raidz_mirror(new, current, &raidz, &mirror)) { 928 if (raidz->zprl_parity != mirror->zprl_children - 1) { 929 vdev_error(gettext( 930 "mismatched replication level: pool and " 931 "new vdev with different redundancy, %s " 932 "and %s vdevs, %llu vs. 
%llu (%llu-way)\n"), 933 raidz->zprl_type, 934 mirror->zprl_type, 935 raidz->zprl_parity, 936 mirror->zprl_children - 1, 937 mirror->zprl_children); 938 ret = -1; 939 } 940 } else if (strcmp(current->zprl_type, new->zprl_type) != 0) { 941 vdev_error(gettext( 942 "mismatched replication level: pool uses %s " 943 "and new vdev is %s\n"), 944 current->zprl_type, new->zprl_type); 945 ret = -1; 946 } else if (current->zprl_parity != new->zprl_parity) { 947 vdev_error(gettext( 948 "mismatched replication level: pool uses %llu " 949 "device parity and new vdev uses %llu\n"), 950 current->zprl_parity, new->zprl_parity); 951 ret = -1; 952 } else if (current->zprl_children != new->zprl_children) { 953 vdev_error(gettext( 954 "mismatched replication level: pool uses %llu-way " 955 "%s and new vdev uses %llu-way %s\n"), 956 current->zprl_children, current->zprl_type, 957 new->zprl_children, new->zprl_type); 958 ret = -1; 959 } 960 } 961 962 free(new); 963 if (current != NULL) 964 free(current); 965 966 return (ret); 967 } 968 969 /* 970 * Go through and find any whole disks in the vdev specification, labelling them 971 * as appropriate. When constructing the vdev spec, we were unable to open this 972 * device in order to provide a devid. Now that we have labelled the disk and 973 * know the pool slice is valid, we can construct the devid now. 974 * 975 * If the disk was already labeled with an EFI label, we will have gotten the 976 * devid already (because we were able to open the whole disk). Otherwise, we 977 * need to get the devid after we label the disk. 
*/
static int
make_disks(zpool_handle_t *zhp, nvlist_t *nv, zpool_boot_label_t boot_type,
    uint64_t boot_size)
{
	nvlist_t **child;
	uint_t c, children;
	char *type, *path, *diskname;
	char buf[MAXPATHLEN];
	uint64_t wholedisk;
	int fd;
	int ret;
	int slice;
	ddi_devid_t devid;
	char *minor = NULL, *devid_str = NULL;

	verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0);

	/* No children array: 'nv' is a leaf vdev. */
	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
	    &child, &children) != 0) {

		/* Only disk leaves are labelled; anything else is left alone. */
		if (strcmp(type, VDEV_TYPE_DISK) != 0)
			return (0);

		/*
		 * We have a disk device.  Get the path to the device
		 * and see if it's a whole disk by appending the backup
		 * slice and stat()ing the device.
		 */
		verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0);

		diskname = strrchr(path, '/');
		assert(diskname != NULL);
		diskname++;

		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
		    &wholedisk) != 0 || !wholedisk) {
			/*
			 * This is not whole disk, return error if
			 * boot partition creation was requested
			 */
			if (boot_type == ZPOOL_CREATE_BOOT_LABEL) {
				(void) fprintf(stderr,
				    gettext("creating boot partition is only "
				    "supported on whole disk vdevs: %s\n"),
				    diskname);
				return (-1);
			}
			return (0);
		}

		/* 'slice' receives the slice number used to build the path. */
		ret = zpool_label_disk(g_zfs, zhp, diskname, boot_type,
		    boot_size, &slice);
		if (ret == -1)
			return (ret);

		/*
		 * Fill in the devid, now that we've labeled the disk.
		 */
		(void) snprintf(buf, sizeof (buf), "%ss%d", path, slice);
		if ((fd = open(buf, O_RDONLY)) < 0) {
			(void) fprintf(stderr,
			    gettext("cannot open '%s': %s\n"),
			    buf, strerror(errno));
			return (-1);
		}

		/* A missing devid is not an error; the entry is just omitted. */
		if (devid_get(fd, &devid) == 0) {
			if (devid_get_minor_name(fd, &minor) == 0 &&
			    (devid_str = devid_str_encode(devid, minor)) !=
			    NULL) {
				verify(nvlist_add_string(nv,
				    ZPOOL_CONFIG_DEVID, devid_str) == 0);
			}
			if (devid_str != NULL)
				devid_str_free(devid_str);
			if (minor != NULL)
				devid_str_free(minor);
			devid_free(devid);
		}

		/*
		 * Update the path to refer to the pool slice.  The presence of
		 * the 'whole_disk' field indicates to the CLI that we should
		 * chop off the slice number when displaying the device in
		 * future output.
		 */
		verify(nvlist_add_string(nv, ZPOOL_CONFIG_PATH, buf) == 0);

		(void) close(fd);

		return (0);
	}

	/* illumos kernel does not support booting from multi-vdev pools. */
	if ((boot_type == ZPOOL_CREATE_BOOT_LABEL)) {
		if ((strcmp(type, VDEV_TYPE_ROOT) == 0) && children > 1) {
			(void) fprintf(stderr, gettext("boot pool "
			    "can not have more than one vdev\n"));
			return (-1);
		}
	}

	/*
	 * Interior vdev: recurse into the regular children, then the spares
	 * and the l2cache devices, stopping at the first failure.
	 */
	for (c = 0; c < children; c++) {
		ret = make_disks(zhp, child[c], boot_type, boot_size);
		if (ret != 0)
			return (ret);
	}

	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES,
	    &child, &children) == 0)
		for (c = 0; c < children; c++) {
			ret = make_disks(zhp, child[c], boot_type, boot_size);
			if (ret != 0)
				return (ret);
		}

	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE,
	    &child, &children) == 0)
		for (c = 0; c < children; c++) {
			ret = make_disks(zhp, child[c], boot_type, boot_size);
			if (ret != 0)
				return (ret);
		}

	return (0);
}

/*
 * Determine if the given path is a hot spare within the given configuration.
 *
 * Returns B_FALSE if the path cannot be opened, if zpool_in_use() fails or
 * does not report the device as an in-use spare, or if its label cannot be
 * read.  Otherwise the guid from the label is compared against the guid of
 * every entry in the spares array of the config's vdev tree, and B_TRUE is
 * returned on a match.
 */
static boolean_t
is_spare(nvlist_t *config, const char *path)
{
	int fd;
	pool_state_t state;
	char *name = NULL;
	nvlist_t *label;
	uint64_t guid, spareguid;
	nvlist_t *nvroot;
	nvlist_t **spares;
	uint_t i, nspares;
	boolean_t inuse;

	if ((fd = open(path, O_RDONLY)) < 0)
		return (B_FALSE);

	/* 'name' starts out NULL, so freeing it is safe on every path. */
	if (zpool_in_use(g_zfs, fd, &state, &name, &inuse) != 0 ||
	    !inuse ||
	    state != POOL_STATE_SPARE ||
	    zpool_read_label(fd, &label, NULL) != 0) {
		free(name);
		(void) close(fd);
		return (B_FALSE);
	}
	free(name);
	(void) close(fd);

	verify(nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) == 0);
	nvlist_free(label);

	verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
	    &nvroot) == 0);
	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
	    &spares, &nspares) == 0) {
		for (i = 0; i < nspares; i++) {
			verify(nvlist_lookup_uint64(spares[i],
			    ZPOOL_CONFIG_GUID, &spareguid) == 0);
			if (spareguid == guid)
				return (B_TRUE);
		}
	}

	return (B_FALSE);
}

/*
 * Go through and find any devices that are in use.  We rely on libdiskmgt for
 * the majority of this task.
 *
 * For a leaf vdev the result is that of check_device() or check_file(),
 * depending on its type.  For an interior vdev the function recurses over the
 * children, the spares (with 'isspare' set) and the l2cache devices, and
 * returns B_TRUE if any of them is in use.  The recursion deliberately does
 * not stop at the first hit, so every offending device is checked.
 */
static boolean_t
is_device_in_use(nvlist_t *config, nvlist_t *nv, boolean_t force,
    boolean_t replacing, boolean_t isspare)
{
	nvlist_t **child;
	uint_t c, children;
	char *type, *path;
	int ret = 0;
	char buf[MAXPATHLEN];
	uint64_t wholedisk;
	boolean_t anyinuse = B_FALSE;

	verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0);

	/* No children array: 'nv' is a leaf vdev. */
	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
	    &child, &children) != 0) {

		verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0);

		/*
		 * As a generic check, we look to see if this is a replace of a
		 * hot spare within the same pool.  If so, we allow it
		 * regardless of what libdiskmgt or zpool_in_use() says.
		 */
		if (replacing) {
			/* Whole disks are probed through their "s0" slice. */
			if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
			    &wholedisk) == 0 && wholedisk)
				(void) snprintf(buf, sizeof (buf), "%ss0",
				    path);
			else
				(void) strlcpy(buf, path, sizeof (buf));

			if (is_spare(config, buf))
				return (B_FALSE);
		}

		/* Leaves of any other type are left unchecked (ret stays 0). */
		if (strcmp(type, VDEV_TYPE_DISK) == 0)
			ret = check_device(path, force, isspare);
		else if (strcmp(type, VDEV_TYPE_FILE) == 0)
			ret = check_file(path, force, isspare);

		return (ret != 0);
	}

	for (c = 0; c < children; c++)
		if (is_device_in_use(config, child[c], force, replacing,
		    B_FALSE))
			anyinuse = B_TRUE;

	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES,
	    &child, &children) == 0)
		for (c = 0; c < children; c++)
			if (is_device_in_use(config, child[c], force, replacing,
			    B_TRUE))
				anyinuse = B_TRUE;

	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE,
	    &child, &children) == 0)
		for (c = 0; c < children; c++)
			if (is_device_in_use(config, child[c], force, replacing,
			    B_FALSE))
				anyinuse = B_TRUE;

	return (anyinuse);
}

static const char *
is_grouping(const char *type, int *mindev, int *maxdev)
{
	if (strncmp(type, "raidz", 5) == 0) {
		const char *p = type + 5;
		char *end;
		long nparity;

		if (*p == '\0') {
			nparity = 1;
		} else if (*p == '0') {
			return (NULL); /* no zero prefixes allowed */
		} else {
			errno = 0;
			nparity = strtol(p, &end, 10);
			if (errno != 0 || nparity < 1 || nparity >= 255 ||
			    *end != '\0')
				return (NULL);
		}

		if (mindev != NULL)
			*mindev = nparity + 1;
		if (maxdev != NULL)
			*maxdev = 255;
		return (VDEV_TYPE_RAIDZ);
	}

	if (maxdev != NULL)
		*maxdev = INT_MAX;

	if (strcmp(type, "mirror") == 0) {
		if (mindev != NULL)
			*mindev = 2;
		return (VDEV_TYPE_MIRROR);
} 1259 1260 if (strcmp(type, "spare") == 0) { 1261 if (mindev != NULL) 1262 *mindev = 1; 1263 return (VDEV_TYPE_SPARE); 1264 } 1265 1266 if (strcmp(type, "log") == 0) { 1267 if (mindev != NULL) 1268 *mindev = 1; 1269 return (VDEV_TYPE_LOG); 1270 } 1271 1272 if (strcmp(type, VDEV_ALLOC_BIAS_SPECIAL) == 0 || 1273 strcmp(type, VDEV_ALLOC_BIAS_DEDUP) == 0) { 1274 if (mindev != NULL) 1275 *mindev = 1; 1276 return (type); 1277 } 1278 1279 if (strcmp(type, "cache") == 0) { 1280 if (mindev != NULL) 1281 *mindev = 1; 1282 return (VDEV_TYPE_L2CACHE); 1283 } 1284 1285 return (NULL); 1286 } 1287 1288 /* 1289 * Construct a syntactically valid vdev specification, 1290 * and ensure that all devices and files exist and can be opened. 1291 * Note: we don't bother freeing anything in the error paths 1292 * because the program is just going to exit anyway. 1293 */ 1294 nvlist_t * 1295 construct_spec(nvlist_t *props, int argc, char **argv) 1296 { 1297 nvlist_t *nvroot, *nv, **top, **spares, **l2cache; 1298 int t, toplevels, mindev, maxdev, nspares, nlogs, nl2cache; 1299 const char *type; 1300 uint64_t is_log, is_special, is_dedup; 1301 boolean_t seen_logs; 1302 1303 top = NULL; 1304 toplevels = 0; 1305 spares = NULL; 1306 l2cache = NULL; 1307 nspares = 0; 1308 nlogs = 0; 1309 nl2cache = 0; 1310 is_log = is_special = is_dedup = B_FALSE; 1311 seen_logs = B_FALSE; 1312 1313 while (argc > 0) { 1314 nv = NULL; 1315 1316 /* 1317 * If it's a mirror or raidz, the subsequent arguments are 1318 * its leaves -- until we encounter the next mirror or raidz. 
1319 */ 1320 if ((type = is_grouping(argv[0], &mindev, &maxdev)) != NULL) { 1321 nvlist_t **child = NULL; 1322 int c, children = 0; 1323 1324 if (strcmp(type, VDEV_TYPE_SPARE) == 0) { 1325 if (spares != NULL) { 1326 (void) fprintf(stderr, 1327 gettext("invalid vdev " 1328 "specification: 'spare' can be " 1329 "specified only once\n")); 1330 return (NULL); 1331 } 1332 is_log = is_special = is_dedup = B_FALSE; 1333 } 1334 1335 if (strcmp(type, VDEV_TYPE_LOG) == 0) { 1336 if (seen_logs) { 1337 (void) fprintf(stderr, 1338 gettext("invalid vdev " 1339 "specification: 'log' can be " 1340 "specified only once\n")); 1341 return (NULL); 1342 } 1343 seen_logs = B_TRUE; 1344 is_log = B_TRUE; 1345 is_special = B_FALSE; 1346 is_dedup = B_FALSE; 1347 argc--; 1348 argv++; 1349 /* 1350 * A log is not a real grouping device. 1351 * We just set is_log and continue. 1352 */ 1353 continue; 1354 } 1355 1356 if (strcmp(type, VDEV_ALLOC_BIAS_SPECIAL) == 0) { 1357 is_special = B_TRUE; 1358 is_log = B_FALSE; 1359 is_dedup = B_FALSE; 1360 argc--; 1361 argv++; 1362 continue; 1363 } 1364 1365 if (strcmp(type, VDEV_ALLOC_BIAS_DEDUP) == 0) { 1366 is_dedup = B_TRUE; 1367 is_log = B_FALSE; 1368 is_special = B_FALSE; 1369 argc--; 1370 argv++; 1371 continue; 1372 } 1373 1374 if (strcmp(type, VDEV_TYPE_L2CACHE) == 0) { 1375 if (l2cache != NULL) { 1376 (void) fprintf(stderr, 1377 gettext("invalid vdev " 1378 "specification: 'cache' can be " 1379 "specified only once\n")); 1380 return (NULL); 1381 } 1382 is_log = is_special = is_dedup = B_FALSE; 1383 } 1384 1385 if (is_log || is_special || is_dedup) { 1386 if (strcmp(type, VDEV_TYPE_MIRROR) != 0) { 1387 (void) fprintf(stderr, 1388 gettext("invalid vdev " 1389 "specification: unsupported '%s' " 1390 "device: %s\n"), is_log ? 
"log" : 1391 "special", type); 1392 return (NULL); 1393 } 1394 nlogs++; 1395 } 1396 1397 for (c = 1; c < argc; c++) { 1398 if (is_grouping(argv[c], NULL, NULL) != NULL) 1399 break; 1400 children++; 1401 child = realloc(child, 1402 children * sizeof (nvlist_t *)); 1403 if (child == NULL) 1404 zpool_no_memory(); 1405 if ((nv = make_leaf_vdev(props, argv[c], 1406 B_FALSE)) == NULL) 1407 return (NULL); 1408 child[children - 1] = nv; 1409 } 1410 1411 if (children < mindev) { 1412 (void) fprintf(stderr, gettext("invalid vdev " 1413 "specification: %s requires at least %d " 1414 "devices\n"), argv[0], mindev); 1415 return (NULL); 1416 } 1417 1418 if (children > maxdev) { 1419 (void) fprintf(stderr, gettext("invalid vdev " 1420 "specification: %s supports no more than " 1421 "%d devices\n"), argv[0], maxdev); 1422 return (NULL); 1423 } 1424 1425 argc -= c; 1426 argv += c; 1427 1428 if (strcmp(type, VDEV_TYPE_SPARE) == 0) { 1429 spares = child; 1430 nspares = children; 1431 continue; 1432 } else if (strcmp(type, VDEV_TYPE_L2CACHE) == 0) { 1433 l2cache = child; 1434 nl2cache = children; 1435 continue; 1436 } else { 1437 /* create a top-level vdev with children */ 1438 verify(nvlist_alloc(&nv, NV_UNIQUE_NAME, 1439 0) == 0); 1440 verify(nvlist_add_string(nv, ZPOOL_CONFIG_TYPE, 1441 type) == 0); 1442 verify(nvlist_add_uint64(nv, 1443 ZPOOL_CONFIG_IS_LOG, is_log) == 0); 1444 if (is_log) 1445 verify(nvlist_add_string(nv, 1446 ZPOOL_CONFIG_ALLOCATION_BIAS, 1447 VDEV_ALLOC_BIAS_LOG) == 0); 1448 if (is_special) { 1449 verify(nvlist_add_string(nv, 1450 ZPOOL_CONFIG_ALLOCATION_BIAS, 1451 VDEV_ALLOC_BIAS_SPECIAL) == 0); 1452 } 1453 if (is_dedup) { 1454 verify(nvlist_add_string(nv, 1455 ZPOOL_CONFIG_ALLOCATION_BIAS, 1456 VDEV_ALLOC_BIAS_DEDUP) == 0); 1457 } 1458 if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) { 1459 verify(nvlist_add_uint64(nv, 1460 ZPOOL_CONFIG_NPARITY, 1461 mindev - 1) == 0); 1462 } 1463 verify(nvlist_add_nvlist_array(nv, 1464 ZPOOL_CONFIG_CHILDREN, child, 1465 children) == 
0); 1466 1467 for (c = 0; c < children; c++) 1468 nvlist_free(child[c]); 1469 free(child); 1470 } 1471 } else { 1472 /* 1473 * We have a device. Pass off to make_leaf_vdev() to 1474 * construct the appropriate nvlist describing the vdev. 1475 */ 1476 if ((nv = make_leaf_vdev(props, argv[0], is_log)) 1477 == NULL) 1478 return (NULL); 1479 if (is_log) 1480 nlogs++; 1481 if (is_special) { 1482 verify(nvlist_add_string(nv, 1483 ZPOOL_CONFIG_ALLOCATION_BIAS, 1484 VDEV_ALLOC_BIAS_SPECIAL) == 0); 1485 } 1486 if (is_dedup) { 1487 verify(nvlist_add_string(nv, 1488 ZPOOL_CONFIG_ALLOCATION_BIAS, 1489 VDEV_ALLOC_BIAS_DEDUP) == 0); 1490 } 1491 argc--; 1492 argv++; 1493 } 1494 1495 toplevels++; 1496 top = realloc(top, toplevels * sizeof (nvlist_t *)); 1497 if (top == NULL) 1498 zpool_no_memory(); 1499 top[toplevels - 1] = nv; 1500 } 1501 1502 if (toplevels == 0 && nspares == 0 && nl2cache == 0) { 1503 (void) fprintf(stderr, gettext("invalid vdev " 1504 "specification: at least one toplevel vdev must be " 1505 "specified\n")); 1506 return (NULL); 1507 } 1508 1509 if (seen_logs && nlogs == 0) { 1510 (void) fprintf(stderr, gettext("invalid vdev specification: " 1511 "log requires at least 1 device\n")); 1512 return (NULL); 1513 } 1514 1515 /* 1516 * Finally, create nvroot and add all top-level vdevs to it. 
1517 */ 1518 verify(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) == 0); 1519 verify(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, 1520 VDEV_TYPE_ROOT) == 0); 1521 verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 1522 top, toplevels) == 0); 1523 if (nspares != 0) 1524 verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 1525 spares, nspares) == 0); 1526 if (nl2cache != 0) 1527 verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 1528 l2cache, nl2cache) == 0); 1529 1530 for (t = 0; t < toplevels; t++) 1531 nvlist_free(top[t]); 1532 for (t = 0; t < nspares; t++) 1533 nvlist_free(spares[t]); 1534 for (t = 0; t < nl2cache; t++) 1535 nvlist_free(l2cache[t]); 1536 if (spares) 1537 free(spares); 1538 if (l2cache) 1539 free(l2cache); 1540 free(top); 1541 1542 return (nvroot); 1543 } 1544 1545 nvlist_t * 1546 split_mirror_vdev(zpool_handle_t *zhp, char *newname, nvlist_t *props, 1547 splitflags_t flags, int argc, char **argv) 1548 { 1549 nvlist_t *newroot = NULL, **child; 1550 uint_t c, children; 1551 zpool_boot_label_t boot_type; 1552 1553 if (argc > 0) { 1554 if ((newroot = construct_spec(props, argc, argv)) == NULL) { 1555 (void) fprintf(stderr, gettext("Unable to build a " 1556 "pool from the specified devices\n")); 1557 return (NULL); 1558 } 1559 1560 if (zpool_is_bootable(zhp)) 1561 boot_type = ZPOOL_COPY_BOOT_LABEL; 1562 else 1563 boot_type = ZPOOL_NO_BOOT_LABEL; 1564 1565 if (!flags.dryrun && 1566 make_disks(zhp, newroot, boot_type, 0) != 0) { 1567 nvlist_free(newroot); 1568 return (NULL); 1569 } 1570 1571 /* avoid any tricks in the spec */ 1572 verify(nvlist_lookup_nvlist_array(newroot, 1573 ZPOOL_CONFIG_CHILDREN, &child, &children) == 0); 1574 for (c = 0; c < children; c++) { 1575 char *path; 1576 const char *type; 1577 int min, max; 1578 1579 verify(nvlist_lookup_string(child[c], 1580 ZPOOL_CONFIG_PATH, &path) == 0); 1581 if ((type = is_grouping(path, &min, &max)) != NULL) { 1582 (void) fprintf(stderr, gettext("Cannot use " 1583 "'%s' as a 
device for splitting\n"), type); 1584 nvlist_free(newroot); 1585 return (NULL); 1586 } 1587 } 1588 } 1589 1590 if (zpool_vdev_split(zhp, newname, &newroot, props, flags) != 0) { 1591 nvlist_free(newroot); 1592 return (NULL); 1593 } 1594 1595 return (newroot); 1596 } 1597 1598 static int 1599 num_normal_vdevs(nvlist_t *nvroot) 1600 { 1601 nvlist_t **top; 1602 uint_t t, toplevels, normal = 0; 1603 1604 verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 1605 &top, &toplevels) == 0); 1606 1607 for (t = 0; t < toplevels; t++) { 1608 uint64_t log = B_FALSE; 1609 1610 (void) nvlist_lookup_uint64(top[t], ZPOOL_CONFIG_IS_LOG, &log); 1611 if (log) 1612 continue; 1613 if (nvlist_exists(top[t], ZPOOL_CONFIG_ALLOCATION_BIAS)) 1614 continue; 1615 1616 normal++; 1617 } 1618 1619 return (normal); 1620 } 1621 1622 /* 1623 * Get and validate the contents of the given vdev specification. This ensures 1624 * that the nvlist returned is well-formed, that all the devices exist, and that 1625 * they are not currently in use by any other known consumer. The 'poolconfig' 1626 * parameter is the current configuration of the pool when adding devices 1627 * existing pool, and is used to perform additional checks, such as changing the 1628 * replication level of the pool. It can be 'NULL' to indicate that this is a 1629 * new pool. The 'force' flag controls whether devices should be forcefully 1630 * added, even if they appear in use. 1631 */ 1632 nvlist_t * 1633 make_root_vdev(zpool_handle_t *zhp, nvlist_t *props, int force, int check_rep, 1634 boolean_t replacing, boolean_t dryrun, zpool_boot_label_t boot_type, 1635 uint64_t boot_size, int argc, char **argv) 1636 { 1637 nvlist_t *newroot; 1638 nvlist_t *poolconfig = NULL; 1639 is_force = force; 1640 1641 /* 1642 * Construct the vdev specification. If this is successful, we know 1643 * that we have a valid specification, and that all devices can be 1644 * opened. 
1645 */ 1646 if ((newroot = construct_spec(props, argc, argv)) == NULL) 1647 return (NULL); 1648 1649 if (zhp && ((poolconfig = zpool_get_config(zhp, NULL)) == NULL)) 1650 return (NULL); 1651 1652 /* 1653 * Validate each device to make sure that its not shared with another 1654 * subsystem. We do this even if 'force' is set, because there are some 1655 * uses (such as a dedicated dump device) that even '-f' cannot 1656 * override. 1657 */ 1658 if (is_device_in_use(poolconfig, newroot, force, replacing, B_FALSE)) { 1659 nvlist_free(newroot); 1660 return (NULL); 1661 } 1662 1663 /* 1664 * Check the replication level of the given vdevs and report any errors 1665 * found. We include the existing pool spec, if any, as we need to 1666 * catch changes against the existing replication level. 1667 */ 1668 if (check_rep && check_replication(poolconfig, newroot) != 0) { 1669 nvlist_free(newroot); 1670 return (NULL); 1671 } 1672 1673 /* 1674 * On pool create the new vdev spec must have one normal vdev. 1675 */ 1676 if (poolconfig == NULL && num_normal_vdevs(newroot) == 0) { 1677 vdev_error(gettext("at least one general top-level vdev must " 1678 "be specified\n")); 1679 nvlist_free(newroot); 1680 return (NULL); 1681 } 1682 1683 /* 1684 * Run through the vdev specification and label any whole disks found. 1685 */ 1686 if (!dryrun && make_disks(zhp, newroot, boot_type, boot_size) != 0) { 1687 nvlist_free(newroot); 1688 return (NULL); 1689 } 1690 1691 return (newroot); 1692 } 1693