/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2013, 2018 by Delphix. All rights reserved.
 * Copyright (c) 2016, 2017 Intel Corporation.
 * Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>.
 */

/*
 * Functions to convert between a list of vdevs and an nvlist representing the
 * configuration.  Each entry in the list can be one of:
 *
 *	Device vdevs
 *		disk=(path=..., devid=...)
 *		file=(path=...)
 *
 *	Group vdevs
 *		raidz[1|2]=(...)
 *		mirror=(...)
 *
 *	Hot spares
 *
 * While the underlying implementation supports it, group vdevs cannot contain
 * other group vdevs.  All userland verification of devices is contained within
 * this file.  If successful, the nvlist returned can be passed directly to the
 * kernel; we've done as much verification as possible in userland.
 *
 * Hot spares are a special case, and passed down as an array of disk vdevs, at
 * the same level as the root of the vdev tree.
 *
 * The only function exported by this file is 'make_root_vdev'.  The
 * function performs several passes:
 *
 *	1. Construct the vdev specification.  Performs syntax validation and
 *	   makes sure each device is valid.
 *	2. Check for devices in use.  Using libdiskmgt, makes sure that no
 *	   devices are also in use.  Some can be overridden using the 'force'
 *	   flag, others cannot.
 *	3. Check for replication errors if the 'force' flag is not specified.
 *	   validates that the replication level is consistent across the
 *	   entire pool.
 *	4. Call libzfs to label any whole disks with an EFI label.
 */

#include <assert.h>
#include <devid.h>
#include <errno.h>
#include <fcntl.h>
#include <libdiskmgt.h>
#include <libintl.h>
#include <libnvpair.h>
#include <libzutil.h>
#include <limits.h>
#include <sys/spa.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/efi_partition.h>
#include <sys/stat.h>
#include <sys/vtoc.h>
#include <sys/mntent.h>

#include "zpool_util.h"

/* Suffix naming the traditional "backup" slice covering the whole disk. */
#define	BACKUP_SLICE	"s2"

/*
 * For any given vdev specification, we can have multiple errors.  The
 * vdev_error() function keeps track of whether we have seen an error yet, and
 * prints out a header if its the first error we've seen.
 */
boolean_t error_seen;
boolean_t is_force;

/*
 * Print one vdev-specification error to stderr, emitting a one-time header
 * (whose wording depends on whether -f was given) before the first error.
 */
/*PRINTFLIKE1*/
static void
vdev_error(const char *fmt, ...)
{
	va_list ap;

	if (!error_seen) {
		(void) fprintf(stderr, gettext("invalid vdev specification\n"));
		if (!is_force)
			(void) fprintf(stderr, gettext("use '-f' to override "
			    "the following errors:\n"));
		else
			(void) fprintf(stderr, gettext("the following errors "
			    "must be manually repaired:\n"));
		error_seen = B_TRUE;
	}

	va_start(ap, fmt);
	(void) vfprintf(stderr, fmt, ap);
	va_end(ap);
}

/*
 * Report a libdiskmgt failure as a non-fatal warning.
 */
static void
libdiskmgt_error(int error)
{
	/*
	 * ENXIO/ENODEV is a valid error message if the device doesn't live in
	 * /dev/dsk.  Don't bother printing an error message in this case.
	 */
	if (error == ENXIO || error == ENODEV)
		return;

	(void) fprintf(stderr, gettext("warning: device in use checking "
	    "failed: %s\n"), strerror(error));
}

/*
 * Validate a device, passing the bulk of the work off to libdiskmgt.
 * Returns 0 if the slice is usable (or the check could not be performed),
 * -1 if it is in use or overlaps another slice.
 */
static int
check_slice(const char *path, int force, boolean_t wholedisk, boolean_t isspare)
{
	char *msg;
	int error = 0;
	dm_who_type_t who;

	if (force)
		who = DM_WHO_ZPOOL_FORCE;
	else if (isspare)
		who = DM_WHO_ZPOOL_SPARE;
	else
		who = DM_WHO_ZPOOL;

	if (dm_inuse((char *)path, &msg, who, &error) || error) {
		if (error != 0) {
			/* check itself failed; treat as non-fatal */
			libdiskmgt_error(error);
			return (0);
		} else {
			vdev_error("%s", msg);
			free(msg);
			return (-1);
		}
	}

	/*
	 * If we're given a whole disk, ignore overlapping slices since we're
	 * about to label it anyway.
	 */
	error = 0;
	if (!wholedisk && !force &&
	    (dm_isoverlapping((char *)path, &msg, &error) || error)) {
		if (error == 0) {
			/* dm_isoverlapping returned -1 */
			vdev_error(gettext("%s overlaps with %s\n"), path, msg);
			free(msg);
			return (-1);
		} else if (error != ENODEV) {
			/* libdiskmgt's devcache only handles physical drives */
			libdiskmgt_error(error);
			return (0);
		}
	}

	return (0);
}


/*
 * Validate a whole disk.  Iterate over all slices on the disk and make sure
 * that none is in use by calling check_slice().
 */
static int
check_disk(const char *name, dm_descriptor_t disk, int force, int isspare)
{
	dm_descriptor_t *drive, *media, *slice;
	int err = 0;
	int i;
	int ret;

	/*
	 * Get the drive associated with this disk.  This should never fail,
	 * because we already have an alias handle open for the device.
	 */
	if ((drive = dm_get_associated_descriptors(disk, DM_DRIVE,
	    &err)) == NULL || *drive == 0) {
		if (err)
			libdiskmgt_error(err);
		return (0);
	}

	if ((media = dm_get_associated_descriptors(*drive, DM_MEDIA,
	    &err)) == NULL) {
		dm_free_descriptors(drive);
		if (err)
			libdiskmgt_error(err);
		return (0);
	}

	dm_free_descriptors(drive);

	/*
	 * It is possible that the user has specified a removable media drive,
	 * and the media is not present.
	 */
	if (*media == 0) {
		dm_free_descriptors(media);
		vdev_error(gettext("'%s' has no media in drive\n"), name);
		return (-1);
	}

	if ((slice = dm_get_associated_descriptors(*media, DM_SLICE,
	    &err)) == NULL) {
		dm_free_descriptors(media);
		if (err)
			libdiskmgt_error(err);
		return (0);
	}

	dm_free_descriptors(media);

	ret = 0;

	/*
	 * Iterate over all slices and report any errors.  We don't care about
	 * overlapping slices because we are using the whole disk.
	 */
	for (i = 0; slice[i] != 0; i++) {
		/* NOTE(review): this 'name' shadows the function parameter. */
		char *name = dm_get_name(slice[i], &err);

		if (check_slice(name, force, B_TRUE, isspare) != 0)
			ret = -1;

		dm_free_name(name);
	}

	dm_free_descriptors(slice);
	return (ret);
}

/*
 * Validate a device.  Dispatches to check_disk() for whole disks known to
 * libdiskmgt, otherwise falls back to a single-slice check.
 */
static int
check_device(const char *path, boolean_t force, boolean_t isspare)
{
	dm_descriptor_t desc;
	int err;
	char *dev;

	/*
	 * For whole disks, libdiskmgt does not include the leading dev path.
	 */
	dev = strrchr(path, '/');
	assert(dev != NULL);
	dev++;
	if ((desc = dm_get_descriptor_by_name(DM_ALIAS, dev, &err)) != 0) {
		err = check_disk(path, desc, force, isspare);
		dm_free_descriptor(desc);
		return (err);
	}

	return (check_slice(path, force, B_FALSE, isspare));
}

/*
 * Check that a file is valid.
All we can do in this case is check that it's 281 * not in use by another pool, and not in use by swap. 282 */ 283 static int 284 check_file(const char *file, boolean_t force, boolean_t isspare) 285 { 286 char *name; 287 int fd; 288 int ret = 0; 289 int err; 290 pool_state_t state; 291 boolean_t inuse; 292 293 if (dm_inuse_swap(file, &err)) { 294 if (err) 295 libdiskmgt_error(err); 296 else 297 vdev_error(gettext("%s is currently used by swap. " 298 "Please see swap(8).\n"), file); 299 return (-1); 300 } 301 302 if ((fd = open(file, O_RDONLY)) < 0) 303 return (0); 304 305 if (zpool_in_use(g_zfs, fd, &state, &name, &inuse) == 0 && inuse) { 306 const char *desc; 307 308 switch (state) { 309 case POOL_STATE_ACTIVE: 310 desc = gettext("active"); 311 break; 312 313 case POOL_STATE_EXPORTED: 314 desc = gettext("exported"); 315 break; 316 317 case POOL_STATE_POTENTIALLY_ACTIVE: 318 desc = gettext("potentially active"); 319 break; 320 321 default: 322 desc = gettext("unknown"); 323 break; 324 } 325 326 /* 327 * Allow hot spares to be shared between pools. 328 */ 329 if (state == POOL_STATE_SPARE && isspare) 330 return (0); 331 332 if (state == POOL_STATE_ACTIVE || 333 state == POOL_STATE_SPARE || !force) { 334 switch (state) { 335 case POOL_STATE_SPARE: 336 vdev_error(gettext("%s is reserved as a hot " 337 "spare for pool %s\n"), file, name); 338 break; 339 default: 340 vdev_error(gettext("%s is part of %s pool " 341 "'%s'\n"), file, desc, name); 342 break; 343 } 344 ret = -1; 345 } 346 347 free(name); 348 } 349 350 (void) close(fd); 351 return (ret); 352 } 353 354 355 /* 356 * By "whole disk" we mean an entire physical disk (something we can 357 * label, toggle the write cache on, etc.) as opposed to the full 358 * capacity of a pseudo-device such as lofi or did. We act as if we 359 * are labeling the disk, which should be a pretty good test of whether 360 * it's a viable device or not. Returns B_TRUE if it is and B_FALSE if 361 * it isn't. 
362 */ 363 static boolean_t 364 is_whole_disk(const char *arg) 365 { 366 struct dk_gpt *label; 367 int fd; 368 char path[MAXPATHLEN]; 369 370 (void) snprintf(path, sizeof (path), "%s%s%s", 371 ZFS_RDISK_ROOT, strrchr(arg, '/'), BACKUP_SLICE); 372 if ((fd = open(path, O_RDWR | O_NDELAY)) < 0) 373 return (B_FALSE); 374 if (efi_alloc_and_init(fd, EFI_NUMPAR, &label) != 0) { 375 (void) close(fd); 376 return (B_FALSE); 377 } 378 efi_free(label); 379 (void) close(fd); 380 return (B_TRUE); 381 } 382 383 /* 384 * Create a leaf vdev. Determine if this is a file or a device. If it's a 385 * device, fill in the device id to make a complete nvlist. Valid forms for a 386 * leaf vdev are: 387 * 388 * /dev/dsk/xxx Complete disk path 389 * /xxx Full path to file 390 * xxx Shorthand for /dev/dsk/xxx 391 */ 392 static nvlist_t * 393 make_leaf_vdev(nvlist_t *props, const char *arg, uint64_t is_log) 394 { 395 char path[MAXPATHLEN]; 396 struct stat64 statbuf; 397 nvlist_t *vdev = NULL; 398 char *type = NULL; 399 boolean_t wholedisk = B_FALSE; 400 uint64_t ashift = 0; 401 402 /* 403 * Determine what type of vdev this is, and put the full path into 404 * 'path'. We detect whether this is a device of file afterwards by 405 * checking the st_mode of the file. 406 */ 407 if (arg[0] == '/') { 408 /* 409 * Complete device or file path. Exact type is determined by 410 * examining the file descriptor afterwards. 411 */ 412 wholedisk = is_whole_disk(arg); 413 if (!wholedisk && (stat64(arg, &statbuf) != 0)) { 414 (void) fprintf(stderr, 415 gettext("cannot open '%s': %s\n"), 416 arg, strerror(errno)); 417 return (NULL); 418 } 419 420 (void) strlcpy(path, arg, sizeof (path)); 421 } else { 422 /* 423 * This may be a short path for a device, or it could be total 424 * gibberish. Check to see if it's a known device in 425 * /dev/dsk/. As part of this check, see if we've been given a 426 * an entire disk (minus the slice number). 
427 */ 428 (void) snprintf(path, sizeof (path), "%s/%s", ZFS_DISK_ROOT, 429 arg); 430 wholedisk = is_whole_disk(path); 431 if (!wholedisk && (stat64(path, &statbuf) != 0)) { 432 /* 433 * If we got ENOENT, then the user gave us 434 * gibberish, so try to direct them with a 435 * reasonable error message. Otherwise, 436 * regurgitate strerror() since it's the best we 437 * can do. 438 */ 439 if (errno == ENOENT) { 440 (void) fprintf(stderr, 441 gettext("cannot open '%s': no such " 442 "device in %s\n"), arg, ZFS_DISK_ROOT); 443 (void) fprintf(stderr, 444 gettext("must be a full path or " 445 "shorthand device name\n")); 446 return (NULL); 447 } else { 448 (void) fprintf(stderr, 449 gettext("cannot open '%s': %s\n"), 450 path, strerror(errno)); 451 return (NULL); 452 } 453 } 454 } 455 456 /* 457 * Determine whether this is a device or a file. 458 */ 459 if (wholedisk || S_ISBLK(statbuf.st_mode)) { 460 type = VDEV_TYPE_DISK; 461 } else if (S_ISREG(statbuf.st_mode)) { 462 type = VDEV_TYPE_FILE; 463 } else { 464 (void) fprintf(stderr, gettext("cannot use '%s': must be a " 465 "block device or regular file\n"), path); 466 return (NULL); 467 } 468 469 /* 470 * Finally, we have the complete device or file, and we know that it is 471 * acceptable to use. Construct the nvlist to describe this vdev. All 472 * vdevs have a 'path' element, and devices also have a 'devid' element. 
473 */ 474 verify(nvlist_alloc(&vdev, NV_UNIQUE_NAME, 0) == 0); 475 verify(nvlist_add_string(vdev, ZPOOL_CONFIG_PATH, path) == 0); 476 verify(nvlist_add_string(vdev, ZPOOL_CONFIG_TYPE, type) == 0); 477 verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_IS_LOG, is_log) == 0); 478 if (is_log) 479 verify(nvlist_add_string(vdev, ZPOOL_CONFIG_ALLOCATION_BIAS, 480 VDEV_ALLOC_BIAS_LOG) == 0); 481 if (strcmp(type, VDEV_TYPE_DISK) == 0) 482 verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK, 483 (uint64_t)wholedisk) == 0); 484 485 if (props != NULL) { 486 char *value = NULL; 487 488 if (nvlist_lookup_string(props, 489 zpool_prop_to_name(ZPOOL_PROP_ASHIFT), &value) == 0) { 490 if (zfs_nicestrtonum(NULL, value, &ashift) != 0) { 491 (void) fprintf(stderr, 492 gettext("ashift must be a number.\n")); 493 return (NULL); 494 } 495 if (ashift != 0 && 496 (ashift < ASHIFT_MIN || ashift > ASHIFT_MAX)) { 497 (void) fprintf(stderr, 498 gettext("invalid 'ashift=%" PRIu64 "' " 499 "property: only values between %" PRId32 " " 500 "and %" PRId32 " are allowed.\n"), 501 ashift, ASHIFT_MIN, ASHIFT_MAX); 502 return (NULL); 503 } 504 } 505 } 506 507 /* 508 * For a whole disk, defer getting its devid until after labeling it. 509 */ 510 if (S_ISBLK(statbuf.st_mode) && !wholedisk) { 511 /* 512 * Get the devid for the device. 
513 */ 514 int fd; 515 ddi_devid_t devid; 516 char *minor = NULL, *devid_str = NULL; 517 518 if ((fd = open(path, O_RDONLY)) < 0) { 519 (void) fprintf(stderr, gettext("cannot open '%s': " 520 "%s\n"), path, strerror(errno)); 521 nvlist_free(vdev); 522 return (NULL); 523 } 524 525 if (devid_get(fd, &devid) == 0) { 526 if (devid_get_minor_name(fd, &minor) == 0 && 527 (devid_str = devid_str_encode(devid, minor)) != 528 NULL) { 529 verify(nvlist_add_string(vdev, 530 ZPOOL_CONFIG_DEVID, devid_str) == 0); 531 } 532 if (devid_str != NULL) 533 devid_str_free(devid_str); 534 if (minor != NULL) 535 devid_str_free(minor); 536 devid_free(devid); 537 } 538 539 (void) close(fd); 540 } 541 542 if (ashift > 0) 543 (void) nvlist_add_uint64(vdev, ZPOOL_CONFIG_ASHIFT, ashift); 544 545 return (vdev); 546 } 547 548 /* 549 * Go through and verify the replication level of the pool is consistent. 550 * Performs the following checks: 551 * 552 * For the new spec, verifies that devices in mirrors and raidz are the 553 * same size. 554 * 555 * If the current configuration already has inconsistent replication 556 * levels, ignore any other potential problems in the new spec. 557 * 558 * Otherwise, make sure that the current spec (if there is one) and the new 559 * spec have consistent replication levels. 560 * 561 * If there is no current spec (create), make sure new spec has at least 562 * one general purpose vdev. 
 */
typedef struct replication_level {
	char *zprl_type;	/* vdev type string ("disk", "mirror", ...) */
	uint64_t zprl_children;	/* number of children in the toplevel vdev */
	uint64_t zprl_parity;	/* raidz parity level (0 for non-raidz) */
} replication_level_t;

/* Size slop tolerated between devices of the same mirror/raidz (~16MB). */
#define	ZPOOL_FUZZ	(16 * 1024 * 1024)

/*
 * If 'a' is raidz and 'b' is mirror, set *raidz/*mirror accordingly and
 * return B_TRUE; otherwise B_FALSE.  Used to compare the two vdev classes
 * by their tolerated failure count.
 */
static boolean_t
is_raidz_mirror(replication_level_t *a, replication_level_t *b,
    replication_level_t **raidz, replication_level_t **mirror)
{
	if (strcmp(a->zprl_type, "raidz") == 0 &&
	    strcmp(b->zprl_type, "mirror") == 0) {
		*raidz = a;
		*mirror = b;
		return (B_TRUE);
	}
	return (B_FALSE);
}

/*
 * Given a list of toplevel vdevs, return the current replication level.  If
 * the config is inconsistent, then NULL is returned.  If 'fatal' is set, then
 * an error message will be displayed for each self-inconsistent vdev.
 */
static replication_level_t *
get_replication(nvlist_t *nvroot, boolean_t fatal)
{
	nvlist_t **top;
	uint_t t, toplevels;
	nvlist_t **child;
	uint_t c, children;
	nvlist_t *nv;
	char *type;
	replication_level_t lastrep = {0};
	replication_level_t rep;
	replication_level_t *ret;
	replication_level_t *raidz, *mirror;
	boolean_t dontreport;

	ret = safe_malloc(sizeof (replication_level_t));

	verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
	    &top, &toplevels) == 0);

	for (t = 0; t < toplevels; t++) {
		uint64_t is_log = B_FALSE;

		nv = top[t];

		/*
		 * For separate logs we ignore the top level vdev replication
		 * constraints.
		 */
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &is_log);
		if (is_log)
			continue;

		verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE,
		    &type) == 0);
		if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
		    &child, &children) != 0) {
			/*
			 * This is a 'file' or 'disk' vdev.
			 */
			rep.zprl_type = type;
			rep.zprl_children = 1;
			rep.zprl_parity = 0;
		} else {
			uint64_t vdev_size;

			/*
			 * This is a mirror or RAID-Z vdev.  Go through and make
			 * sure the contents are all the same (files vs. disks),
			 * keeping track of the number of elements in the
			 * process.
			 *
			 * We also check that the size of each vdev (if it can
			 * be determined) is the same.
			 */
			rep.zprl_type = type;
			rep.zprl_children = 0;

			if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) {
				verify(nvlist_lookup_uint64(nv,
				    ZPOOL_CONFIG_NPARITY,
				    &rep.zprl_parity) == 0);
				assert(rep.zprl_parity != 0);
			} else {
				rep.zprl_parity = 0;
			}

			/*
			 * The 'dontreport' variable indicates that we've
			 * already reported an error for this spec, so don't
			 * bother doing it again.
			 */
			type = NULL;
			dontreport = 0;
			vdev_size = -1ULL;
			for (c = 0; c < children; c++) {
				nvlist_t *cnv = child[c];
				char *path;
				struct stat64 statbuf;
				uint64_t size = -1ULL;
				char *childtype;
				int fd, err;

				rep.zprl_children++;

				verify(nvlist_lookup_string(cnv,
				    ZPOOL_CONFIG_TYPE, &childtype) == 0);

				/*
				 * If this is a replacing or spare vdev, then
				 * get the real first child of the vdev: do this
				 * in a loop because replacing and spare vdevs
				 * can be nested.
				 */
				while (strcmp(childtype,
				    VDEV_TYPE_REPLACING) == 0 ||
				    strcmp(childtype, VDEV_TYPE_SPARE) == 0) {
					nvlist_t **rchild;
					uint_t rchildren;

					verify(nvlist_lookup_nvlist_array(cnv,
					    ZPOOL_CONFIG_CHILDREN, &rchild,
					    &rchildren) == 0);
					assert(rchildren == 2);
					cnv = rchild[0];

					verify(nvlist_lookup_string(cnv,
					    ZPOOL_CONFIG_TYPE,
					    &childtype) == 0);
				}

				verify(nvlist_lookup_string(cnv,
				    ZPOOL_CONFIG_PATH, &path) == 0);

				/*
				 * If we have a raidz/mirror that combines disks
				 * with files, report it as an error.
				 */
				if (!dontreport && type != NULL &&
				    strcmp(type, childtype) != 0) {
					if (ret != NULL)
						free(ret);
					ret = NULL;
					if (fatal)
						vdev_error(gettext(
						    "mismatched replication "
						    "level: %s contains both "
						    "files and devices\n"),
						    rep.zprl_type);
					else
						return (NULL);
					dontreport = B_TRUE;
				}

				/*
				 * According to stat(2), the value of 'st_size'
				 * is undefined for block devices and character
				 * devices.  But there is no effective way to
				 * determine the real size in userland.
				 *
				 * Instead, we'll take advantage of an
				 * implementation detail of spec_size().  If the
				 * device is currently open, then we (should)
				 * return a valid size.
				 *
				 * If we still don't get a valid size (indicated
				 * by a size of 0 or MAXOFFSET_T), then ignore
				 * this device altogether.
				 */
				if ((fd = open(path, O_RDONLY)) >= 0) {
					err = fstat64(fd, &statbuf);
					(void) close(fd);
				} else {
					err = stat64(path, &statbuf);
				}

				if (err != 0 ||
				    statbuf.st_size == 0 ||
				    statbuf.st_size == MAXOFFSET_T)
					continue;

				size = statbuf.st_size;

				/*
				 * Also make sure that devices and
				 * slices have a consistent size.  If
				 * they differ by a significant amount
				 * (~16MB) then report an error.
				 *
				 * NOTE(review): 'size - vdev_size' is an
				 * unsigned (uint64_t) subtraction passed to
				 * labs(); for size < vdev_size the wrapped
				 * value converted to long is implementation-
				 * defined.  Consider comparing both
				 * directions explicitly -- TODO confirm.
				 */
				if (!dontreport &&
				    (vdev_size != -1ULL &&
				    (labs(size - vdev_size) >
				    ZPOOL_FUZZ))) {
					if (ret != NULL)
						free(ret);
					ret = NULL;
					if (fatal)
						vdev_error(gettext(
						    "%s contains devices of "
						    "different sizes\n"),
						    rep.zprl_type);
					else
						return (NULL);
					dontreport = B_TRUE;
				}

				type = childtype;
				vdev_size = size;
			}
		}

		/*
		 * At this point, we have the replication of the last toplevel
		 * vdev in 'rep'.  Compare it to 'lastrep' to see if it is
		 * different.
		 */
		if (lastrep.zprl_type != NULL) {
			if (is_raidz_mirror(&lastrep, &rep, &raidz, &mirror) ||
			    is_raidz_mirror(&rep, &lastrep, &raidz, &mirror)) {
				/*
				 * Accepted raidz and mirror when they can
				 * handle the same number of disk failures.
				 */
				if (raidz->zprl_parity !=
				    mirror->zprl_children - 1) {
					if (ret != NULL)
						free(ret);
					ret = NULL;
					if (fatal)
						vdev_error(gettext(
						    "mismatched replication "
						    "level: "
						    "%s and %s vdevs with "
						    "different redundancy, "
						    "%llu vs. %llu (%llu-way) "
						    "are present\n"),
						    raidz->zprl_type,
						    mirror->zprl_type,
						    raidz->zprl_parity,
						    mirror->zprl_children - 1,
						    mirror->zprl_children);
					else
						return (NULL);
				}
			} else if (strcmp(lastrep.zprl_type, rep.zprl_type) !=
			    0) {
				if (ret != NULL)
					free(ret);
				ret = NULL;
				if (fatal)
					vdev_error(gettext(
					    "mismatched replication level: "
					    "both %s and %s vdevs are "
					    "present\n"),
					    lastrep.zprl_type, rep.zprl_type);
				else
					return (NULL);
			} else if (lastrep.zprl_parity != rep.zprl_parity) {
				if (ret)
					free(ret);
				ret = NULL;
				if (fatal)
					vdev_error(gettext(
					    "mismatched replication level: "
					    "both %llu and %llu device parity "
					    "%s vdevs are present\n"),
					    lastrep.zprl_parity,
					    rep.zprl_parity,
					    rep.zprl_type);
				else
					return (NULL);
			} else if (lastrep.zprl_children != rep.zprl_children) {
				if (ret)
					free(ret);
				ret = NULL;
				if (fatal)
					vdev_error(gettext(
					    "mismatched replication level: "
					    "both %llu-way and %llu-way %s "
					    "vdevs are present\n"),
					    lastrep.zprl_children,
					    rep.zprl_children,
					    rep.zprl_type);
				else
					return (NULL);
			}
		}
		lastrep = rep;
	}

	if (ret != NULL)
		*ret = rep;

	return (ret);
}

/*
 * Check the replication level of the vdev spec against the current pool.
 * Calls get_replication() to make sure the new spec is self-consistent.  If
 * the pool has a consistent replication level, then we ignore any errors.
 * Otherwise, report any difference between the two.
 *
 * Returns 0 if the new spec is acceptable, -1 otherwise.
 */
static int
check_replication(nvlist_t *config, nvlist_t *newroot)
{
	nvlist_t **child;
	uint_t children;
	replication_level_t *current = NULL, *new;
	replication_level_t *raidz, *mirror;
	int ret;

	/*
	 * If we have a current pool configuration, check to see if it's
	 * self-consistent.  If not, simply return success.
	 */
	if (config != NULL) {
		nvlist_t *nvroot;

		verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
		    &nvroot) == 0);
		if ((current = get_replication(nvroot, B_FALSE)) == NULL)
			return (0);
	}
	/*
	 * for spares there may be no children, and therefore no
	 * replication level to check
	 */
	if ((nvlist_lookup_nvlist_array(newroot, ZPOOL_CONFIG_CHILDREN,
	    &child, &children) != 0) || (children == 0)) {
		free(current);
		return (0);
	}

	/*
	 * If all we have is logs then there's no replication level to check.
	 */
	if (num_logs(newroot) == children) {
		free(current);
		return (0);
	}

	/*
	 * Get the replication level of the new vdev spec, reporting any
	 * inconsistencies found.
	 */
	if ((new = get_replication(newroot, B_TRUE)) == NULL) {
		free(current);
		return (-1);
	}

	/*
	 * Check to see if the new vdev spec matches the replication level of
	 * the current pool.
	 */
	ret = 0;
	if (current != NULL) {
		if (is_raidz_mirror(current, new, &raidz, &mirror) ||
		    is_raidz_mirror(new, current, &raidz, &mirror)) {
			if (raidz->zprl_parity != mirror->zprl_children - 1) {
				vdev_error(gettext(
				    "mismatched replication level: pool and "
				    "new vdev with different redundancy, %s "
				    "and %s vdevs, %llu vs. %llu (%llu-way)\n"),
				    raidz->zprl_type,
				    mirror->zprl_type,
				    raidz->zprl_parity,
				    mirror->zprl_children - 1,
				    mirror->zprl_children);
				ret = -1;
			}
		} else if (strcmp(current->zprl_type, new->zprl_type) != 0) {
			vdev_error(gettext(
			    "mismatched replication level: pool uses %s "
			    "and new vdev is %s\n"),
			    current->zprl_type, new->zprl_type);
			ret = -1;
		} else if (current->zprl_parity != new->zprl_parity) {
			vdev_error(gettext(
			    "mismatched replication level: pool uses %llu "
			    "device parity and new vdev uses %llu\n"),
			    current->zprl_parity, new->zprl_parity);
			ret = -1;
		} else if (current->zprl_children != new->zprl_children) {
			vdev_error(gettext(
			    "mismatched replication level: pool uses %llu-way "
			    "%s and new vdev uses %llu-way %s\n"),
			    current->zprl_children, current->zprl_type,
			    new->zprl_children, new->zprl_type);
			ret = -1;
		}
	}

	free(new);
	if (current != NULL)
		free(current);

	return (ret);
}

/*
 * Go through and find any whole disks in the vdev specification, labelling them
 * as appropriate.  When constructing the vdev spec, we were unable to open this
 * device in order to provide a devid.  Now that we have labelled the disk and
 * know the pool slice is valid, we can construct the devid now.
 *
 * If the disk was already labeled with an EFI label, we will have gotten the
 * devid already (because we were able to open the whole disk).  Otherwise, we
 * need to get the devid after we label the disk.
 */
static int
make_disks(zpool_handle_t *zhp, nvlist_t *nv, zpool_boot_label_t boot_type,
    uint64_t boot_size)
{
	nvlist_t **child;
	uint_t c, children;
	char *type, *path, *diskname;
	char buf[MAXPATHLEN];
	uint64_t wholedisk;
	int fd;
	int ret;
	int slice;
	ddi_devid_t devid;
	char *minor = NULL, *devid_str = NULL;

	verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0);

	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
	    &child, &children) != 0) {

		/* Leaf vdev: only disks need labeling. */
		if (strcmp(type, VDEV_TYPE_DISK) != 0)
			return (0);

		/*
		 * We have a disk device.  Get the path to the device
		 * and see if it's a whole disk by appending the backup
		 * slice and stat()ing the device.
		 */
		verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0);

		diskname = strrchr(path, '/');
		assert(diskname != NULL);
		diskname++;

		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
		    &wholedisk) != 0 || !wholedisk) {
			/*
			 * This is not whole disk, return error if
			 * boot partition creation was requested
			 */
			if (boot_type == ZPOOL_CREATE_BOOT_LABEL) {
				(void) fprintf(stderr,
				    gettext("creating boot partition is only "
				    "supported on whole disk vdevs: %s\n"),
				    diskname);
				return (-1);
			}
			return (0);
		}

		ret = zpool_label_disk(g_zfs, zhp, diskname, boot_type,
		    boot_size, &slice);
		if (ret == -1)
			return (ret);

		/*
		 * Fill in the devid, now that we've labeled the disk.
		 */
		(void) snprintf(buf, sizeof (buf), "%ss%d", path, slice);
		if ((fd = open(buf, O_RDONLY)) < 0) {
			(void) fprintf(stderr,
			    gettext("cannot open '%s': %s\n"),
			    buf, strerror(errno));
			return (-1);
		}

		if (devid_get(fd, &devid) == 0) {
			if (devid_get_minor_name(fd, &minor) == 0 &&
			    (devid_str = devid_str_encode(devid, minor)) !=
			    NULL) {
				verify(nvlist_add_string(nv,
				    ZPOOL_CONFIG_DEVID, devid_str) == 0);
			}
			if (devid_str != NULL)
				devid_str_free(devid_str);
			if (minor != NULL)
				devid_str_free(minor);
			devid_free(devid);
		}

		/*
		 * Update the path to refer to the pool slice.  The presence of
		 * the 'whole_disk' field indicates to the CLI that we should
		 * chop off the slice number when displaying the device in
		 * future output.
		 */
		verify(nvlist_add_string(nv, ZPOOL_CONFIG_PATH, buf) == 0);

		(void) close(fd);

		return (0);
	}

	/* illumos kernel does not support booting from multi-vdev pools. */
	if ((boot_type == ZPOOL_CREATE_BOOT_LABEL)) {
		if ((strcmp(type, VDEV_TYPE_ROOT) == 0) && children > 1) {
			(void) fprintf(stderr, gettext("boot pool "
			    "can not have more than one vdev\n"));
			return (-1);
		}
	}

	/* Recurse over normal children, then spares and cache devices. */
	for (c = 0; c < children; c++) {
		ret = make_disks(zhp, child[c], boot_type, boot_size);
		if (ret != 0)
			return (ret);
	}

	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES,
	    &child, &children) == 0)
		for (c = 0; c < children; c++) {
			ret = make_disks(zhp, child[c], boot_type, boot_size);
			if (ret != 0)
				return (ret);
		}

	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE,
	    &child, &children) == 0)
		for (c = 0; c < children; c++) {
			ret = make_disks(zhp, child[c], boot_type, boot_size);
			if (ret != 0)
				return (ret);
		}

	return (0);
}

/*
 * Determine if the given path is a hot spare within the given configuration.
 */
static boolean_t
is_spare(nvlist_t *config, const char *path)
{
	int fd;
	pool_state_t state;
	char *name = NULL;
	nvlist_t *label;
	uint64_t guid, spareguid;
	nvlist_t *nvroot;
	nvlist_t **spares;
	uint_t i, nspares;
	boolean_t inuse;

	if ((fd = open(path, O_RDONLY)) < 0)
		return (B_FALSE);

	if (zpool_in_use(g_zfs, fd, &state, &name, &inuse) != 0 ||
	    !inuse ||
	    state != POOL_STATE_SPARE ||
	    zpool_read_label(fd, &label, NULL) != 0) {
		free(name);
		(void) close(fd);
		return (B_FALSE);
	}
	free(name);
	(void) close(fd);

	verify(nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) == 0);
	nvlist_free(label);

	/* Match the device's guid against the config's spare list. */
	verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
	    &nvroot) == 0);
	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
	    &spares, &nspares) == 0) {
		for (i = 0; i < nspares; i++) {
			verify(nvlist_lookup_uint64(spares[i],
			    ZPOOL_CONFIG_GUID, &spareguid) == 0);
			if (spareguid == guid)
				return (B_TRUE);
		}
	}

	return (B_FALSE);
}

/*
 * Go through and find any devices that are in use.  We rely on libdiskmgt for
 * the majority of this task.
 */
static boolean_t
is_device_in_use(nvlist_t *config, nvlist_t *nv, boolean_t force,
    boolean_t replacing, boolean_t isspare)
{
	nvlist_t **child;
	uint_t c, children;
	char *type, *path;
	int ret = 0;
	char buf[MAXPATHLEN];
	uint64_t wholedisk;
	boolean_t anyinuse = B_FALSE;

	verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0);

	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
	    &child, &children) != 0) {

		verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0);

		/*
		 * As a generic check, we look to see if this is a replace of a
		 * hot spare within the same pool.  If so, we allow it
		 * regardless of what libdiskmgt or zpool_in_use() says.
1181 */ 1182 if (replacing) { 1183 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, 1184 &wholedisk) == 0 && wholedisk) 1185 (void) snprintf(buf, sizeof (buf), "%ss0", 1186 path); 1187 else 1188 (void) strlcpy(buf, path, sizeof (buf)); 1189 1190 if (is_spare(config, buf)) 1191 return (B_FALSE); 1192 } 1193 1194 if (strcmp(type, VDEV_TYPE_DISK) == 0) 1195 ret = check_device(path, force, isspare); 1196 else if (strcmp(type, VDEV_TYPE_FILE) == 0) 1197 ret = check_file(path, force, isspare); 1198 1199 return (ret != 0); 1200 } 1201 1202 for (c = 0; c < children; c++) 1203 if (is_device_in_use(config, child[c], force, replacing, 1204 B_FALSE)) 1205 anyinuse = B_TRUE; 1206 1207 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, 1208 &child, &children) == 0) 1209 for (c = 0; c < children; c++) 1210 if (is_device_in_use(config, child[c], force, replacing, 1211 B_TRUE)) 1212 anyinuse = B_TRUE; 1213 1214 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, 1215 &child, &children) == 0) 1216 for (c = 0; c < children; c++) 1217 if (is_device_in_use(config, child[c], force, replacing, 1218 B_FALSE)) 1219 anyinuse = B_TRUE; 1220 1221 return (anyinuse); 1222 } 1223 1224 static const char * 1225 is_grouping(const char *type, int *mindev, int *maxdev) 1226 { 1227 if (strncmp(type, "raidz", 5) == 0) { 1228 const char *p = type + 5; 1229 char *end; 1230 long nparity; 1231 1232 if (*p == '\0') { 1233 nparity = 1; 1234 } else if (*p == '0') { 1235 return (NULL); /* no zero prefixes allowed */ 1236 } else { 1237 errno = 0; 1238 nparity = strtol(p, &end, 10); 1239 if (errno != 0 || nparity < 1 || nparity >= 255 || 1240 *end != '\0') 1241 return (NULL); 1242 } 1243 1244 if (mindev != NULL) 1245 *mindev = nparity + 1; 1246 if (maxdev != NULL) 1247 *maxdev = 255; 1248 return (VDEV_TYPE_RAIDZ); 1249 } 1250 1251 if (maxdev != NULL) 1252 *maxdev = INT_MAX; 1253 1254 if (strcmp(type, "mirror") == 0) { 1255 if (mindev != NULL) 1256 *mindev = 2; 1257 return (VDEV_TYPE_MIRROR); 1258 
} 1259 1260 if (strcmp(type, "spare") == 0) { 1261 if (mindev != NULL) 1262 *mindev = 1; 1263 return (VDEV_TYPE_SPARE); 1264 } 1265 1266 if (strcmp(type, "log") == 0) { 1267 if (mindev != NULL) 1268 *mindev = 1; 1269 return (VDEV_TYPE_LOG); 1270 } 1271 1272 if (strcmp(type, VDEV_ALLOC_BIAS_SPECIAL) == 0 || 1273 strcmp(type, VDEV_ALLOC_BIAS_DEDUP) == 0) { 1274 if (mindev != NULL) 1275 *mindev = 1; 1276 return (type); 1277 } 1278 1279 if (strcmp(type, "cache") == 0) { 1280 if (mindev != NULL) 1281 *mindev = 1; 1282 return (VDEV_TYPE_L2CACHE); 1283 } 1284 1285 return (NULL); 1286 } 1287 1288 /* 1289 * Construct a syntactically valid vdev specification, 1290 * and ensure that all devices and files exist and can be opened. 1291 * Note: we don't bother freeing anything in the error paths 1292 * because the program is just going to exit anyway. 1293 */ 1294 nvlist_t * 1295 construct_spec(nvlist_t *props, int argc, char **argv) 1296 { 1297 nvlist_t *nvroot, *nv, **top, **spares, **l2cache; 1298 int t, toplevels, mindev, maxdev, nspares, nlogs, nl2cache; 1299 const char *type; 1300 uint64_t is_log, is_special, is_dedup; 1301 boolean_t seen_logs; 1302 1303 top = NULL; 1304 toplevels = 0; 1305 spares = NULL; 1306 l2cache = NULL; 1307 nspares = 0; 1308 nlogs = 0; 1309 nl2cache = 0; 1310 is_log = is_special = is_dedup = B_FALSE; 1311 seen_logs = B_FALSE; 1312 nvroot = NULL; 1313 1314 while (argc > 0) { 1315 nv = NULL; 1316 1317 /* 1318 * If it's a mirror or raidz, the subsequent arguments are 1319 * its leaves -- until we encounter the next mirror or raidz. 
1320 */ 1321 if ((type = is_grouping(argv[0], &mindev, &maxdev)) != NULL) { 1322 nvlist_t **child = NULL; 1323 int c, children = 0; 1324 1325 if (strcmp(type, VDEV_TYPE_SPARE) == 0) { 1326 if (spares != NULL) { 1327 (void) fprintf(stderr, 1328 gettext("invalid vdev " 1329 "specification: 'spare' can be " 1330 "specified only once\n")); 1331 goto spec_out; 1332 } 1333 is_log = is_special = is_dedup = B_FALSE; 1334 } 1335 1336 if (strcmp(type, VDEV_TYPE_LOG) == 0) { 1337 if (seen_logs) { 1338 (void) fprintf(stderr, 1339 gettext("invalid vdev " 1340 "specification: 'log' can be " 1341 "specified only once\n")); 1342 goto spec_out; 1343 } 1344 seen_logs = B_TRUE; 1345 is_log = B_TRUE; 1346 is_special = B_FALSE; 1347 is_dedup = B_FALSE; 1348 argc--; 1349 argv++; 1350 /* 1351 * A log is not a real grouping device. 1352 * We just set is_log and continue. 1353 */ 1354 continue; 1355 } 1356 1357 if (strcmp(type, VDEV_ALLOC_BIAS_SPECIAL) == 0) { 1358 is_special = B_TRUE; 1359 is_log = B_FALSE; 1360 is_dedup = B_FALSE; 1361 argc--; 1362 argv++; 1363 continue; 1364 } 1365 1366 if (strcmp(type, VDEV_ALLOC_BIAS_DEDUP) == 0) { 1367 is_dedup = B_TRUE; 1368 is_log = B_FALSE; 1369 is_special = B_FALSE; 1370 argc--; 1371 argv++; 1372 continue; 1373 } 1374 1375 if (strcmp(type, VDEV_TYPE_L2CACHE) == 0) { 1376 if (l2cache != NULL) { 1377 (void) fprintf(stderr, 1378 gettext("invalid vdev " 1379 "specification: 'cache' can be " 1380 "specified only once\n")); 1381 goto spec_out; 1382 } 1383 is_log = is_special = is_dedup = B_FALSE; 1384 } 1385 1386 if (is_log || is_special || is_dedup) { 1387 if (strcmp(type, VDEV_TYPE_MIRROR) != 0) { 1388 (void) fprintf(stderr, 1389 gettext("invalid vdev " 1390 "specification: unsupported '%s' " 1391 "device: %s\n"), is_log ? 
"log" : 1392 "special", type); 1393 goto spec_out; 1394 } 1395 nlogs++; 1396 } 1397 1398 for (c = 1; c < argc; c++) { 1399 if (is_grouping(argv[c], NULL, NULL) != NULL) 1400 break; 1401 children++; 1402 child = realloc(child, 1403 children * sizeof (nvlist_t *)); 1404 if (child == NULL) 1405 zpool_no_memory(); 1406 if ((nv = make_leaf_vdev(props, argv[c], 1407 B_FALSE)) == NULL) { 1408 for (c = 0; c < children - 1; c++) 1409 nvlist_free(child[c]); 1410 free(child); 1411 goto spec_out; 1412 } 1413 child[children - 1] = nv; 1414 } 1415 1416 if (children < mindev) { 1417 (void) fprintf(stderr, gettext("invalid vdev " 1418 "specification: %s requires at least %d " 1419 "devices\n"), argv[0], mindev); 1420 goto spec_out; 1421 } 1422 1423 if (children > maxdev) { 1424 (void) fprintf(stderr, gettext("invalid vdev " 1425 "specification: %s supports no more than " 1426 "%d devices\n"), argv[0], maxdev); 1427 goto spec_out; 1428 } 1429 1430 argc -= c; 1431 argv += c; 1432 1433 if (strcmp(type, VDEV_TYPE_SPARE) == 0) { 1434 spares = child; 1435 nspares = children; 1436 continue; 1437 } else if (strcmp(type, VDEV_TYPE_L2CACHE) == 0) { 1438 l2cache = child; 1439 nl2cache = children; 1440 continue; 1441 } else { 1442 /* create a top-level vdev with children */ 1443 verify(nvlist_alloc(&nv, NV_UNIQUE_NAME, 1444 0) == 0); 1445 verify(nvlist_add_string(nv, ZPOOL_CONFIG_TYPE, 1446 type) == 0); 1447 verify(nvlist_add_uint64(nv, 1448 ZPOOL_CONFIG_IS_LOG, is_log) == 0); 1449 if (is_log) 1450 verify(nvlist_add_string(nv, 1451 ZPOOL_CONFIG_ALLOCATION_BIAS, 1452 VDEV_ALLOC_BIAS_LOG) == 0); 1453 if (is_special) { 1454 verify(nvlist_add_string(nv, 1455 ZPOOL_CONFIG_ALLOCATION_BIAS, 1456 VDEV_ALLOC_BIAS_SPECIAL) == 0); 1457 } 1458 if (is_dedup) { 1459 verify(nvlist_add_string(nv, 1460 ZPOOL_CONFIG_ALLOCATION_BIAS, 1461 VDEV_ALLOC_BIAS_DEDUP) == 0); 1462 } 1463 if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) { 1464 verify(nvlist_add_uint64(nv, 1465 ZPOOL_CONFIG_NPARITY, 1466 mindev - 1) == 0); 1467 } 
1468 verify(nvlist_add_nvlist_array(nv, 1469 ZPOOL_CONFIG_CHILDREN, child, 1470 children) == 0); 1471 1472 for (c = 0; c < children; c++) 1473 nvlist_free(child[c]); 1474 free(child); 1475 } 1476 } else { 1477 /* 1478 * We have a device. Pass off to make_leaf_vdev() to 1479 * construct the appropriate nvlist describing the vdev. 1480 */ 1481 if ((nv = make_leaf_vdev(props, argv[0], is_log)) 1482 == NULL) 1483 goto spec_out; 1484 if (is_log) 1485 nlogs++; 1486 if (is_special) { 1487 verify(nvlist_add_string(nv, 1488 ZPOOL_CONFIG_ALLOCATION_BIAS, 1489 VDEV_ALLOC_BIAS_SPECIAL) == 0); 1490 } 1491 if (is_dedup) { 1492 verify(nvlist_add_string(nv, 1493 ZPOOL_CONFIG_ALLOCATION_BIAS, 1494 VDEV_ALLOC_BIAS_DEDUP) == 0); 1495 } 1496 argc--; 1497 argv++; 1498 } 1499 1500 toplevels++; 1501 top = realloc(top, toplevels * sizeof (nvlist_t *)); 1502 if (top == NULL) 1503 zpool_no_memory(); 1504 top[toplevels - 1] = nv; 1505 } 1506 1507 if (toplevels == 0 && nspares == 0 && nl2cache == 0) { 1508 (void) fprintf(stderr, gettext("invalid vdev " 1509 "specification: at least one toplevel vdev must be " 1510 "specified\n")); 1511 goto spec_out; 1512 } 1513 1514 if (seen_logs && nlogs == 0) { 1515 (void) fprintf(stderr, gettext("invalid vdev specification: " 1516 "log requires at least 1 device\n")); 1517 goto spec_out; 1518 } 1519 1520 /* 1521 * Finally, create nvroot and add all top-level vdevs to it. 
1522 */ 1523 verify(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) == 0); 1524 verify(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, 1525 VDEV_TYPE_ROOT) == 0); 1526 verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 1527 top, toplevels) == 0); 1528 if (nspares != 0) 1529 verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 1530 spares, nspares) == 0); 1531 if (nl2cache != 0) 1532 verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 1533 l2cache, nl2cache) == 0); 1534 1535 spec_out: 1536 for (t = 0; t < toplevels; t++) 1537 nvlist_free(top[t]); 1538 for (t = 0; t < nspares; t++) 1539 nvlist_free(spares[t]); 1540 for (t = 0; t < nl2cache; t++) 1541 nvlist_free(l2cache[t]); 1542 1543 free(spares); 1544 free(l2cache); 1545 free(top); 1546 1547 return (nvroot); 1548 } 1549 1550 nvlist_t * 1551 split_mirror_vdev(zpool_handle_t *zhp, char *newname, nvlist_t *props, 1552 splitflags_t flags, int argc, char **argv) 1553 { 1554 nvlist_t *newroot = NULL, **child; 1555 uint_t c, children; 1556 zpool_boot_label_t boot_type; 1557 1558 if (argc > 0) { 1559 if ((newroot = construct_spec(props, argc, argv)) == NULL) { 1560 (void) fprintf(stderr, gettext("Unable to build a " 1561 "pool from the specified devices\n")); 1562 return (NULL); 1563 } 1564 1565 if (zpool_is_bootable(zhp)) 1566 boot_type = ZPOOL_COPY_BOOT_LABEL; 1567 else 1568 boot_type = ZPOOL_NO_BOOT_LABEL; 1569 1570 if (!flags.dryrun && 1571 make_disks(zhp, newroot, boot_type, 0) != 0) { 1572 nvlist_free(newroot); 1573 return (NULL); 1574 } 1575 1576 /* avoid any tricks in the spec */ 1577 verify(nvlist_lookup_nvlist_array(newroot, 1578 ZPOOL_CONFIG_CHILDREN, &child, &children) == 0); 1579 for (c = 0; c < children; c++) { 1580 char *path; 1581 const char *type; 1582 int min, max; 1583 1584 verify(nvlist_lookup_string(child[c], 1585 ZPOOL_CONFIG_PATH, &path) == 0); 1586 if ((type = is_grouping(path, &min, &max)) != NULL) { 1587 (void) fprintf(stderr, gettext("Cannot use " 1588 "'%s' as a device for 
splitting\n"), type); 1589 nvlist_free(newroot); 1590 return (NULL); 1591 } 1592 } 1593 } 1594 1595 if (zpool_vdev_split(zhp, newname, &newroot, props, flags) != 0) { 1596 nvlist_free(newroot); 1597 return (NULL); 1598 } 1599 1600 return (newroot); 1601 } 1602 1603 static int 1604 num_normal_vdevs(nvlist_t *nvroot) 1605 { 1606 nvlist_t **top; 1607 uint_t t, toplevels, normal = 0; 1608 1609 verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 1610 &top, &toplevels) == 0); 1611 1612 for (t = 0; t < toplevels; t++) { 1613 uint64_t log = B_FALSE; 1614 1615 (void) nvlist_lookup_uint64(top[t], ZPOOL_CONFIG_IS_LOG, &log); 1616 if (log) 1617 continue; 1618 if (nvlist_exists(top[t], ZPOOL_CONFIG_ALLOCATION_BIAS)) 1619 continue; 1620 1621 normal++; 1622 } 1623 1624 return (normal); 1625 } 1626 1627 /* 1628 * Get and validate the contents of the given vdev specification. This ensures 1629 * that the nvlist returned is well-formed, that all the devices exist, and that 1630 * they are not currently in use by any other known consumer. The 'poolconfig' 1631 * parameter is the current configuration of the pool when adding devices 1632 * existing pool, and is used to perform additional checks, such as changing the 1633 * replication level of the pool. It can be 'NULL' to indicate that this is a 1634 * new pool. The 'force' flag controls whether devices should be forcefully 1635 * added, even if they appear in use. 1636 */ 1637 nvlist_t * 1638 make_root_vdev(zpool_handle_t *zhp, nvlist_t *props, int force, int check_rep, 1639 boolean_t replacing, boolean_t dryrun, zpool_boot_label_t boot_type, 1640 uint64_t boot_size, int argc, char **argv) 1641 { 1642 nvlist_t *newroot; 1643 nvlist_t *poolconfig = NULL; 1644 is_force = force; 1645 1646 /* 1647 * Construct the vdev specification. If this is successful, we know 1648 * that we have a valid specification, and that all devices can be 1649 * opened. 
1650 */ 1651 if ((newroot = construct_spec(props, argc, argv)) == NULL) 1652 return (NULL); 1653 1654 if (zhp && ((poolconfig = zpool_get_config(zhp, NULL)) == NULL)) 1655 return (NULL); 1656 1657 /* 1658 * Validate each device to make sure that its not shared with another 1659 * subsystem. We do this even if 'force' is set, because there are some 1660 * uses (such as a dedicated dump device) that even '-f' cannot 1661 * override. 1662 */ 1663 if (is_device_in_use(poolconfig, newroot, force, replacing, B_FALSE)) { 1664 nvlist_free(newroot); 1665 return (NULL); 1666 } 1667 1668 /* 1669 * Check the replication level of the given vdevs and report any errors 1670 * found. We include the existing pool spec, if any, as we need to 1671 * catch changes against the existing replication level. 1672 */ 1673 if (check_rep && check_replication(poolconfig, newroot) != 0) { 1674 nvlist_free(newroot); 1675 return (NULL); 1676 } 1677 1678 /* 1679 * On pool create the new vdev spec must have one normal vdev. 1680 */ 1681 if (poolconfig == NULL && num_normal_vdevs(newroot) == 0) { 1682 vdev_error(gettext("at least one general top-level vdev must " 1683 "be specified\n")); 1684 nvlist_free(newroot); 1685 return (NULL); 1686 } 1687 1688 /* 1689 * Run through the vdev specification and label any whole disks found. 1690 */ 1691 if (!dryrun && make_disks(zhp, newroot, boot_type, boot_size) != 0) { 1692 nvlist_free(newroot); 1693 return (NULL); 1694 } 1695 1696 return (newroot); 1697 } 1698