1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 24 * Copyright (c) 2013, 2018 by Delphix. All rights reserved. 25 * Copyright (c) 2016, 2017 Intel Corporation. 26 * Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>. 27 */ 28 29 /* 30 * Functions to convert between a list of vdevs and an nvlist representing the 31 * configuration. Each entry in the list can be one of: 32 * 33 * Device vdevs 34 * disk=(path=..., devid=...) 35 * file=(path=...) 36 * 37 * Group vdevs 38 * raidz[1|2]=(...) 39 * mirror=(...) 40 * 41 * Hot spares 42 * 43 * While the underlying implementation supports it, group vdevs cannot contain 44 * other group vdevs. All userland verification of devices is contained within 45 * this file. If successful, the nvlist returned can be passed directly to the 46 * kernel; we've done as much verification as possible in userland. 47 * 48 * Hot spares are a special case, and passed down as an array of disk vdevs, at 49 * the same level as the root of the vdev tree. 50 * 51 * The only function exported by this file is 'make_root_vdev'. The 52 * function performs several passes: 53 * 54 * 1. Construct the vdev specification. Performs syntax validation and 55 * makes sure each device is valid. 56 * 2. Check for devices in use. Using libdiskmgt, makes sure that no 57 * devices are also in use. Some can be overridden using the 'force' 58 * flag, others cannot. 59 * 3. Check for replication errors if the 'force' flag is not specified. 60 * validates that the replication level is consistent across the 61 * entire pool. 62 * 4. Call libzfs to label any whole disks with an EFI label. 63 */ 64 65 #include <assert.h> 66 #include <devid.h> 67 #include <errno.h> 68 #include <fcntl.h> 69 #include <libdiskmgt.h> 70 #include <libintl.h> 71 #include <libnvpair.h> 72 #include <limits.h> 73 #include <stdio.h> 74 #include <string.h> 75 #include <unistd.h> 76 #include <sys/efi_partition.h> 77 #include <sys/stat.h> 78 #include <sys/vtoc.h> 79 #include <sys/mntent.h> 80 81 #include "zpool_util.h" 82 83 #define BACKUP_SLICE "s2" 84 85 /* 86 * For any given vdev specification, we can have multiple errors. The 87 * vdev_error() function keeps track of whether we have seen an error yet, and 88 * prints out a header if its the first error we've seen. 89 */ 90 boolean_t error_seen; 91 boolean_t is_force; 92 93 /*PRINTFLIKE1*/ 94 static void 95 vdev_error(const char *fmt, ...) 96 { 97 va_list ap; 98 99 if (!error_seen) { 100 (void) fprintf(stderr, gettext("invalid vdev specification\n")); 101 if (!is_force) 102 (void) fprintf(stderr, gettext("use '-f' to override " 103 "the following errors:\n")); 104 else 105 (void) fprintf(stderr, gettext("the following errors " 106 "must be manually repaired:\n")); 107 error_seen = B_TRUE; 108 } 109 110 va_start(ap, fmt); 111 (void) vfprintf(stderr, fmt, ap); 112 va_end(ap); 113 } 114 115 static void 116 libdiskmgt_error(int error) 117 { 118 /* 119 * ENXIO/ENODEV is a valid error message if the device doesn't live in 120 * /dev/dsk. Don't bother printing an error message in this case. 121 */ 122 if (error == ENXIO || error == ENODEV) 123 return; 124 125 (void) fprintf(stderr, gettext("warning: device in use checking " 126 "failed: %s\n"), strerror(error)); 127 } 128 129 /* 130 * Validate a device, passing the bulk of the work off to libdiskmgt. 131 */ 132 static int 133 check_slice(const char *path, int force, boolean_t wholedisk, boolean_t isspare) 134 { 135 char *msg; 136 int error = 0; 137 dm_who_type_t who; 138 139 if (force) 140 who = DM_WHO_ZPOOL_FORCE; 141 else if (isspare) 142 who = DM_WHO_ZPOOL_SPARE; 143 else 144 who = DM_WHO_ZPOOL; 145 146 if (dm_inuse((char *)path, &msg, who, &error) || error) { 147 if (error != 0) { 148 libdiskmgt_error(error); 149 return (0); 150 } else { 151 vdev_error("%s", msg); 152 free(msg); 153 return (-1); 154 } 155 } 156 157 /* 158 * If we're given a whole disk, ignore overlapping slices since we're 159 * about to label it anyway. 160 */ 161 error = 0; 162 if (!wholedisk && !force && 163 (dm_isoverlapping((char *)path, &msg, &error) || error)) { 164 if (error == 0) { 165 /* dm_isoverlapping returned -1 */ 166 vdev_error(gettext("%s overlaps with %s\n"), path, msg); 167 free(msg); 168 return (-1); 169 } else if (error != ENODEV) { 170 /* libdiskmgt's devcache only handles physical drives */ 171 libdiskmgt_error(error); 172 return (0); 173 } 174 } 175 176 return (0); 177 } 178 179 180 /* 181 * Validate a whole disk. Iterate over all slices on the disk and make sure 182 * that none is in use by calling check_slice(). 183 */ 184 static int 185 check_disk(const char *name, dm_descriptor_t disk, int force, int isspare) 186 { 187 dm_descriptor_t *drive, *media, *slice; 188 int err = 0; 189 int i; 190 int ret; 191 192 /* 193 * Get the drive associated with this disk. This should never fail, 194 * because we already have an alias handle open for the device. 195 */ 196 if ((drive = dm_get_associated_descriptors(disk, DM_DRIVE, 197 &err)) == NULL || *drive == 0) { 198 if (err) 199 libdiskmgt_error(err); 200 return (0); 201 } 202 203 if ((media = dm_get_associated_descriptors(*drive, DM_MEDIA, 204 &err)) == NULL) { 205 dm_free_descriptors(drive); 206 if (err) 207 libdiskmgt_error(err); 208 return (0); 209 } 210 211 dm_free_descriptors(drive); 212 213 /* 214 * It is possible that the user has specified a removable media drive, 215 * and the media is not present. 216 */ 217 if (*media == 0) { 218 dm_free_descriptors(media); 219 vdev_error(gettext("'%s' has no media in drive\n"), name); 220 return (-1); 221 } 222 223 if ((slice = dm_get_associated_descriptors(*media, DM_SLICE, 224 &err)) == NULL) { 225 dm_free_descriptors(media); 226 if (err) 227 libdiskmgt_error(err); 228 return (0); 229 } 230 231 dm_free_descriptors(media); 232 233 ret = 0; 234 235 /* 236 * Iterate over all slices and report any errors. We don't care about 237 * overlapping slices because we are using the whole disk. 238 */ 239 for (i = 0; slice[i] != 0; i++) { 240 char *name = dm_get_name(slice[i], &err); 241 242 if (check_slice(name, force, B_TRUE, isspare) != 0) 243 ret = -1; 244 245 dm_free_name(name); 246 } 247 248 dm_free_descriptors(slice); 249 return (ret); 250 } 251 252 /* 253 * Validate a device. 254 */ 255 static int 256 check_device(const char *path, boolean_t force, boolean_t isspare) 257 { 258 dm_descriptor_t desc; 259 int err; 260 char *dev; 261 262 /* 263 * For whole disks, libdiskmgt does not include the leading dev path. 264 */ 265 dev = strrchr(path, '/'); 266 assert(dev != NULL); 267 dev++; 268 if ((desc = dm_get_descriptor_by_name(DM_ALIAS, dev, &err)) != 0) { 269 err = check_disk(path, desc, force, isspare); 270 dm_free_descriptor(desc); 271 return (err); 272 } 273 274 return (check_slice(path, force, B_FALSE, isspare)); 275 } 276 277 /* 278 * Check that a file is valid. All we can do in this case is check that it's 279 * not in use by another pool, and not in use by swap. 280 */ 281 static int 282 check_file(const char *file, boolean_t force, boolean_t isspare) 283 { 284 char *name; 285 int fd; 286 int ret = 0; 287 int err; 288 pool_state_t state; 289 boolean_t inuse; 290 291 if (dm_inuse_swap(file, &err)) { 292 if (err) 293 libdiskmgt_error(err); 294 else 295 vdev_error(gettext("%s is currently used by swap. " 296 "Please see swap(1M).\n"), file); 297 return (-1); 298 } 299 300 if ((fd = open(file, O_RDONLY)) < 0) 301 return (0); 302 303 if (zpool_in_use(g_zfs, fd, &state, &name, &inuse) == 0 && inuse) { 304 const char *desc; 305 306 switch (state) { 307 case POOL_STATE_ACTIVE: 308 desc = gettext("active"); 309 break; 310 311 case POOL_STATE_EXPORTED: 312 desc = gettext("exported"); 313 break; 314 315 case POOL_STATE_POTENTIALLY_ACTIVE: 316 desc = gettext("potentially active"); 317 break; 318 319 default: 320 desc = gettext("unknown"); 321 break; 322 } 323 324 /* 325 * Allow hot spares to be shared between pools. 326 */ 327 if (state == POOL_STATE_SPARE && isspare) 328 return (0); 329 330 if (state == POOL_STATE_ACTIVE || 331 state == POOL_STATE_SPARE || !force) { 332 switch (state) { 333 case POOL_STATE_SPARE: 334 vdev_error(gettext("%s is reserved as a hot " 335 "spare for pool %s\n"), file, name); 336 break; 337 default: 338 vdev_error(gettext("%s is part of %s pool " 339 "'%s'\n"), file, desc, name); 340 break; 341 } 342 ret = -1; 343 } 344 345 free(name); 346 } 347 348 (void) close(fd); 349 return (ret); 350 } 351 352 353 /* 354 * By "whole disk" we mean an entire physical disk (something we can 355 * label, toggle the write cache on, etc.) as opposed to the full 356 * capacity of a pseudo-device such as lofi or did. We act as if we 357 * are labeling the disk, which should be a pretty good test of whether 358 * it's a viable device or not. Returns B_TRUE if it is and B_FALSE if 359 * it isn't. 360 */ 361 static boolean_t 362 is_whole_disk(const char *arg) 363 { 364 struct dk_gpt *label; 365 int fd; 366 char path[MAXPATHLEN]; 367 368 (void) snprintf(path, sizeof (path), "%s%s%s", 369 ZFS_RDISK_ROOT, strrchr(arg, '/'), BACKUP_SLICE); 370 if ((fd = open(path, O_RDWR | O_NDELAY)) < 0) 371 return (B_FALSE); 372 if (efi_alloc_and_init(fd, EFI_NUMPAR, &label) != 0) { 373 (void) close(fd); 374 return (B_FALSE); 375 } 376 efi_free(label); 377 (void) close(fd); 378 return (B_TRUE); 379 } 380 381 /* 382 * Create a leaf vdev. Determine if this is a file or a device. If it's a 383 * device, fill in the device id to make a complete nvlist. Valid forms for a 384 * leaf vdev are: 385 * 386 * /dev/dsk/xxx Complete disk path 387 * /xxx Full path to file 388 * xxx Shorthand for /dev/dsk/xxx 389 */ 390 static nvlist_t * 391 make_leaf_vdev(const char *arg, uint64_t is_log) 392 { 393 char path[MAXPATHLEN]; 394 struct stat64 statbuf; 395 nvlist_t *vdev = NULL; 396 char *type = NULL; 397 boolean_t wholedisk = B_FALSE; 398 399 /* 400 * Determine what type of vdev this is, and put the full path into 401 * 'path'. We detect whether this is a device of file afterwards by 402 * checking the st_mode of the file. 403 */ 404 if (arg[0] == '/') { 405 /* 406 * Complete device or file path. Exact type is determined by 407 * examining the file descriptor afterwards. 408 */ 409 wholedisk = is_whole_disk(arg); 410 if (!wholedisk && (stat64(arg, &statbuf) != 0)) { 411 (void) fprintf(stderr, 412 gettext("cannot open '%s': %s\n"), 413 arg, strerror(errno)); 414 return (NULL); 415 } 416 417 (void) strlcpy(path, arg, sizeof (path)); 418 } else { 419 /* 420 * This may be a short path for a device, or it could be total 421 * gibberish. Check to see if it's a known device in 422 * /dev/dsk/. As part of this check, see if we've been given a 423 * an entire disk (minus the slice number). 424 */ 425 (void) snprintf(path, sizeof (path), "%s/%s", ZFS_DISK_ROOT, 426 arg); 427 wholedisk = is_whole_disk(path); 428 if (!wholedisk && (stat64(path, &statbuf) != 0)) { 429 /* 430 * If we got ENOENT, then the user gave us 431 * gibberish, so try to direct them with a 432 * reasonable error message. Otherwise, 433 * regurgitate strerror() since it's the best we 434 * can do. 435 */ 436 if (errno == ENOENT) { 437 (void) fprintf(stderr, 438 gettext("cannot open '%s': no such " 439 "device in %s\n"), arg, ZFS_DISK_ROOT); 440 (void) fprintf(stderr, 441 gettext("must be a full path or " 442 "shorthand device name\n")); 443 return (NULL); 444 } else { 445 (void) fprintf(stderr, 446 gettext("cannot open '%s': %s\n"), 447 path, strerror(errno)); 448 return (NULL); 449 } 450 } 451 } 452 453 /* 454 * Determine whether this is a device or a file. 455 */ 456 if (wholedisk || S_ISBLK(statbuf.st_mode)) { 457 type = VDEV_TYPE_DISK; 458 } else if (S_ISREG(statbuf.st_mode)) { 459 type = VDEV_TYPE_FILE; 460 } else { 461 (void) fprintf(stderr, gettext("cannot use '%s': must be a " 462 "block device or regular file\n"), path); 463 return (NULL); 464 } 465 466 /* 467 * Finally, we have the complete device or file, and we know that it is 468 * acceptable to use. Construct the nvlist to describe this vdev. All 469 * vdevs have a 'path' element, and devices also have a 'devid' element. 470 */ 471 verify(nvlist_alloc(&vdev, NV_UNIQUE_NAME, 0) == 0); 472 verify(nvlist_add_string(vdev, ZPOOL_CONFIG_PATH, path) == 0); 473 verify(nvlist_add_string(vdev, ZPOOL_CONFIG_TYPE, type) == 0); 474 verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_IS_LOG, is_log) == 0); 475 if (is_log) 476 verify(nvlist_add_string(vdev, ZPOOL_CONFIG_ALLOCATION_BIAS, 477 VDEV_ALLOC_BIAS_LOG) == 0); 478 if (strcmp(type, VDEV_TYPE_DISK) == 0) 479 verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK, 480 (uint64_t)wholedisk) == 0); 481 482 /* 483 * For a whole disk, defer getting its devid until after labeling it. 484 */ 485 if (S_ISBLK(statbuf.st_mode) && !wholedisk) { 486 /* 487 * Get the devid for the device. 488 */ 489 int fd; 490 ddi_devid_t devid; 491 char *minor = NULL, *devid_str = NULL; 492 493 if ((fd = open(path, O_RDONLY)) < 0) { 494 (void) fprintf(stderr, gettext("cannot open '%s': " 495 "%s\n"), path, strerror(errno)); 496 nvlist_free(vdev); 497 return (NULL); 498 } 499 500 if (devid_get(fd, &devid) == 0) { 501 if (devid_get_minor_name(fd, &minor) == 0 && 502 (devid_str = devid_str_encode(devid, minor)) != 503 NULL) { 504 verify(nvlist_add_string(vdev, 505 ZPOOL_CONFIG_DEVID, devid_str) == 0); 506 } 507 if (devid_str != NULL) 508 devid_str_free(devid_str); 509 if (minor != NULL) 510 devid_str_free(minor); 511 devid_free(devid); 512 } 513 514 (void) close(fd); 515 } 516 517 return (vdev); 518 } 519 520 /* 521 * Go through and verify the replication level of the pool is consistent. 522 * Performs the following checks: 523 * 524 * For the new spec, verifies that devices in mirrors and raidz are the 525 * same size. 526 * 527 * If the current configuration already has inconsistent replication 528 * levels, ignore any other potential problems in the new spec. 529 * 530 * Otherwise, make sure that the current spec (if there is one) and the new 531 * spec have consistent replication levels. 532 * 533 * If there is no current spec (create), make sure new spec has at least 534 * one general purpose vdev. 535 */ 536 typedef struct replication_level { 537 char *zprl_type; 538 uint64_t zprl_children; 539 uint64_t zprl_parity; 540 } replication_level_t; 541 542 #define ZPOOL_FUZZ (16 * 1024 * 1024) 543 544 static boolean_t 545 is_raidz_mirror(replication_level_t *a, replication_level_t *b, 546 replication_level_t **raidz, replication_level_t **mirror) 547 { 548 if (strcmp(a->zprl_type, "raidz") == 0 && 549 strcmp(b->zprl_type, "mirror") == 0) { 550 *raidz = a; 551 *mirror = b; 552 return (B_TRUE); 553 } 554 return (B_FALSE); 555 } 556 557 /* 558 * Given a list of toplevel vdevs, return the current replication level. If 559 * the config is inconsistent, then NULL is returned. If 'fatal' is set, then 560 * an error message will be displayed for each self-inconsistent vdev. 561 */ 562 static replication_level_t * 563 get_replication(nvlist_t *nvroot, boolean_t fatal) 564 { 565 nvlist_t **top; 566 uint_t t, toplevels; 567 nvlist_t **child; 568 uint_t c, children; 569 nvlist_t *nv; 570 char *type; 571 replication_level_t lastrep = {0}; 572 replication_level_t rep; 573 replication_level_t *ret; 574 replication_level_t *raidz, *mirror; 575 boolean_t dontreport; 576 577 ret = safe_malloc(sizeof (replication_level_t)); 578 579 verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 580 &top, &toplevels) == 0); 581 582 for (t = 0; t < toplevels; t++) { 583 uint64_t is_log = B_FALSE; 584 585 nv = top[t]; 586 587 /* 588 * For separate logs we ignore the top level vdev replication 589 * constraints. 590 */ 591 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &is_log); 592 if (is_log) 593 continue; 594 595 verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, 596 &type) == 0); 597 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 598 &child, &children) != 0) { 599 /* 600 * This is a 'file' or 'disk' vdev. 601 */ 602 rep.zprl_type = type; 603 rep.zprl_children = 1; 604 rep.zprl_parity = 0; 605 } else { 606 uint64_t vdev_size; 607 608 /* 609 * This is a mirror or RAID-Z vdev. Go through and make 610 * sure the contents are all the same (files vs. disks), 611 * keeping track of the number of elements in the 612 * process. 613 * 614 * We also check that the size of each vdev (if it can 615 * be determined) is the same. 616 */ 617 rep.zprl_type = type; 618 rep.zprl_children = 0; 619 620 if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) { 621 verify(nvlist_lookup_uint64(nv, 622 ZPOOL_CONFIG_NPARITY, 623 &rep.zprl_parity) == 0); 624 assert(rep.zprl_parity != 0); 625 } else { 626 rep.zprl_parity = 0; 627 } 628 629 /* 630 * The 'dontreport' variable indicates that we've 631 * already reported an error for this spec, so don't 632 * bother doing it again. 633 */ 634 type = NULL; 635 dontreport = 0; 636 vdev_size = -1ULL; 637 for (c = 0; c < children; c++) { 638 nvlist_t *cnv = child[c]; 639 char *path; 640 struct stat64 statbuf; 641 uint64_t size = -1ULL; 642 char *childtype; 643 int fd, err; 644 645 rep.zprl_children++; 646 647 verify(nvlist_lookup_string(cnv, 648 ZPOOL_CONFIG_TYPE, &childtype) == 0); 649 650 /* 651 * If this is a replacing or spare vdev, then 652 * get the real first child of the vdev: do this 653 * in a loop because replacing and spare vdevs 654 * can be nested. 655 */ 656 while (strcmp(childtype, 657 VDEV_TYPE_REPLACING) == 0 || 658 strcmp(childtype, VDEV_TYPE_SPARE) == 0) { 659 nvlist_t **rchild; 660 uint_t rchildren; 661 662 verify(nvlist_lookup_nvlist_array(cnv, 663 ZPOOL_CONFIG_CHILDREN, &rchild, 664 &rchildren) == 0); 665 assert(rchildren == 2); 666 cnv = rchild[0]; 667 668 verify(nvlist_lookup_string(cnv, 669 ZPOOL_CONFIG_TYPE, 670 &childtype) == 0); 671 } 672 673 verify(nvlist_lookup_string(cnv, 674 ZPOOL_CONFIG_PATH, &path) == 0); 675 676 /* 677 * If we have a raidz/mirror that combines disks 678 * with files, report it as an error. 679 */ 680 if (!dontreport && type != NULL && 681 strcmp(type, childtype) != 0) { 682 if (ret != NULL) 683 free(ret); 684 ret = NULL; 685 if (fatal) 686 vdev_error(gettext( 687 "mismatched replication " 688 "level: %s contains both " 689 "files and devices\n"), 690 rep.zprl_type); 691 else 692 return (NULL); 693 dontreport = B_TRUE; 694 } 695 696 /* 697 * According to stat(2), the value of 'st_size' 698 * is undefined for block devices and character 699 * devices. But there is no effective way to 700 * determine the real size in userland. 701 * 702 * Instead, we'll take advantage of an 703 * implementation detail of spec_size(). If the 704 * device is currently open, then we (should) 705 * return a valid size. 706 * 707 * If we still don't get a valid size (indicated 708 * by a size of 0 or MAXOFFSET_T), then ignore 709 * this device altogether. 710 */ 711 if ((fd = open(path, O_RDONLY)) >= 0) { 712 err = fstat64(fd, &statbuf); 713 (void) close(fd); 714 } else { 715 err = stat64(path, &statbuf); 716 } 717 718 if (err != 0 || 719 statbuf.st_size == 0 || 720 statbuf.st_size == MAXOFFSET_T) 721 continue; 722 723 size = statbuf.st_size; 724 725 /* 726 * Also make sure that devices and 727 * slices have a consistent size. If 728 * they differ by a significant amount 729 * (~16MB) then report an error. 730 */ 731 if (!dontreport && 732 (vdev_size != -1ULL && 733 (labs(size - vdev_size) > 734 ZPOOL_FUZZ))) { 735 if (ret != NULL) 736 free(ret); 737 ret = NULL; 738 if (fatal) 739 vdev_error(gettext( 740 "%s contains devices of " 741 "different sizes\n"), 742 rep.zprl_type); 743 else 744 return (NULL); 745 dontreport = B_TRUE; 746 } 747 748 type = childtype; 749 vdev_size = size; 750 } 751 } 752 753 /* 754 * At this point, we have the replication of the last toplevel 755 * vdev in 'rep'. Compare it to 'lastrep' to see if it is 756 * different. 757 */ 758 if (lastrep.zprl_type != NULL) { 759 if (is_raidz_mirror(&lastrep, &rep, &raidz, &mirror) || 760 is_raidz_mirror(&rep, &lastrep, &raidz, &mirror)) { 761 /* 762 * Accepted raidz and mirror when they can 763 * handle the same number of disk failures. 764 */ 765 if (raidz->zprl_parity != 766 mirror->zprl_children - 1) { 767 if (ret != NULL) 768 free(ret); 769 ret = NULL; 770 if (fatal) 771 vdev_error(gettext( 772 "mismatched replication " 773 "level: " 774 "%s and %s vdevs with " 775 "different redundancy, " 776 "%llu vs. %llu (%llu-way) " 777 "are present\n"), 778 raidz->zprl_type, 779 mirror->zprl_type, 780 raidz->zprl_parity, 781 mirror->zprl_children - 1, 782 mirror->zprl_children); 783 else 784 return (NULL); 785 } 786 } else if (strcmp(lastrep.zprl_type, rep.zprl_type) != 787 0) { 788 if (ret != NULL) 789 free(ret); 790 ret = NULL; 791 if (fatal) 792 vdev_error(gettext( 793 "mismatched replication level: " 794 "both %s and %s vdevs are " 795 "present\n"), 796 lastrep.zprl_type, rep.zprl_type); 797 else 798 return (NULL); 799 } else if (lastrep.zprl_parity != rep.zprl_parity) { 800 if (ret) 801 free(ret); 802 ret = NULL; 803 if (fatal) 804 vdev_error(gettext( 805 "mismatched replication level: " 806 "both %llu and %llu device parity " 807 "%s vdevs are present\n"), 808 lastrep.zprl_parity, 809 rep.zprl_parity, 810 rep.zprl_type); 811 else 812 return (NULL); 813 } else if (lastrep.zprl_children != rep.zprl_children) { 814 if (ret) 815 free(ret); 816 ret = NULL; 817 if (fatal) 818 vdev_error(gettext( 819 "mismatched replication level: " 820 "both %llu-way and %llu-way %s " 821 "vdevs are present\n"), 822 lastrep.zprl_children, 823 rep.zprl_children, 824 rep.zprl_type); 825 else 826 return (NULL); 827 } 828 } 829 lastrep = rep; 830 } 831 832 if (ret != NULL) 833 *ret = rep; 834 835 return (ret); 836 } 837 838 /* 839 * Check the replication level of the vdev spec against the current pool. Calls 840 * get_replication() to make sure the new spec is self-consistent. If the pool 841 * has a consistent replication level, then we ignore any errors. Otherwise, 842 * report any difference between the two. 843 */ 844 static int 845 check_replication(nvlist_t *config, nvlist_t *newroot) 846 { 847 nvlist_t **child; 848 uint_t children; 849 replication_level_t *current = NULL, *new; 850 replication_level_t *raidz, *mirror; 851 int ret; 852 853 /* 854 * If we have a current pool configuration, check to see if it's 855 * self-consistent. If not, simply return success. 856 */ 857 if (config != NULL) { 858 nvlist_t *nvroot; 859 860 verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 861 &nvroot) == 0); 862 if ((current = get_replication(nvroot, B_FALSE)) == NULL) 863 return (0); 864 } 865 /* 866 * for spares there may be no children, and therefore no 867 * replication level to check 868 */ 869 if ((nvlist_lookup_nvlist_array(newroot, ZPOOL_CONFIG_CHILDREN, 870 &child, &children) != 0) || (children == 0)) { 871 free(current); 872 return (0); 873 } 874 875 /* 876 * If all we have is logs then there's no replication level to check. 877 */ 878 if (num_logs(newroot) == children) { 879 free(current); 880 return (0); 881 } 882 883 /* 884 * Get the replication level of the new vdev spec, reporting any 885 * inconsistencies found. 886 */ 887 if ((new = get_replication(newroot, B_TRUE)) == NULL) { 888 free(current); 889 return (-1); 890 } 891 892 /* 893 * Check to see if the new vdev spec matches the replication level of 894 * the current pool. 895 */ 896 ret = 0; 897 if (current != NULL) { 898 if (is_raidz_mirror(current, new, &raidz, &mirror) || 899 is_raidz_mirror(new, current, &raidz, &mirror)) { 900 if (raidz->zprl_parity != mirror->zprl_children - 1) { 901 vdev_error(gettext( 902 "mismatched replication level: pool and " 903 "new vdev with different redundancy, %s " 904 "and %s vdevs, %llu vs. %llu (%llu-way)\n"), 905 raidz->zprl_type, 906 mirror->zprl_type, 907 raidz->zprl_parity, 908 mirror->zprl_children - 1, 909 mirror->zprl_children); 910 ret = -1; 911 } 912 } else if (strcmp(current->zprl_type, new->zprl_type) != 0) { 913 vdev_error(gettext( 914 "mismatched replication level: pool uses %s " 915 "and new vdev is %s\n"), 916 current->zprl_type, new->zprl_type); 917 ret = -1; 918 } else if (current->zprl_parity != new->zprl_parity) { 919 vdev_error(gettext( 920 "mismatched replication level: pool uses %llu " 921 "device parity and new vdev uses %llu\n"), 922 current->zprl_parity, new->zprl_parity); 923 ret = -1; 924 } else if (current->zprl_children != new->zprl_children) { 925 vdev_error(gettext( 926 "mismatched replication level: pool uses %llu-way " 927 "%s and new vdev uses %llu-way %s\n"), 928 current->zprl_children, current->zprl_type, 929 new->zprl_children, new->zprl_type); 930 ret = -1; 931 } 932 } 933 934 free(new); 935 if (current != NULL) 936 free(current); 937 938 return (ret); 939 } 940 941 /* 942 * Go through and find any whole disks in the vdev specification, labelling them 943 * as appropriate. When constructing the vdev spec, we were unable to open this 944 * device in order to provide a devid. Now that we have labelled the disk and 945 * know the pool slice is valid, we can construct the devid now. 946 * 947 * If the disk was already labeled with an EFI label, we will have gotten the 948 * devid already (because we were able to open the whole disk). Otherwise, we 949 * need to get the devid after we label the disk. 950 */ 951 static int 952 make_disks(zpool_handle_t *zhp, nvlist_t *nv, zpool_boot_label_t boot_type, 953 uint64_t boot_size) 954 { 955 nvlist_t **child; 956 uint_t c, children; 957 char *type, *path, *diskname; 958 char buf[MAXPATHLEN]; 959 uint64_t wholedisk; 960 int fd; 961 int ret; 962 int slice; 963 ddi_devid_t devid; 964 char *minor = NULL, *devid_str = NULL; 965 966 verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0); 967 968 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 969 &child, &children) != 0) { 970 971 if (strcmp(type, VDEV_TYPE_DISK) != 0) 972 return (0); 973 974 /* 975 * We have a disk device. Get the path to the device 976 * and see if it's a whole disk by appending the backup 977 * slice and stat()ing the device. 978 */ 979 verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0); 980 981 diskname = strrchr(path, '/'); 982 assert(diskname != NULL); 983 diskname++; 984 985 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, 986 &wholedisk) != 0 || !wholedisk) { 987 /* 988 * This is not whole disk, return error if 989 * boot partition creation was requested 990 */ 991 if (boot_type == ZPOOL_CREATE_BOOT_LABEL) { 992 (void) fprintf(stderr, 993 gettext("creating boot partition is only " 994 "supported on whole disk vdevs: %s\n"), 995 diskname); 996 return (-1); 997 } 998 return (0); 999 } 1000 1001 ret = zpool_label_disk(g_zfs, zhp, diskname, boot_type, 1002 boot_size, &slice); 1003 if (ret == -1) 1004 return (ret); 1005 1006 /* 1007 * Fill in the devid, now that we've labeled the disk. 1008 */ 1009 (void) snprintf(buf, sizeof (buf), "%ss%d", path, slice); 1010 if ((fd = open(buf, O_RDONLY)) < 0) { 1011 (void) fprintf(stderr, 1012 gettext("cannot open '%s': %s\n"), 1013 buf, strerror(errno)); 1014 return (-1); 1015 } 1016 1017 if (devid_get(fd, &devid) == 0) { 1018 if (devid_get_minor_name(fd, &minor) == 0 && 1019 (devid_str = devid_str_encode(devid, minor)) != 1020 NULL) { 1021 verify(nvlist_add_string(nv, 1022 ZPOOL_CONFIG_DEVID, devid_str) == 0); 1023 } 1024 if (devid_str != NULL) 1025 devid_str_free(devid_str); 1026 if (minor != NULL) 1027 devid_str_free(minor); 1028 devid_free(devid); 1029 } 1030 1031 /* 1032 * Update the path to refer to the pool slice. The presence of 1033 * the 'whole_disk' field indicates to the CLI that we should 1034 * chop off the slice number when displaying the device in 1035 * future output. 1036 */ 1037 verify(nvlist_add_string(nv, ZPOOL_CONFIG_PATH, buf) == 0); 1038 1039 (void) close(fd); 1040 1041 return (0); 1042 } 1043 1044 /* illumos kernel does not support booting from multi-vdev pools. */ 1045 if ((boot_type == ZPOOL_CREATE_BOOT_LABEL)) { 1046 if ((strcmp(type, VDEV_TYPE_ROOT) == 0) && children > 1) { 1047 (void) fprintf(stderr, gettext("boot pool " 1048 "can not have more than one vdev\n")); 1049 return (-1); 1050 } 1051 } 1052 1053 for (c = 0; c < children; c++) { 1054 ret = make_disks(zhp, child[c], boot_type, boot_size); 1055 if (ret != 0) 1056 return (ret); 1057 } 1058 1059 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, 1060 &child, &children) == 0) 1061 for (c = 0; c < children; c++) { 1062 ret = make_disks(zhp, child[c], boot_type, boot_size); 1063 if (ret != 0) 1064 return (ret); 1065 } 1066 1067 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, 1068 &child, &children) == 0) 1069 for (c = 0; c < children; c++) { 1070 ret = make_disks(zhp, child[c], boot_type, boot_size); 1071 if (ret != 0) 1072 return (ret); 1073 } 1074 1075 return (0); 1076 } 1077 1078 /* 1079 * Determine if the given path is a hot spare within the given configuration. 1080 */ 1081 static boolean_t 1082 is_spare(nvlist_t *config, const char *path) 1083 { 1084 int fd; 1085 pool_state_t state; 1086 char *name = NULL; 1087 nvlist_t *label; 1088 uint64_t guid, spareguid; 1089 nvlist_t *nvroot; 1090 nvlist_t **spares; 1091 uint_t i, nspares; 1092 boolean_t inuse; 1093 1094 if ((fd = open(path, O_RDONLY)) < 0) 1095 return (B_FALSE); 1096 1097 if (zpool_in_use(g_zfs, fd, &state, &name, &inuse) != 0 || 1098 !inuse || 1099 state != POOL_STATE_SPARE || 1100 zpool_read_label(fd, &label) != 0) { 1101 free(name); 1102 (void) close(fd); 1103 return (B_FALSE); 1104 } 1105 free(name); 1106 (void) close(fd); 1107 1108 verify(nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) == 0); 1109 nvlist_free(label); 1110 1111 verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 1112 &nvroot) == 0); 1113 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 1114 &spares, &nspares) == 0) { 1115 for (i = 0; i < nspares; i++) { 1116 verify(nvlist_lookup_uint64(spares[i], 1117 ZPOOL_CONFIG_GUID, &spareguid) == 0); 1118 if (spareguid == guid) 1119 return (B_TRUE); 1120 } 1121 } 1122 1123 return (B_FALSE); 1124 } 1125 1126 /* 1127 * Go through and find any devices that are in use. We rely on libdiskmgt for 1128 * the majority of this task. 1129 */ 1130 static boolean_t 1131 is_device_in_use(nvlist_t *config, nvlist_t *nv, boolean_t force, 1132 boolean_t replacing, boolean_t isspare) 1133 { 1134 nvlist_t **child; 1135 uint_t c, children; 1136 char *type, *path; 1137 int ret = 0; 1138 char buf[MAXPATHLEN]; 1139 uint64_t wholedisk; 1140 boolean_t anyinuse = B_FALSE; 1141 1142 verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0); 1143 1144 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 1145 &child, &children) != 0) { 1146 1147 verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0); 1148 1149 /* 1150 * As a generic check, we look to see if this is a replace of a 1151 * hot spare within the same pool. If so, we allow it 1152 * regardless of what libdiskmgt or zpool_in_use() says. 1153 */ 1154 if (replacing) { 1155 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, 1156 &wholedisk) == 0 && wholedisk) 1157 (void) snprintf(buf, sizeof (buf), "%ss0", 1158 path); 1159 else 1160 (void) strlcpy(buf, path, sizeof (buf)); 1161 1162 if (is_spare(config, buf)) 1163 return (B_FALSE); 1164 } 1165 1166 if (strcmp(type, VDEV_TYPE_DISK) == 0) 1167 ret = check_device(path, force, isspare); 1168 else if (strcmp(type, VDEV_TYPE_FILE) == 0) 1169 ret = check_file(path, force, isspare); 1170 1171 return (ret != 0); 1172 } 1173 1174 for (c = 0; c < children; c++) 1175 if (is_device_in_use(config, child[c], force, replacing, 1176 B_FALSE)) 1177 anyinuse = B_TRUE; 1178 1179 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, 1180 &child, &children) == 0) 1181 for (c = 0; c < children; c++) 1182 if (is_device_in_use(config, child[c], force, replacing, 1183 B_TRUE)) 1184 anyinuse = B_TRUE; 1185 1186 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, 1187 &child, &children) == 0) 1188 for (c = 0; c < children; c++) 1189 if (is_device_in_use(config, child[c], force, replacing, 1190 B_FALSE)) 1191 anyinuse = B_TRUE; 1192 1193 return (anyinuse); 1194 } 1195 1196 static const char * 1197 is_grouping(const char *type, int *mindev, int *maxdev) 1198 { 1199 if (strncmp(type, "raidz", 5) == 0) { 1200 const char *p = type + 5; 1201 char *end; 1202 long nparity; 1203 1204 if (*p == '\0') { 1205 nparity = 1; 1206 } else if (*p == '0') { 1207 return (NULL); /* no zero prefixes allowed */ 1208 } else { 1209 errno = 0; 1210 nparity = strtol(p, &end, 10); 1211 if (errno != 0 || nparity < 1 || nparity >= 255 || 1212 *end != '\0') 1213 return (NULL); 1214 } 1215 1216 if (mindev != NULL) 1217 *mindev = nparity + 1; 1218 if (maxdev != NULL) 1219 *maxdev = 255; 1220 return (VDEV_TYPE_RAIDZ); 1221 } 1222 1223 if (maxdev != NULL) 1224 *maxdev = INT_MAX; 1225 1226 if (strcmp(type, "mirror") == 0) { 1227 if (mindev != NULL) 1228 *mindev = 2; 1229 return (VDEV_TYPE_MIRROR); 1230 } 1231 1232 if (strcmp(type, "spare") == 0) { 1233 if (mindev != NULL) 1234 *mindev = 1; 1235 return (VDEV_TYPE_SPARE); 1236 } 1237 1238 if (strcmp(type, "log") == 0) { 1239 if (mindev != NULL) 1240 *mindev = 1; 1241 return (VDEV_TYPE_LOG); 1242 } 1243 1244 if (strcmp(type, VDEV_ALLOC_BIAS_SPECIAL) == 0 || 1245 strcmp(type, VDEV_ALLOC_BIAS_DEDUP) == 0) { 1246 if (mindev != NULL) 1247 *mindev = 1; 1248 return (type); 1249 } 1250 1251 if (strcmp(type, "cache") == 0) { 1252 if (mindev != NULL) 1253 *mindev = 1; 1254 return (VDEV_TYPE_L2CACHE); 1255 } 1256 1257 return (NULL); 1258 } 1259 1260 /* 1261 * Construct a syntactically valid vdev specification, 1262 * and ensure that all devices and files exist and can be opened. 1263 * Note: we don't bother freeing anything in the error paths 1264 * because the program is just going to exit anyway. 1265 */ 1266 nvlist_t * 1267 construct_spec(int argc, char **argv) 1268 { 1269 nvlist_t *nvroot, *nv, **top, **spares, **l2cache; 1270 int t, toplevels, mindev, maxdev, nspares, nlogs, nl2cache; 1271 const char *type; 1272 uint64_t is_log, is_special, is_dedup; 1273 boolean_t seen_logs; 1274 1275 top = NULL; 1276 toplevels = 0; 1277 spares = NULL; 1278 l2cache = NULL; 1279 nspares = 0; 1280 nlogs = 0; 1281 nl2cache = 0; 1282 is_log = is_special = is_dedup = B_FALSE; 1283 seen_logs = B_FALSE; 1284 1285 while (argc > 0) { 1286 nv = NULL; 1287 1288 /* 1289 * If it's a mirror or raidz, the subsequent arguments are 1290 * its leaves -- until we encounter the next mirror or raidz. 1291 */ 1292 if ((type = is_grouping(argv[0], &mindev, &maxdev)) != NULL) { 1293 nvlist_t **child = NULL; 1294 int c, children = 0; 1295 1296 if (strcmp(type, VDEV_TYPE_SPARE) == 0) { 1297 if (spares != NULL) { 1298 (void) fprintf(stderr, 1299 gettext("invalid vdev " 1300 "specification: 'spare' can be " 1301 "specified only once\n")); 1302 return (NULL); 1303 } 1304 is_log = is_special = is_dedup = B_FALSE; 1305 } 1306 1307 if (strcmp(type, VDEV_TYPE_LOG) == 0) { 1308 if (seen_logs) { 1309 (void) fprintf(stderr, 1310 gettext("invalid vdev " 1311 "specification: 'log' can be " 1312 "specified only once\n")); 1313 return (NULL); 1314 } 1315 seen_logs = B_TRUE; 1316 is_log = B_TRUE; 1317 is_special = B_FALSE; 1318 is_dedup = B_FALSE; 1319 argc--; 1320 argv++; 1321 /* 1322 * A log is not a real grouping device. 1323 * We just set is_log and continue. 1324 */ 1325 continue; 1326 } 1327 1328 if (strcmp(type, VDEV_ALLOC_BIAS_SPECIAL) == 0) { 1329 is_special = B_TRUE; 1330 is_log = B_FALSE; 1331 is_dedup = B_FALSE; 1332 argc--; 1333 argv++; 1334 continue; 1335 } 1336 1337 if (strcmp(type, VDEV_ALLOC_BIAS_DEDUP) == 0) { 1338 is_dedup = B_TRUE; 1339 is_log = B_FALSE; 1340 is_special = B_FALSE; 1341 argc--; 1342 argv++; 1343 continue; 1344 } 1345 1346 if (strcmp(type, VDEV_TYPE_L2CACHE) == 0) { 1347 if (l2cache != NULL) { 1348 (void) fprintf(stderr, 1349 gettext("invalid vdev " 1350 "specification: 'cache' can be " 1351 "specified only once\n")); 1352 return (NULL); 1353 } 1354 is_log = is_special = is_dedup = B_FALSE; 1355 } 1356 1357 if (is_log || is_special || is_dedup) { 1358 if (strcmp(type, VDEV_TYPE_MIRROR) != 0) { 1359 (void) fprintf(stderr, 1360 gettext("invalid vdev " 1361 "specification: unsupported '%s' " 1362 "device: %s\n"), is_log ? "log" : 1363 "special", type); 1364 return (NULL); 1365 } 1366 nlogs++; 1367 } 1368 1369 for (c = 1; c < argc; c++) { 1370 if (is_grouping(argv[c], NULL, NULL) != NULL) 1371 break; 1372 children++; 1373 child = realloc(child, 1374 children * sizeof (nvlist_t *)); 1375 if (child == NULL) 1376 zpool_no_memory(); 1377 if ((nv = make_leaf_vdev(argv[c], B_FALSE)) 1378 == NULL) 1379 return (NULL); 1380 child[children - 1] = nv; 1381 } 1382 1383 if (children < mindev) { 1384 (void) fprintf(stderr, gettext("invalid vdev " 1385 "specification: %s requires at least %d " 1386 "devices\n"), argv[0], mindev); 1387 return (NULL); 1388 } 1389 1390 if (children > maxdev) { 1391 (void) fprintf(stderr, gettext("invalid vdev " 1392 "specification: %s supports no more than " 1393 "%d devices\n"), argv[0], maxdev); 1394 return (NULL); 1395 } 1396 1397 argc -= c; 1398 argv += c; 1399 1400 if (strcmp(type, VDEV_TYPE_SPARE) == 0) { 1401 spares = child; 1402 nspares = children; 1403 continue; 1404 } else if (strcmp(type, VDEV_TYPE_L2CACHE) == 0) { 1405 l2cache = child; 1406 nl2cache = children; 1407 continue; 1408 } else { 1409 /* create a top-level vdev with children */ 1410 verify(nvlist_alloc(&nv, NV_UNIQUE_NAME, 1411 0) == 0); 1412 verify(nvlist_add_string(nv, ZPOOL_CONFIG_TYPE, 1413 type) == 0); 1414 verify(nvlist_add_uint64(nv, 1415 ZPOOL_CONFIG_IS_LOG, is_log) == 0); 1416 if (is_log) 1417 verify(nvlist_add_string(nv, 1418 ZPOOL_CONFIG_ALLOCATION_BIAS, 1419 VDEV_ALLOC_BIAS_LOG) == 0); 1420 if (is_special) { 1421 verify(nvlist_add_string(nv, 1422 ZPOOL_CONFIG_ALLOCATION_BIAS, 1423 VDEV_ALLOC_BIAS_SPECIAL) == 0); 1424 } 1425 if (is_dedup) { 1426 verify(nvlist_add_string(nv, 1427 ZPOOL_CONFIG_ALLOCATION_BIAS, 1428 VDEV_ALLOC_BIAS_DEDUP) == 0); 1429 } 1430 if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) { 1431 verify(nvlist_add_uint64(nv, 1432 ZPOOL_CONFIG_NPARITY, 1433 mindev - 1) == 0); 1434 } 1435 verify(nvlist_add_nvlist_array(nv, 1436 ZPOOL_CONFIG_CHILDREN, child, 1437 children) == 0); 1438 1439 for (c = 0; c < children; c++) 1440 nvlist_free(child[c]); 1441 free(child); 1442 } 1443 } else { 1444 /* 1445 * We have a device. Pass off to make_leaf_vdev() to 1446 * construct the appropriate nvlist describing the vdev. 1447 */ 1448 if ((nv = make_leaf_vdev(argv[0], is_log)) == NULL) 1449 return (NULL); 1450 if (is_log) 1451 nlogs++; 1452 if (is_special) { 1453 verify(nvlist_add_string(nv, 1454 ZPOOL_CONFIG_ALLOCATION_BIAS, 1455 VDEV_ALLOC_BIAS_SPECIAL) == 0); 1456 } 1457 if (is_dedup) { 1458 verify(nvlist_add_string(nv, 1459 ZPOOL_CONFIG_ALLOCATION_BIAS, 1460 VDEV_ALLOC_BIAS_DEDUP) == 0); 1461 } 1462 argc--; 1463 argv++; 1464 } 1465 1466 toplevels++; 1467 top = realloc(top, toplevels * sizeof (nvlist_t *)); 1468 if (top == NULL) 1469 zpool_no_memory(); 1470 top[toplevels - 1] = nv; 1471 } 1472 1473 if (toplevels == 0 && nspares == 0 && nl2cache == 0) { 1474 (void) fprintf(stderr, gettext("invalid vdev " 1475 "specification: at least one toplevel vdev must be " 1476 "specified\n")); 1477 return (NULL); 1478 } 1479 1480 if (seen_logs && nlogs == 0) { 1481 (void) fprintf(stderr, gettext("invalid vdev specification: " 1482 "log requires at least 1 device\n")); 1483 return (NULL); 1484 } 1485 1486 /* 1487 * Finally, create nvroot and add all top-level vdevs to it. 1488 */ 1489 verify(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) == 0); 1490 verify(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, 1491 VDEV_TYPE_ROOT) == 0); 1492 verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 1493 top, toplevels) == 0); 1494 if (nspares != 0) 1495 verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 1496 spares, nspares) == 0); 1497 if (nl2cache != 0) 1498 verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 1499 l2cache, nl2cache) == 0); 1500 1501 for (t = 0; t < toplevels; t++) 1502 nvlist_free(top[t]); 1503 for (t = 0; t < nspares; t++) 1504 nvlist_free(spares[t]); 1505 for (t = 0; t < nl2cache; t++) 1506 nvlist_free(l2cache[t]); 1507 if (spares) 1508 free(spares); 1509 if (l2cache) 1510 free(l2cache); 1511 free(top); 1512 1513 return (nvroot); 1514 } 1515 1516 nvlist_t * 1517 split_mirror_vdev(zpool_handle_t *zhp, char *newname, nvlist_t *props, 1518 splitflags_t flags, int argc, char **argv) 1519 { 1520 nvlist_t *newroot = NULL, **child; 1521 uint_t c, children; 1522 zpool_boot_label_t boot_type; 1523 1524 if (argc > 0) { 1525 if ((newroot = construct_spec(argc, argv)) == NULL) { 1526 (void) fprintf(stderr, gettext("Unable to build a " 1527 "pool from the specified devices\n")); 1528 return (NULL); 1529 } 1530 1531 if (zpool_is_bootable(zhp)) 1532 boot_type = ZPOOL_COPY_BOOT_LABEL; 1533 else 1534 boot_type = ZPOOL_NO_BOOT_LABEL; 1535 1536 if (!flags.dryrun && 1537 make_disks(zhp, newroot, boot_type, 0) != 0) { 1538 nvlist_free(newroot); 1539 return (NULL); 1540 } 1541 1542 /* avoid any tricks in the spec */ 1543 verify(nvlist_lookup_nvlist_array(newroot, 1544 ZPOOL_CONFIG_CHILDREN, &child, &children) == 0); 1545 for (c = 0; c < children; c++) { 1546 char *path; 1547 const char *type; 1548 int min, max; 1549 1550 verify(nvlist_lookup_string(child[c], 1551 ZPOOL_CONFIG_PATH, &path) == 0); 1552 if ((type = is_grouping(path, &min, &max)) != NULL) { 1553 (void) fprintf(stderr, gettext("Cannot use " 1554 "'%s' as a device for splitting\n"), type); 1555 nvlist_free(newroot); 1556 return (NULL); 1557 } 1558 } 1559 } 1560 1561 if (zpool_vdev_split(zhp, newname, &newroot, props, flags) != 0) { 1562 nvlist_free(newroot); 1563 return (NULL); 1564 } 1565 1566 return (newroot); 1567 } 1568 1569 static int 1570 num_normal_vdevs(nvlist_t *nvroot) 1571 { 1572 nvlist_t **top; 1573 uint_t t, toplevels, normal = 0; 1574 1575 verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 1576 &top, &toplevels) == 0); 1577 1578 for (t = 0; t < toplevels; t++) { 1579 uint64_t log = B_FALSE; 1580 1581 (void) nvlist_lookup_uint64(top[t], ZPOOL_CONFIG_IS_LOG, &log); 1582 if (log) 1583 continue; 1584 if (nvlist_exists(top[t], ZPOOL_CONFIG_ALLOCATION_BIAS)) 1585 continue; 1586 1587 normal++; 1588 } 1589 1590 return (normal); 1591 } 1592 1593 /* 1594 * Get and validate the contents of the given vdev specification. This ensures 1595 * that the nvlist returned is well-formed, that all the devices exist, and that 1596 * they are not currently in use by any other known consumer. The 'poolconfig' 1597 * parameter is the current configuration of the pool when adding devices 1598 * existing pool, and is used to perform additional checks, such as changing the 1599 * replication level of the pool. It can be 'NULL' to indicate that this is a 1600 * new pool. The 'force' flag controls whether devices should be forcefully 1601 * added, even if they appear in use. 1602 */ 1603 nvlist_t * 1604 make_root_vdev(zpool_handle_t *zhp, int force, int check_rep, 1605 boolean_t replacing, boolean_t dryrun, zpool_boot_label_t boot_type, 1606 uint64_t boot_size, int argc, char **argv) 1607 { 1608 nvlist_t *newroot; 1609 nvlist_t *poolconfig = NULL; 1610 is_force = force; 1611 1612 /* 1613 * Construct the vdev specification. If this is successful, we know 1614 * that we have a valid specification, and that all devices can be 1615 * opened. 1616 */ 1617 if ((newroot = construct_spec(argc, argv)) == NULL) 1618 return (NULL); 1619 1620 if (zhp && ((poolconfig = zpool_get_config(zhp, NULL)) == NULL)) 1621 return (NULL); 1622 1623 /* 1624 * Validate each device to make sure that its not shared with another 1625 * subsystem. We do this even if 'force' is set, because there are some 1626 * uses (such as a dedicated dump device) that even '-f' cannot 1627 * override. 1628 */ 1629 if (is_device_in_use(poolconfig, newroot, force, replacing, B_FALSE)) { 1630 nvlist_free(newroot); 1631 return (NULL); 1632 } 1633 1634 /* 1635 * Check the replication level of the given vdevs and report any errors 1636 * found. We include the existing pool spec, if any, as we need to 1637 * catch changes against the existing replication level. 1638 */ 1639 if (check_rep && check_replication(poolconfig, newroot) != 0) { 1640 nvlist_free(newroot); 1641 return (NULL); 1642 } 1643 1644 /* 1645 * On pool create the new vdev spec must have one normal vdev. 1646 */ 1647 if (poolconfig == NULL && num_normal_vdevs(newroot) == 0) { 1648 vdev_error(gettext("at least one general top-level vdev must " 1649 "be specified\n")); 1650 nvlist_free(newroot); 1651 return (NULL); 1652 } 1653 1654 /* 1655 * Run through the vdev specification and label any whole disks found. 1656 */ 1657 if (!dryrun && make_disks(zhp, newroot, boot_type, boot_size) != 0) { 1658 nvlist_free(newroot); 1659 return (NULL); 1660 } 1661 1662 return (newroot); 1663 } 1664