1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 /* 30 * Functions to convert between a list of vdevs and an nvlist representing the 31 * configuration. Each entry in the list can be one of: 32 * 33 * Device vdevs 34 * disk=(path=..., devid=...) 35 * file=(path=...) 36 * 37 * Group vdevs 38 * raidz[1|2]=(...) 39 * mirror=(...) 40 * 41 * Hot spares 42 * 43 * While the underlying implementation supports it, group vdevs cannot contain 44 * other group vdevs. All userland verification of devices is contained within 45 * this file. If successful, the nvlist returned can be passed directly to the 46 * kernel; we've done as much verification as possible in userland. 47 * 48 * Hot spares are a special case, and passed down as an array of disk vdevs, at 49 * the same level as the root of the vdev tree. 50 * 51 * The only function exported by this file is 'make_root_vdev'. The 52 * function performs several passes: 53 * 54 * 1. Construct the vdev specification. 
Performs syntax validation and 55 * makes sure each device is valid. 56 * 2. Check for devices in use. Using libdiskmgt, makes sure that no 57 * devices are also in use. Some can be overridden using the 'force' 58 * flag, others cannot. 59 * 3. Check for replication errors if the 'force' flag is not specified. 60 * validates that the replication level is consistent across the 61 * entire pool. 62 * 4. Call libzfs to label any whole disks with an EFI label. 63 */ 64 65 #include <assert.h> 66 #include <devid.h> 67 #include <errno.h> 68 #include <fcntl.h> 69 #include <libdiskmgt.h> 70 #include <libintl.h> 71 #include <libnvpair.h> 72 #include <stdio.h> 73 #include <string.h> 74 #include <unistd.h> 75 #include <sys/efi_partition.h> 76 #include <sys/stat.h> 77 #include <sys/vtoc.h> 78 #include <sys/mntent.h> 79 80 #include "zpool_util.h" 81 82 #define DISK_ROOT "/dev/dsk" 83 #define RDISK_ROOT "/dev/rdsk" 84 #define BACKUP_SLICE "s2" 85 86 /* 87 * For any given vdev specification, we can have multiple errors. The 88 * vdev_error() function keeps track of whether we have seen an error yet, and 89 * prints out a header if its the first error we've seen. 90 */ 91 boolean_t error_seen; 92 boolean_t is_force; 93 94 /*PRINTFLIKE1*/ 95 static void 96 vdev_error(const char *fmt, ...) 97 { 98 va_list ap; 99 100 if (!error_seen) { 101 (void) fprintf(stderr, gettext("invalid vdev specification\n")); 102 if (!is_force) 103 (void) fprintf(stderr, gettext("use '-f' to override " 104 "the following errors:\n")); 105 else 106 (void) fprintf(stderr, gettext("the following errors " 107 "must be manually repaired:\n")); 108 error_seen = B_TRUE; 109 } 110 111 va_start(ap, fmt); 112 (void) vfprintf(stderr, fmt, ap); 113 va_end(ap); 114 } 115 116 static void 117 libdiskmgt_error(int error) 118 { 119 /* 120 * ENXIO/ENODEV is a valid error message if the device doesn't live in 121 * /dev/dsk. Don't bother printing an error message in this case. 
122 */ 123 if (error == ENXIO || error == ENODEV) 124 return; 125 126 (void) fprintf(stderr, gettext("warning: device in use checking " 127 "failed: %s\n"), strerror(error)); 128 } 129 130 /* 131 * Validate a device, passing the bulk of the work off to libdiskmgt. 132 */ 133 static int 134 check_slice(const char *path, int force, boolean_t wholedisk, boolean_t isspare) 135 { 136 char *msg; 137 int error = 0; 138 dm_who_type_t who; 139 140 if (force) 141 who = DM_WHO_ZPOOL_FORCE; 142 else if (isspare) 143 who = DM_WHO_ZPOOL_SPARE; 144 else 145 who = DM_WHO_ZPOOL; 146 147 if (dm_inuse((char *)path, &msg, who, &error) || error) { 148 if (error != 0) { 149 libdiskmgt_error(error); 150 return (0); 151 } else { 152 vdev_error("%s", msg); 153 free(msg); 154 return (-1); 155 } 156 } 157 158 /* 159 * If we're given a whole disk, ignore overlapping slices since we're 160 * about to label it anyway. 161 */ 162 error = 0; 163 if (!wholedisk && !force && 164 (dm_isoverlapping((char *)path, &msg, &error) || error)) { 165 if (error == 0) { 166 /* dm_isoverlapping returned -1 */ 167 vdev_error(gettext("%s overlaps with %s\n"), path, msg); 168 free(msg); 169 return (-1); 170 } else if (error != ENODEV) { 171 /* libdiskmgt's devcache only handles physical drives */ 172 libdiskmgt_error(error); 173 return (0); 174 } 175 } 176 177 return (0); 178 } 179 180 181 /* 182 * Validate a whole disk. Iterate over all slices on the disk and make sure 183 * that none is in use by calling check_slice(). 184 */ 185 static int 186 check_disk(const char *name, dm_descriptor_t disk, int force, int isspare) 187 { 188 dm_descriptor_t *drive, *media, *slice; 189 int err = 0; 190 int i; 191 int ret; 192 193 /* 194 * Get the drive associated with this disk. This should never fail, 195 * because we already have an alias handle open for the device. 
196 */ 197 if ((drive = dm_get_associated_descriptors(disk, DM_DRIVE, 198 &err)) == NULL || *drive == NULL) { 199 if (err) 200 libdiskmgt_error(err); 201 return (0); 202 } 203 204 if ((media = dm_get_associated_descriptors(*drive, DM_MEDIA, 205 &err)) == NULL) { 206 dm_free_descriptors(drive); 207 if (err) 208 libdiskmgt_error(err); 209 return (0); 210 } 211 212 dm_free_descriptors(drive); 213 214 /* 215 * It is possible that the user has specified a removable media drive, 216 * and the media is not present. 217 */ 218 if (*media == NULL) { 219 dm_free_descriptors(media); 220 vdev_error(gettext("'%s' has no media in drive\n"), name); 221 return (-1); 222 } 223 224 if ((slice = dm_get_associated_descriptors(*media, DM_SLICE, 225 &err)) == NULL) { 226 dm_free_descriptors(media); 227 if (err) 228 libdiskmgt_error(err); 229 return (0); 230 } 231 232 dm_free_descriptors(media); 233 234 ret = 0; 235 236 /* 237 * Iterate over all slices and report any errors. We don't care about 238 * overlapping slices because we are using the whole disk. 239 */ 240 for (i = 0; slice[i] != NULL; i++) { 241 char *name = dm_get_name(slice[i], &err); 242 243 if (check_slice(name, force, B_TRUE, isspare) != 0) 244 ret = -1; 245 246 dm_free_name(name); 247 } 248 249 dm_free_descriptors(slice); 250 return (ret); 251 } 252 253 /* 254 * Validate a device. 255 */ 256 static int 257 check_device(const char *path, boolean_t force, boolean_t isspare) 258 { 259 dm_descriptor_t desc; 260 int err; 261 char *dev; 262 263 /* 264 * For whole disks, libdiskmgt does not include the leading dev path. 265 */ 266 dev = strrchr(path, '/'); 267 assert(dev != NULL); 268 dev++; 269 if ((desc = dm_get_descriptor_by_name(DM_ALIAS, dev, &err)) != NULL) { 270 err = check_disk(path, desc, force, isspare); 271 dm_free_descriptor(desc); 272 return (err); 273 } 274 275 return (check_slice(path, force, B_FALSE, isspare)); 276 } 277 278 /* 279 * Check that a file is valid. 
All we can do in this case is check that it's 280 * not in use by another pool, and not in use by swap. 281 */ 282 static int 283 check_file(const char *file, boolean_t force, boolean_t isspare) 284 { 285 char *name; 286 int fd; 287 int ret = 0; 288 int err; 289 pool_state_t state; 290 boolean_t inuse; 291 292 if (dm_inuse_swap(file, &err)) { 293 if (err) 294 libdiskmgt_error(err); 295 else 296 vdev_error(gettext("%s is currently used by swap. " 297 "Please see swap(1M).\n"), file); 298 return (-1); 299 } 300 301 if ((fd = open(file, O_RDONLY)) < 0) 302 return (0); 303 304 if (zpool_in_use(g_zfs, fd, &state, &name, &inuse) == 0 && inuse) { 305 const char *desc; 306 307 switch (state) { 308 case POOL_STATE_ACTIVE: 309 desc = gettext("active"); 310 break; 311 312 case POOL_STATE_EXPORTED: 313 desc = gettext("exported"); 314 break; 315 316 case POOL_STATE_POTENTIALLY_ACTIVE: 317 desc = gettext("potentially active"); 318 break; 319 320 default: 321 desc = gettext("unknown"); 322 break; 323 } 324 325 /* 326 * Allow hot spares to be shared between pools. 327 */ 328 if (state == POOL_STATE_SPARE && isspare) 329 return (0); 330 331 if (state == POOL_STATE_ACTIVE || 332 state == POOL_STATE_SPARE || !force) { 333 switch (state) { 334 case POOL_STATE_SPARE: 335 vdev_error(gettext("%s is reserved as a hot " 336 "spare for pool %s\n"), file, name); 337 break; 338 default: 339 vdev_error(gettext("%s is part of %s pool " 340 "'%s'\n"), file, desc, name); 341 break; 342 } 343 ret = -1; 344 } 345 346 free(name); 347 } 348 349 (void) close(fd); 350 return (ret); 351 } 352 353 354 /* 355 * By "whole disk" we mean an entire physical disk (something we can 356 * label, toggle the write cache on, etc.) as opposed to the full 357 * capacity of a pseudo-device such as lofi or did. We act as if we 358 * are labeling the disk, which should be a pretty good test of whether 359 * it's a viable device or not. Returns B_TRUE if it is and B_FALSE if 360 * it isn't. 
361 */ 362 static boolean_t 363 is_whole_disk(const char *arg) 364 { 365 struct dk_gpt *label; 366 int fd; 367 char path[MAXPATHLEN]; 368 369 (void) snprintf(path, sizeof (path), "%s%s%s", 370 RDISK_ROOT, strrchr(arg, '/'), BACKUP_SLICE); 371 if ((fd = open(path, O_RDWR | O_NDELAY)) < 0) 372 return (B_FALSE); 373 if (efi_alloc_and_init(fd, EFI_NUMPAR, &label) != 0) { 374 (void) close(fd); 375 return (B_FALSE); 376 } 377 efi_free(label); 378 (void) close(fd); 379 return (B_TRUE); 380 } 381 382 /* 383 * Create a leaf vdev. Determine if this is a file or a device. If it's a 384 * device, fill in the device id to make a complete nvlist. Valid forms for a 385 * leaf vdev are: 386 * 387 * /dev/dsk/xxx Complete disk path 388 * /xxx Full path to file 389 * xxx Shorthand for /dev/dsk/xxx 390 */ 391 static nvlist_t * 392 make_leaf_vdev(const char *arg, uint64_t is_log) 393 { 394 char path[MAXPATHLEN]; 395 struct stat64 statbuf; 396 nvlist_t *vdev = NULL; 397 char *type = NULL; 398 boolean_t wholedisk = B_FALSE; 399 400 /* 401 * Determine what type of vdev this is, and put the full path into 402 * 'path'. We detect whether this is a device of file afterwards by 403 * checking the st_mode of the file. 404 */ 405 if (arg[0] == '/') { 406 /* 407 * Complete device or file path. Exact type is determined by 408 * examining the file descriptor afterwards. 409 */ 410 wholedisk = is_whole_disk(arg); 411 if (!wholedisk && (stat64(arg, &statbuf) != 0)) { 412 (void) fprintf(stderr, 413 gettext("cannot open '%s': %s\n"), 414 arg, strerror(errno)); 415 return (NULL); 416 } 417 418 (void) strlcpy(path, arg, sizeof (path)); 419 } else { 420 /* 421 * This may be a short path for a device, or it could be total 422 * gibberish. Check to see if it's a known device in 423 * /dev/dsk/. As part of this check, see if we've been given a 424 * an entire disk (minus the slice number). 
425 */ 426 (void) snprintf(path, sizeof (path), "%s/%s", DISK_ROOT, 427 arg); 428 wholedisk = is_whole_disk(path); 429 if (!wholedisk && (stat64(path, &statbuf) != 0)) { 430 /* 431 * If we got ENOENT, then the user gave us 432 * gibberish, so try to direct them with a 433 * reasonable error message. Otherwise, 434 * regurgitate strerror() since it's the best we 435 * can do. 436 */ 437 if (errno == ENOENT) { 438 (void) fprintf(stderr, 439 gettext("cannot open '%s': no such " 440 "device in %s\n"), arg, DISK_ROOT); 441 (void) fprintf(stderr, 442 gettext("must be a full path or " 443 "shorthand device name\n")); 444 return (NULL); 445 } else { 446 (void) fprintf(stderr, 447 gettext("cannot open '%s': %s\n"), 448 path, strerror(errno)); 449 return (NULL); 450 } 451 } 452 } 453 454 /* 455 * Determine whether this is a device or a file. 456 */ 457 if (wholedisk || S_ISBLK(statbuf.st_mode)) { 458 type = VDEV_TYPE_DISK; 459 } else if (S_ISREG(statbuf.st_mode)) { 460 type = VDEV_TYPE_FILE; 461 } else { 462 (void) fprintf(stderr, gettext("cannot use '%s': must be a " 463 "block device or regular file\n"), path); 464 return (NULL); 465 } 466 467 /* 468 * Finally, we have the complete device or file, and we know that it is 469 * acceptable to use. Construct the nvlist to describe this vdev. All 470 * vdevs have a 'path' element, and devices also have a 'devid' element. 471 */ 472 verify(nvlist_alloc(&vdev, NV_UNIQUE_NAME, 0) == 0); 473 verify(nvlist_add_string(vdev, ZPOOL_CONFIG_PATH, path) == 0); 474 verify(nvlist_add_string(vdev, ZPOOL_CONFIG_TYPE, type) == 0); 475 verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_IS_LOG, is_log) == 0); 476 if (strcmp(type, VDEV_TYPE_DISK) == 0) 477 verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK, 478 (uint64_t)wholedisk) == 0); 479 480 /* 481 * For a whole disk, defer getting its devid until after labeling it. 482 */ 483 if (S_ISBLK(statbuf.st_mode) && !wholedisk) { 484 /* 485 * Get the devid for the device. 
486 */ 487 int fd; 488 ddi_devid_t devid; 489 char *minor = NULL, *devid_str = NULL; 490 491 if ((fd = open(path, O_RDONLY)) < 0) { 492 (void) fprintf(stderr, gettext("cannot open '%s': " 493 "%s\n"), path, strerror(errno)); 494 nvlist_free(vdev); 495 return (NULL); 496 } 497 498 if (devid_get(fd, &devid) == 0) { 499 if (devid_get_minor_name(fd, &minor) == 0 && 500 (devid_str = devid_str_encode(devid, minor)) != 501 NULL) { 502 verify(nvlist_add_string(vdev, 503 ZPOOL_CONFIG_DEVID, devid_str) == 0); 504 } 505 if (devid_str != NULL) 506 devid_str_free(devid_str); 507 if (minor != NULL) 508 devid_str_free(minor); 509 devid_free(devid); 510 } 511 512 (void) close(fd); 513 } 514 515 return (vdev); 516 } 517 518 /* 519 * Go through and verify the replication level of the pool is consistent. 520 * Performs the following checks: 521 * 522 * For the new spec, verifies that devices in mirrors and raidz are the 523 * same size. 524 * 525 * If the current configuration already has inconsistent replication 526 * levels, ignore any other potential problems in the new spec. 527 * 528 * Otherwise, make sure that the current spec (if there is one) and the new 529 * spec have consistent replication levels. 530 */ 531 typedef struct replication_level { 532 char *zprl_type; 533 uint64_t zprl_children; 534 uint64_t zprl_parity; 535 } replication_level_t; 536 537 #define ZPOOL_FUZZ (16 * 1024 * 1024) 538 539 /* 540 * Given a list of toplevel vdevs, return the current replication level. If 541 * the config is inconsistent, then NULL is returned. If 'fatal' is set, then 542 * an error message will be displayed for each self-inconsistent vdev. 
543 */ 544 static replication_level_t * 545 get_replication(nvlist_t *nvroot, boolean_t fatal) 546 { 547 nvlist_t **top; 548 uint_t t, toplevels; 549 nvlist_t **child; 550 uint_t c, children; 551 nvlist_t *nv; 552 char *type; 553 replication_level_t lastrep, rep, *ret; 554 boolean_t dontreport; 555 556 ret = safe_malloc(sizeof (replication_level_t)); 557 558 verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 559 &top, &toplevels) == 0); 560 561 lastrep.zprl_type = NULL; 562 for (t = 0; t < toplevels; t++) { 563 uint64_t is_log = B_FALSE; 564 565 nv = top[t]; 566 567 /* 568 * For separate logs we ignore the top level vdev replication 569 * constraints. 570 */ 571 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &is_log); 572 if (is_log) 573 continue; 574 575 verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, 576 &type) == 0); 577 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 578 &child, &children) != 0) { 579 /* 580 * This is a 'file' or 'disk' vdev. 581 */ 582 rep.zprl_type = type; 583 rep.zprl_children = 1; 584 rep.zprl_parity = 0; 585 } else { 586 uint64_t vdev_size; 587 588 /* 589 * This is a mirror or RAID-Z vdev. Go through and make 590 * sure the contents are all the same (files vs. disks), 591 * keeping track of the number of elements in the 592 * process. 593 * 594 * We also check that the size of each vdev (if it can 595 * be determined) is the same. 596 */ 597 rep.zprl_type = type; 598 rep.zprl_children = 0; 599 600 if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) { 601 verify(nvlist_lookup_uint64(nv, 602 ZPOOL_CONFIG_NPARITY, 603 &rep.zprl_parity) == 0); 604 assert(rep.zprl_parity != 0); 605 } else { 606 rep.zprl_parity = 0; 607 } 608 609 /* 610 * The 'dontreport' variable indicates that we've 611 * already reported an error for this spec, so don't 612 * bother doing it again. 
613 */ 614 type = NULL; 615 dontreport = 0; 616 vdev_size = -1ULL; 617 for (c = 0; c < children; c++) { 618 nvlist_t *cnv = child[c]; 619 char *path; 620 struct stat64 statbuf; 621 uint64_t size = -1ULL; 622 char *childtype; 623 int fd, err; 624 625 rep.zprl_children++; 626 627 verify(nvlist_lookup_string(cnv, 628 ZPOOL_CONFIG_TYPE, &childtype) == 0); 629 630 /* 631 * If this is a replacing or spare vdev, then 632 * get the real first child of the vdev. 633 */ 634 if (strcmp(childtype, 635 VDEV_TYPE_REPLACING) == 0 || 636 strcmp(childtype, VDEV_TYPE_SPARE) == 0) { 637 nvlist_t **rchild; 638 uint_t rchildren; 639 640 verify(nvlist_lookup_nvlist_array(cnv, 641 ZPOOL_CONFIG_CHILDREN, &rchild, 642 &rchildren) == 0); 643 assert(rchildren == 2); 644 cnv = rchild[0]; 645 646 verify(nvlist_lookup_string(cnv, 647 ZPOOL_CONFIG_TYPE, 648 &childtype) == 0); 649 } 650 651 verify(nvlist_lookup_string(cnv, 652 ZPOOL_CONFIG_PATH, &path) == 0); 653 654 /* 655 * If we have a raidz/mirror that combines disks 656 * with files, report it as an error. 657 */ 658 if (!dontreport && type != NULL && 659 strcmp(type, childtype) != 0) { 660 if (ret != NULL) 661 free(ret); 662 ret = NULL; 663 if (fatal) 664 vdev_error(gettext( 665 "mismatched replication " 666 "level: %s contains both " 667 "files and devices\n"), 668 rep.zprl_type); 669 else 670 return (NULL); 671 dontreport = B_TRUE; 672 } 673 674 /* 675 * According to stat(2), the value of 'st_size' 676 * is undefined for block devices and character 677 * devices. But there is no effective way to 678 * determine the real size in userland. 679 * 680 * Instead, we'll take advantage of an 681 * implementation detail of spec_size(). If the 682 * device is currently open, then we (should) 683 * return a valid size. 684 * 685 * If we still don't get a valid size (indicated 686 * by a size of 0 or MAXOFFSET_T), then ignore 687 * this device altogether. 
688 */ 689 if ((fd = open(path, O_RDONLY)) >= 0) { 690 err = fstat64(fd, &statbuf); 691 (void) close(fd); 692 } else { 693 err = stat64(path, &statbuf); 694 } 695 696 if (err != 0 || 697 statbuf.st_size == 0 || 698 statbuf.st_size == MAXOFFSET_T) 699 continue; 700 701 size = statbuf.st_size; 702 703 /* 704 * Also make sure that devices and 705 * slices have a consistent size. If 706 * they differ by a significant amount 707 * (~16MB) then report an error. 708 */ 709 if (!dontreport && 710 (vdev_size != -1ULL && 711 (labs(size - vdev_size) > 712 ZPOOL_FUZZ))) { 713 if (ret != NULL) 714 free(ret); 715 ret = NULL; 716 if (fatal) 717 vdev_error(gettext( 718 "%s contains devices of " 719 "different sizes\n"), 720 rep.zprl_type); 721 else 722 return (NULL); 723 dontreport = B_TRUE; 724 } 725 726 type = childtype; 727 vdev_size = size; 728 } 729 } 730 731 /* 732 * At this point, we have the replication of the last toplevel 733 * vdev in 'rep'. Compare it to 'lastrep' to see if its 734 * different. 
735 */ 736 if (lastrep.zprl_type != NULL) { 737 if (strcmp(lastrep.zprl_type, rep.zprl_type) != 0) { 738 if (ret != NULL) 739 free(ret); 740 ret = NULL; 741 if (fatal) 742 vdev_error(gettext( 743 "mismatched replication level: " 744 "both %s and %s vdevs are " 745 "present\n"), 746 lastrep.zprl_type, rep.zprl_type); 747 else 748 return (NULL); 749 } else if (lastrep.zprl_parity != rep.zprl_parity) { 750 if (ret) 751 free(ret); 752 ret = NULL; 753 if (fatal) 754 vdev_error(gettext( 755 "mismatched replication level: " 756 "both %llu and %llu device parity " 757 "%s vdevs are present\n"), 758 lastrep.zprl_parity, 759 rep.zprl_parity, 760 rep.zprl_type); 761 else 762 return (NULL); 763 } else if (lastrep.zprl_children != rep.zprl_children) { 764 if (ret) 765 free(ret); 766 ret = NULL; 767 if (fatal) 768 vdev_error(gettext( 769 "mismatched replication level: " 770 "both %llu-way and %llu-way %s " 771 "vdevs are present\n"), 772 lastrep.zprl_children, 773 rep.zprl_children, 774 rep.zprl_type); 775 else 776 return (NULL); 777 } 778 } 779 lastrep = rep; 780 } 781 782 if (ret != NULL) 783 *ret = rep; 784 785 return (ret); 786 } 787 788 /* 789 * Check the replication level of the vdev spec against the current pool. Calls 790 * get_replication() to make sure the new spec is self-consistent. If the pool 791 * has a consistent replication level, then we ignore any errors. Otherwise, 792 * report any difference between the two. 793 */ 794 static int 795 check_replication(nvlist_t *config, nvlist_t *newroot) 796 { 797 nvlist_t **child; 798 uint_t children; 799 replication_level_t *current = NULL, *new; 800 int ret; 801 802 /* 803 * If we have a current pool configuration, check to see if it's 804 * self-consistent. If not, simply return success. 
805 */ 806 if (config != NULL) { 807 nvlist_t *nvroot; 808 809 verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 810 &nvroot) == 0); 811 if ((current = get_replication(nvroot, B_FALSE)) == NULL) 812 return (0); 813 } 814 /* 815 * for spares there may be no children, and therefore no 816 * replication level to check 817 */ 818 if ((nvlist_lookup_nvlist_array(newroot, ZPOOL_CONFIG_CHILDREN, 819 &child, &children) != 0) || (children == 0)) { 820 free(current); 821 return (0); 822 } 823 824 /* 825 * If all we have is logs then there's no replication level to check. 826 */ 827 if (num_logs(newroot) == children) { 828 free(current); 829 return (0); 830 } 831 832 /* 833 * Get the replication level of the new vdev spec, reporting any 834 * inconsistencies found. 835 */ 836 if ((new = get_replication(newroot, B_TRUE)) == NULL) { 837 free(current); 838 return (-1); 839 } 840 841 /* 842 * Check to see if the new vdev spec matches the replication level of 843 * the current pool. 844 */ 845 ret = 0; 846 if (current != NULL) { 847 if (strcmp(current->zprl_type, new->zprl_type) != 0) { 848 vdev_error(gettext( 849 "mismatched replication level: pool uses %s " 850 "and new vdev is %s\n"), 851 current->zprl_type, new->zprl_type); 852 ret = -1; 853 } else if (current->zprl_parity != new->zprl_parity) { 854 vdev_error(gettext( 855 "mismatched replication level: pool uses %llu " 856 "device parity and new vdev uses %llu\n"), 857 current->zprl_parity, new->zprl_parity); 858 ret = -1; 859 } else if (current->zprl_children != new->zprl_children) { 860 vdev_error(gettext( 861 "mismatched replication level: pool uses %llu-way " 862 "%s and new vdev uses %llu-way %s\n"), 863 current->zprl_children, current->zprl_type, 864 new->zprl_children, new->zprl_type); 865 ret = -1; 866 } 867 } 868 869 free(new); 870 if (current != NULL) 871 free(current); 872 873 return (ret); 874 } 875 876 /* 877 * Go through and find any whole disks in the vdev specification, labelling them 878 * as 
appropriate.  When constructing the vdev spec, we were unable to open this
 * device in order to provide a devid.  Now that we have labelled the disk and
 * know that slice 0 is valid, we can construct the devid now.
 *
 * If the disk was already labeled with an EFI label, we will have gotten the
 * devid already (because we were able to open the whole disk).  Otherwise, we
 * need to get the devid after we label the disk.
 *
 * Interior vdevs recurse into their children and then into any spares and
 * l2cache devices attached to the same nvlist.  Returns 0 on success, or the
 * first non-zero result (-1 if labelling or opening slice 0 failed).
 */
static int
make_disks(zpool_handle_t *zhp, nvlist_t *nv)
{
	const char *auxlists[] = { ZPOOL_CONFIG_SPARES, ZPOOL_CONFIG_L2CACHE };
	nvlist_t **kids;
	uint_t nkids, i, a;
	char *vtype, *devpath, *shortname;
	char slice0[MAXPATHLEN];
	uint64_t whole;
	int devfd;
	int rv;
	ddi_devid_t id;
	char *minor_name = NULL, *id_str = NULL;

	verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &vtype) == 0);

	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
	    &kids, &nkids) == 0) {
		/*
		 * Interior vdev: handle the regular children first, then the
		 * spare and l2cache arrays, in that order.
		 */
		for (i = 0; i < nkids; i++) {
			rv = make_disks(zhp, kids[i]);
			if (rv != 0)
				return (rv);
		}

		for (a = 0; a < sizeof (auxlists) / sizeof (auxlists[0]);
		    a++) {
			if (nvlist_lookup_nvlist_array(nv, auxlists[a],
			    &kids, &nkids) != 0)
				continue;

			for (i = 0; i < nkids; i++) {
				rv = make_disks(zhp, kids[i]);
				if (rv != 0)
					return (rv);
			}
		}

		return (0);
	}

	/*
	 * Leaf vdev.  Only disks that were recorded as whole disks need any
	 * work here; everything else is left untouched.
	 */
	if (strcmp(vtype, VDEV_TYPE_DISK) != 0)
		return (0);

	verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &devpath) == 0);
	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, &whole) != 0 ||
	    !whole)
		return (0);

	/* libzfs is handed the bare disk name, without its directory. */
	shortname = strrchr(devpath, '/');
	assert(shortname != NULL);
	shortname++;
	if (zpool_label_disk(g_zfs, zhp, shortname) == -1)
		return (-1);

	/*
	 * The disk has been labelled, so slice 0 can now be opened to obtain
	 * the devid.
	 */
	(void) snprintf(slice0, sizeof (slice0), "%ss0", devpath);
	devfd = open(slice0, O_RDONLY);
	if (devfd < 0) {
		(void) fprintf(stderr,
		    gettext("cannot open '%s': %s\n"),
		    slice0, strerror(errno));
		return (-1);
	}

	if (devid_get(devfd, &id) == 0) {
		if (devid_get_minor_name(devfd, &minor_name) == 0) {
			id_str = devid_str_encode(id, minor_name);
			if (id_str != NULL) {
				verify(nvlist_add_string(nv,
				    ZPOOL_CONFIG_DEVID, id_str) == 0);
			}
		}
		if (id_str != NULL)
			devid_str_free(id_str);
		if (minor_name != NULL)
			devid_str_free(minor_name);
		devid_free(id);
	}

	/*
	 * Update the path to refer to the 's0' slice.  The presence of
	 * the 'whole_disk' field indicates to the CLI that we should
	 * chop off the slice number when displaying the device in
	 * future output.
	 */
	verify(nvlist_add_string(nv, ZPOOL_CONFIG_PATH, slice0) == 0);

	(void) close(devfd);

	return (0);
}

/*
 * Determine if the given path is a hot spare within the given configuration.
982 */ 983 static boolean_t 984 is_spare(nvlist_t *config, const char *path) 985 { 986 int fd; 987 pool_state_t state; 988 char *name = NULL; 989 nvlist_t *label; 990 uint64_t guid, spareguid; 991 nvlist_t *nvroot; 992 nvlist_t **spares; 993 uint_t i, nspares; 994 boolean_t inuse; 995 996 if ((fd = open(path, O_RDONLY)) < 0) 997 return (B_FALSE); 998 999 if (zpool_in_use(g_zfs, fd, &state, &name, &inuse) != 0 || 1000 !inuse || 1001 state != POOL_STATE_SPARE || 1002 zpool_read_label(fd, &label) != 0) { 1003 free(name); 1004 (void) close(fd); 1005 return (B_FALSE); 1006 } 1007 free(name); 1008 1009 (void) close(fd); 1010 verify(nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) == 0); 1011 nvlist_free(label); 1012 1013 verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 1014 &nvroot) == 0); 1015 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 1016 &spares, &nspares) == 0) { 1017 for (i = 0; i < nspares; i++) { 1018 verify(nvlist_lookup_uint64(spares[i], 1019 ZPOOL_CONFIG_GUID, &spareguid) == 0); 1020 if (spareguid == guid) 1021 return (B_TRUE); 1022 } 1023 } 1024 1025 return (B_FALSE); 1026 } 1027 1028 /* 1029 * Go through and find any devices that are in use. We rely on libdiskmgt for 1030 * the majority of this task. 1031 */ 1032 static int 1033 check_in_use(nvlist_t *config, nvlist_t *nv, int force, int isreplacing, 1034 int isspare) 1035 { 1036 nvlist_t **child; 1037 uint_t c, children; 1038 char *type, *path; 1039 int ret; 1040 char buf[MAXPATHLEN]; 1041 uint64_t wholedisk; 1042 1043 verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0); 1044 1045 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 1046 &child, &children) != 0) { 1047 1048 verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0); 1049 1050 /* 1051 * As a generic check, we look to see if this is a replace of a 1052 * hot spare within the same pool. If so, we allow it 1053 * regardless of what libdiskmgt or zpool_in_use() says. 
1054 */ 1055 if (isreplacing) { 1056 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, 1057 &wholedisk) == 0 && wholedisk) 1058 (void) snprintf(buf, sizeof (buf), "%ss0", 1059 path); 1060 else 1061 (void) strlcpy(buf, path, sizeof (buf)); 1062 if (is_spare(config, buf)) 1063 return (0); 1064 } 1065 1066 if (strcmp(type, VDEV_TYPE_DISK) == 0) 1067 ret = check_device(path, force, isspare); 1068 1069 if (strcmp(type, VDEV_TYPE_FILE) == 0) 1070 ret = check_file(path, force, isspare); 1071 1072 return (ret); 1073 } 1074 1075 for (c = 0; c < children; c++) 1076 if ((ret = check_in_use(config, child[c], force, 1077 isreplacing, B_FALSE)) != 0) 1078 return (ret); 1079 1080 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, 1081 &child, &children) == 0) 1082 for (c = 0; c < children; c++) 1083 if ((ret = check_in_use(config, child[c], force, 1084 isreplacing, B_TRUE)) != 0) 1085 return (ret); 1086 1087 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, 1088 &child, &children) == 0) 1089 for (c = 0; c < children; c++) 1090 if ((ret = check_in_use(config, child[c], force, 1091 isreplacing, B_FALSE)) != 0) 1092 return (ret); 1093 1094 return (0); 1095 } 1096 1097 static const char * 1098 is_grouping(const char *type, int *mindev) 1099 { 1100 if (strcmp(type, "raidz") == 0 || strcmp(type, "raidz1") == 0) { 1101 if (mindev != NULL) 1102 *mindev = 2; 1103 return (VDEV_TYPE_RAIDZ); 1104 } 1105 1106 if (strcmp(type, "raidz2") == 0) { 1107 if (mindev != NULL) 1108 *mindev = 3; 1109 return (VDEV_TYPE_RAIDZ); 1110 } 1111 1112 if (strcmp(type, "mirror") == 0) { 1113 if (mindev != NULL) 1114 *mindev = 2; 1115 return (VDEV_TYPE_MIRROR); 1116 } 1117 1118 if (strcmp(type, "spare") == 0) { 1119 if (mindev != NULL) 1120 *mindev = 1; 1121 return (VDEV_TYPE_SPARE); 1122 } 1123 1124 if (strcmp(type, "log") == 0) { 1125 if (mindev != NULL) 1126 *mindev = 1; 1127 return (VDEV_TYPE_LOG); 1128 } 1129 1130 if (strcmp(type, "cache") == 0) { 1131 if (mindev != NULL) 1132 *mindev = 1; 
1133 return (VDEV_TYPE_L2CACHE); 1134 } 1135 1136 return (NULL); 1137 } 1138 1139 /* 1140 * Construct a syntactically valid vdev specification, 1141 * and ensure that all devices and files exist and can be opened. 1142 * Note: we don't bother freeing anything in the error paths 1143 * because the program is just going to exit anyway. 1144 */ 1145 nvlist_t * 1146 construct_spec(int argc, char **argv) 1147 { 1148 nvlist_t *nvroot, *nv, **top, **spares, **l2cache; 1149 int t, toplevels, mindev, nspares, nlogs, nl2cache; 1150 const char *type; 1151 uint64_t is_log; 1152 boolean_t seen_logs; 1153 1154 top = NULL; 1155 toplevels = 0; 1156 spares = NULL; 1157 l2cache = NULL; 1158 nspares = 0; 1159 nlogs = 0; 1160 nl2cache = 0; 1161 is_log = B_FALSE; 1162 seen_logs = B_FALSE; 1163 1164 while (argc > 0) { 1165 nv = NULL; 1166 1167 /* 1168 * If it's a mirror or raidz, the subsequent arguments are 1169 * its leaves -- until we encounter the next mirror or raidz. 1170 */ 1171 if ((type = is_grouping(argv[0], &mindev)) != NULL) { 1172 nvlist_t **child = NULL; 1173 int c, children = 0; 1174 1175 if (strcmp(type, VDEV_TYPE_SPARE) == 0) { 1176 if (spares != NULL) { 1177 (void) fprintf(stderr, 1178 gettext("invalid vdev " 1179 "specification: 'spare' can be " 1180 "specified only once\n")); 1181 return (NULL); 1182 } 1183 is_log = B_FALSE; 1184 } 1185 1186 if (strcmp(type, VDEV_TYPE_LOG) == 0) { 1187 if (seen_logs) { 1188 (void) fprintf(stderr, 1189 gettext("invalid vdev " 1190 "specification: 'log' can be " 1191 "specified only once\n")); 1192 return (NULL); 1193 } 1194 seen_logs = B_TRUE; 1195 is_log = B_TRUE; 1196 argc--; 1197 argv++; 1198 /* 1199 * A log is not a real grouping device. 1200 * We just set is_log and continue. 
1201 */ 1202 continue; 1203 } 1204 1205 if (strcmp(type, VDEV_TYPE_L2CACHE) == 0) { 1206 if (l2cache != NULL) { 1207 (void) fprintf(stderr, 1208 gettext("invalid vdev " 1209 "specification: 'cache' can be " 1210 "specified only once\n")); 1211 return (NULL); 1212 } 1213 is_log = B_FALSE; 1214 } 1215 1216 if (is_log) { 1217 if (strcmp(type, VDEV_TYPE_MIRROR) != 0) { 1218 (void) fprintf(stderr, 1219 gettext("invalid vdev " 1220 "specification: unsupported 'log' " 1221 "device: %s\n"), type); 1222 return (NULL); 1223 } 1224 nlogs++; 1225 } 1226 1227 for (c = 1; c < argc; c++) { 1228 if (is_grouping(argv[c], NULL) != NULL) 1229 break; 1230 children++; 1231 child = realloc(child, 1232 children * sizeof (nvlist_t *)); 1233 if (child == NULL) 1234 zpool_no_memory(); 1235 if ((nv = make_leaf_vdev(argv[c], B_FALSE)) 1236 == NULL) 1237 return (NULL); 1238 child[children - 1] = nv; 1239 } 1240 1241 if (children < mindev) { 1242 (void) fprintf(stderr, gettext("invalid vdev " 1243 "specification: %s requires at least %d " 1244 "devices\n"), argv[0], mindev); 1245 return (NULL); 1246 } 1247 1248 argc -= c; 1249 argv += c; 1250 1251 if (strcmp(type, VDEV_TYPE_SPARE) == 0) { 1252 spares = child; 1253 nspares = children; 1254 continue; 1255 } else if (strcmp(type, VDEV_TYPE_L2CACHE) == 0) { 1256 l2cache = child; 1257 nl2cache = children; 1258 continue; 1259 } else { 1260 verify(nvlist_alloc(&nv, NV_UNIQUE_NAME, 1261 0) == 0); 1262 verify(nvlist_add_string(nv, ZPOOL_CONFIG_TYPE, 1263 type) == 0); 1264 verify(nvlist_add_uint64(nv, 1265 ZPOOL_CONFIG_IS_LOG, is_log) == 0); 1266 if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) { 1267 verify(nvlist_add_uint64(nv, 1268 ZPOOL_CONFIG_NPARITY, 1269 mindev - 1) == 0); 1270 } 1271 verify(nvlist_add_nvlist_array(nv, 1272 ZPOOL_CONFIG_CHILDREN, child, 1273 children) == 0); 1274 1275 for (c = 0; c < children; c++) 1276 nvlist_free(child[c]); 1277 free(child); 1278 } 1279 } else { 1280 /* 1281 * We have a device. 
Pass off to make_leaf_vdev() to 1282 * construct the appropriate nvlist describing the vdev. 1283 */ 1284 if ((nv = make_leaf_vdev(argv[0], is_log)) == NULL) 1285 return (NULL); 1286 if (is_log) 1287 nlogs++; 1288 argc--; 1289 argv++; 1290 } 1291 1292 toplevels++; 1293 top = realloc(top, toplevels * sizeof (nvlist_t *)); 1294 if (top == NULL) 1295 zpool_no_memory(); 1296 top[toplevels - 1] = nv; 1297 } 1298 1299 if (toplevels == 0 && nspares == 0 && nl2cache == 0) { 1300 (void) fprintf(stderr, gettext("invalid vdev " 1301 "specification: at least one toplevel vdev must be " 1302 "specified\n")); 1303 return (NULL); 1304 } 1305 1306 if (seen_logs && nlogs == 0) { 1307 (void) fprintf(stderr, gettext("invalid vdev specification: " 1308 "log requires at least 1 device\n")); 1309 return (NULL); 1310 } 1311 1312 /* 1313 * Finally, create nvroot and add all top-level vdevs to it. 1314 */ 1315 verify(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) == 0); 1316 verify(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, 1317 VDEV_TYPE_ROOT) == 0); 1318 verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 1319 top, toplevels) == 0); 1320 if (nspares != 0) 1321 verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 1322 spares, nspares) == 0); 1323 if (nl2cache != 0) 1324 verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 1325 l2cache, nl2cache) == 0); 1326 1327 for (t = 0; t < toplevels; t++) 1328 nvlist_free(top[t]); 1329 for (t = 0; t < nspares; t++) 1330 nvlist_free(spares[t]); 1331 for (t = 0; t < nl2cache; t++) 1332 nvlist_free(l2cache[t]); 1333 if (spares) 1334 free(spares); 1335 if (l2cache) 1336 free(l2cache); 1337 free(top); 1338 1339 return (nvroot); 1340 } 1341 1342 1343 /* 1344 * Get and validate the contents of the given vdev specification. This ensures 1345 * that the nvlist returned is well-formed, that all the devices exist, and that 1346 * they are not currently in use by any other known consumer. 
The 'poolconfig' 1347 * parameter is the current configuration of the pool when adding devices 1348 * existing pool, and is used to perform additional checks, such as changing the 1349 * replication level of the pool. It can be 'NULL' to indicate that this is a 1350 * new pool. The 'force' flag controls whether devices should be forcefully 1351 * added, even if they appear in use. 1352 */ 1353 nvlist_t * 1354 make_root_vdev(zpool_handle_t *zhp, int force, int check_rep, 1355 boolean_t isreplacing, int argc, char **argv) 1356 { 1357 nvlist_t *newroot; 1358 nvlist_t *poolconfig = NULL; 1359 is_force = force; 1360 1361 /* 1362 * Construct the vdev specification. If this is successful, we know 1363 * that we have a valid specification, and that all devices can be 1364 * opened. 1365 */ 1366 if ((newroot = construct_spec(argc, argv)) == NULL) 1367 return (NULL); 1368 1369 if (zhp && ((poolconfig = zpool_get_config(zhp, NULL)) == NULL)) 1370 return (NULL); 1371 1372 /* 1373 * Validate each device to make sure that its not shared with another 1374 * subsystem. We do this even if 'force' is set, because there are some 1375 * uses (such as a dedicated dump device) that even '-f' cannot 1376 * override. 1377 */ 1378 if (check_in_use(poolconfig, newroot, force, isreplacing, 1379 B_FALSE) != 0) { 1380 nvlist_free(newroot); 1381 return (NULL); 1382 } 1383 1384 /* 1385 * Check the replication level of the given vdevs and report any errors 1386 * found. We include the existing pool spec, if any, as we need to 1387 * catch changes against the existing replication level. 1388 */ 1389 if (check_rep && check_replication(poolconfig, newroot) != 0) { 1390 nvlist_free(newroot); 1391 return (NULL); 1392 } 1393 1394 /* 1395 * Run through the vdev specification and label any whole disks found. 1396 */ 1397 if (make_disks(zhp, newroot) != 0) { 1398 nvlist_free(newroot); 1399 return (NULL); 1400 } 1401 1402 return (newroot); 1403 } 1404