1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 /* 28 * Functions to convert between a list of vdevs and an nvlist representing the 29 * configuration. Each entry in the list can be one of: 30 * 31 * Device vdevs 32 * disk=(path=..., devid=...) 33 * file=(path=...) 34 * 35 * Group vdevs 36 * raidz[1|2]=(...) 37 * mirror=(...) 38 * 39 * Hot spares 40 * 41 * While the underlying implementation supports it, group vdevs cannot contain 42 * other group vdevs. All userland verification of devices is contained within 43 * this file. If successful, the nvlist returned can be passed directly to the 44 * kernel; we've done as much verification as possible in userland. 45 * 46 * Hot spares are a special case, and passed down as an array of disk vdevs, at 47 * the same level as the root of the vdev tree. 48 * 49 * The only function exported by this file is 'make_root_vdev'. The 50 * function performs several passes: 51 * 52 * 1. Construct the vdev specification. Performs syntax validation and 53 * makes sure each device is valid. 54 * 2. Check for devices in use. Using libdiskmgt, makes sure that no 55 * devices are also in use. Some can be overridden using the 'force' 56 * flag, others cannot. 57 * 3. Check for replication errors if the 'force' flag is not specified. 58 * validates that the replication level is consistent across the 59 * entire pool. 60 * 4. Call libzfs to label any whole disks with an EFI label. 61 */ 62 63 #include <assert.h> 64 #include <devid.h> 65 #include <errno.h> 66 #include <fcntl.h> 67 #include <libdiskmgt.h> 68 #include <libintl.h> 69 #include <libnvpair.h> 70 #include <limits.h> 71 #include <stdio.h> 72 #include <string.h> 73 #include <unistd.h> 74 #include <sys/efi_partition.h> 75 #include <sys/stat.h> 76 #include <sys/vtoc.h> 77 #include <sys/mntent.h> 78 79 #include "zpool_util.h" 80 81 #define DISK_ROOT "/dev/dsk" 82 #define RDISK_ROOT "/dev/rdsk" 83 #define BACKUP_SLICE "s2" 84 85 /* 86 * For any given vdev specification, we can have multiple errors. The 87 * vdev_error() function keeps track of whether we have seen an error yet, and 88 * prints out a header if its the first error we've seen. 89 */ 90 boolean_t error_seen; 91 boolean_t is_force; 92 93 /*PRINTFLIKE1*/ 94 static void 95 vdev_error(const char *fmt, ...) 96 { 97 va_list ap; 98 99 if (!error_seen) { 100 (void) fprintf(stderr, gettext("invalid vdev specification\n")); 101 if (!is_force) 102 (void) fprintf(stderr, gettext("use '-f' to override " 103 "the following errors:\n")); 104 else 105 (void) fprintf(stderr, gettext("the following errors " 106 "must be manually repaired:\n")); 107 error_seen = B_TRUE; 108 } 109 110 va_start(ap, fmt); 111 (void) vfprintf(stderr, fmt, ap); 112 va_end(ap); 113 } 114 115 static void 116 libdiskmgt_error(int error) 117 { 118 /* 119 * ENXIO/ENODEV is a valid error message if the device doesn't live in 120 * /dev/dsk. Don't bother printing an error message in this case. 121 */ 122 if (error == ENXIO || error == ENODEV) 123 return; 124 125 (void) fprintf(stderr, gettext("warning: device in use checking " 126 "failed: %s\n"), strerror(error)); 127 } 128 129 /* 130 * Validate a device, passing the bulk of the work off to libdiskmgt. 131 */ 132 static int 133 check_slice(const char *path, int force, boolean_t wholedisk, boolean_t isspare) 134 { 135 char *msg; 136 int error = 0; 137 dm_who_type_t who; 138 139 if (force) 140 who = DM_WHO_ZPOOL_FORCE; 141 else if (isspare) 142 who = DM_WHO_ZPOOL_SPARE; 143 else 144 who = DM_WHO_ZPOOL; 145 146 if (dm_inuse((char *)path, &msg, who, &error) || error) { 147 if (error != 0) { 148 libdiskmgt_error(error); 149 return (0); 150 } else { 151 vdev_error("%s", msg); 152 free(msg); 153 return (-1); 154 } 155 } 156 157 /* 158 * If we're given a whole disk, ignore overlapping slices since we're 159 * about to label it anyway. 160 */ 161 error = 0; 162 if (!wholedisk && !force && 163 (dm_isoverlapping((char *)path, &msg, &error) || error)) { 164 if (error == 0) { 165 /* dm_isoverlapping returned -1 */ 166 vdev_error(gettext("%s overlaps with %s\n"), path, msg); 167 free(msg); 168 return (-1); 169 } else if (error != ENODEV) { 170 /* libdiskmgt's devcache only handles physical drives */ 171 libdiskmgt_error(error); 172 return (0); 173 } 174 } 175 176 return (0); 177 } 178 179 180 /* 181 * Validate a whole disk. Iterate over all slices on the disk and make sure 182 * that none is in use by calling check_slice(). 183 */ 184 static int 185 check_disk(const char *name, dm_descriptor_t disk, int force, int isspare) 186 { 187 dm_descriptor_t *drive, *media, *slice; 188 int err = 0; 189 int i; 190 int ret; 191 192 /* 193 * Get the drive associated with this disk. This should never fail, 194 * because we already have an alias handle open for the device. 195 */ 196 if ((drive = dm_get_associated_descriptors(disk, DM_DRIVE, 197 &err)) == NULL || *drive == NULL) { 198 if (err) 199 libdiskmgt_error(err); 200 return (0); 201 } 202 203 if ((media = dm_get_associated_descriptors(*drive, DM_MEDIA, 204 &err)) == NULL) { 205 dm_free_descriptors(drive); 206 if (err) 207 libdiskmgt_error(err); 208 return (0); 209 } 210 211 dm_free_descriptors(drive); 212 213 /* 214 * It is possible that the user has specified a removable media drive, 215 * and the media is not present. 216 */ 217 if (*media == NULL) { 218 dm_free_descriptors(media); 219 vdev_error(gettext("'%s' has no media in drive\n"), name); 220 return (-1); 221 } 222 223 if ((slice = dm_get_associated_descriptors(*media, DM_SLICE, 224 &err)) == NULL) { 225 dm_free_descriptors(media); 226 if (err) 227 libdiskmgt_error(err); 228 return (0); 229 } 230 231 dm_free_descriptors(media); 232 233 ret = 0; 234 235 /* 236 * Iterate over all slices and report any errors. We don't care about 237 * overlapping slices because we are using the whole disk. 238 */ 239 for (i = 0; slice[i] != NULL; i++) { 240 char *name = dm_get_name(slice[i], &err); 241 242 if (check_slice(name, force, B_TRUE, isspare) != 0) 243 ret = -1; 244 245 dm_free_name(name); 246 } 247 248 dm_free_descriptors(slice); 249 return (ret); 250 } 251 252 /* 253 * Validate a device. 254 */ 255 static int 256 check_device(const char *path, boolean_t force, boolean_t isspare) 257 { 258 dm_descriptor_t desc; 259 int err; 260 char *dev; 261 262 /* 263 * For whole disks, libdiskmgt does not include the leading dev path. 264 */ 265 dev = strrchr(path, '/'); 266 assert(dev != NULL); 267 dev++; 268 if ((desc = dm_get_descriptor_by_name(DM_ALIAS, dev, &err)) != NULL) { 269 err = check_disk(path, desc, force, isspare); 270 dm_free_descriptor(desc); 271 return (err); 272 } 273 274 return (check_slice(path, force, B_FALSE, isspare)); 275 } 276 277 /* 278 * Check that a file is valid. All we can do in this case is check that it's 279 * not in use by another pool, and not in use by swap. 280 */ 281 static int 282 check_file(const char *file, boolean_t force, boolean_t isspare) 283 { 284 char *name; 285 int fd; 286 int ret = 0; 287 int err; 288 pool_state_t state; 289 boolean_t inuse; 290 291 if (dm_inuse_swap(file, &err)) { 292 if (err) 293 libdiskmgt_error(err); 294 else 295 vdev_error(gettext("%s is currently used by swap. " 296 "Please see swap(1M).\n"), file); 297 return (-1); 298 } 299 300 if ((fd = open(file, O_RDONLY)) < 0) 301 return (0); 302 303 if (zpool_in_use(g_zfs, fd, &state, &name, &inuse) == 0 && inuse) { 304 const char *desc; 305 306 switch (state) { 307 case POOL_STATE_ACTIVE: 308 desc = gettext("active"); 309 break; 310 311 case POOL_STATE_EXPORTED: 312 desc = gettext("exported"); 313 break; 314 315 case POOL_STATE_POTENTIALLY_ACTIVE: 316 desc = gettext("potentially active"); 317 break; 318 319 default: 320 desc = gettext("unknown"); 321 break; 322 } 323 324 /* 325 * Allow hot spares to be shared between pools. 326 */ 327 if (state == POOL_STATE_SPARE && isspare) 328 return (0); 329 330 if (state == POOL_STATE_ACTIVE || 331 state == POOL_STATE_SPARE || !force) { 332 switch (state) { 333 case POOL_STATE_SPARE: 334 vdev_error(gettext("%s is reserved as a hot " 335 "spare for pool %s\n"), file, name); 336 break; 337 default: 338 vdev_error(gettext("%s is part of %s pool " 339 "'%s'\n"), file, desc, name); 340 break; 341 } 342 ret = -1; 343 } 344 345 free(name); 346 } 347 348 (void) close(fd); 349 return (ret); 350 } 351 352 353 /* 354 * By "whole disk" we mean an entire physical disk (something we can 355 * label, toggle the write cache on, etc.) as opposed to the full 356 * capacity of a pseudo-device such as lofi or did. We act as if we 357 * are labeling the disk, which should be a pretty good test of whether 358 * it's a viable device or not. Returns B_TRUE if it is and B_FALSE if 359 * it isn't. 360 */ 361 static boolean_t 362 is_whole_disk(const char *arg) 363 { 364 struct dk_gpt *label; 365 int fd; 366 char path[MAXPATHLEN]; 367 368 (void) snprintf(path, sizeof (path), "%s%s%s", 369 RDISK_ROOT, strrchr(arg, '/'), BACKUP_SLICE); 370 if ((fd = open(path, O_RDWR | O_NDELAY)) < 0) 371 return (B_FALSE); 372 if (efi_alloc_and_init(fd, EFI_NUMPAR, &label) != 0) { 373 (void) close(fd); 374 return (B_FALSE); 375 } 376 efi_free(label); 377 (void) close(fd); 378 return (B_TRUE); 379 } 380 381 /* 382 * Create a leaf vdev. Determine if this is a file or a device. If it's a 383 * device, fill in the device id to make a complete nvlist. Valid forms for a 384 * leaf vdev are: 385 * 386 * /dev/dsk/xxx Complete disk path 387 * /xxx Full path to file 388 * xxx Shorthand for /dev/dsk/xxx 389 */ 390 static nvlist_t * 391 make_leaf_vdev(const char *arg, uint64_t is_log) 392 { 393 char path[MAXPATHLEN]; 394 struct stat64 statbuf; 395 nvlist_t *vdev = NULL; 396 char *type = NULL; 397 boolean_t wholedisk = B_FALSE; 398 399 /* 400 * Determine what type of vdev this is, and put the full path into 401 * 'path'. We detect whether this is a device of file afterwards by 402 * checking the st_mode of the file. 403 */ 404 if (arg[0] == '/') { 405 /* 406 * Complete device or file path. Exact type is determined by 407 * examining the file descriptor afterwards. 408 */ 409 wholedisk = is_whole_disk(arg); 410 if (!wholedisk && (stat64(arg, &statbuf) != 0)) { 411 (void) fprintf(stderr, 412 gettext("cannot open '%s': %s\n"), 413 arg, strerror(errno)); 414 return (NULL); 415 } 416 417 (void) strlcpy(path, arg, sizeof (path)); 418 } else { 419 /* 420 * This may be a short path for a device, or it could be total 421 * gibberish. Check to see if it's a known device in 422 * /dev/dsk/. As part of this check, see if we've been given a 423 * an entire disk (minus the slice number). 424 */ 425 (void) snprintf(path, sizeof (path), "%s/%s", DISK_ROOT, 426 arg); 427 wholedisk = is_whole_disk(path); 428 if (!wholedisk && (stat64(path, &statbuf) != 0)) { 429 /* 430 * If we got ENOENT, then the user gave us 431 * gibberish, so try to direct them with a 432 * reasonable error message. Otherwise, 433 * regurgitate strerror() since it's the best we 434 * can do. 435 */ 436 if (errno == ENOENT) { 437 (void) fprintf(stderr, 438 gettext("cannot open '%s': no such " 439 "device in %s\n"), arg, DISK_ROOT); 440 (void) fprintf(stderr, 441 gettext("must be a full path or " 442 "shorthand device name\n")); 443 return (NULL); 444 } else { 445 (void) fprintf(stderr, 446 gettext("cannot open '%s': %s\n"), 447 path, strerror(errno)); 448 return (NULL); 449 } 450 } 451 } 452 453 /* 454 * Determine whether this is a device or a file. 455 */ 456 if (wholedisk || S_ISBLK(statbuf.st_mode)) { 457 type = VDEV_TYPE_DISK; 458 } else if (S_ISREG(statbuf.st_mode)) { 459 type = VDEV_TYPE_FILE; 460 } else { 461 (void) fprintf(stderr, gettext("cannot use '%s': must be a " 462 "block device or regular file\n"), path); 463 return (NULL); 464 } 465 466 /* 467 * Finally, we have the complete device or file, and we know that it is 468 * acceptable to use. Construct the nvlist to describe this vdev. All 469 * vdevs have a 'path' element, and devices also have a 'devid' element. 470 */ 471 verify(nvlist_alloc(&vdev, NV_UNIQUE_NAME, 0) == 0); 472 verify(nvlist_add_string(vdev, ZPOOL_CONFIG_PATH, path) == 0); 473 verify(nvlist_add_string(vdev, ZPOOL_CONFIG_TYPE, type) == 0); 474 verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_IS_LOG, is_log) == 0); 475 if (strcmp(type, VDEV_TYPE_DISK) == 0) 476 verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK, 477 (uint64_t)wholedisk) == 0); 478 479 /* 480 * For a whole disk, defer getting its devid until after labeling it. 481 */ 482 if (S_ISBLK(statbuf.st_mode) && !wholedisk) { 483 /* 484 * Get the devid for the device. 485 */ 486 int fd; 487 ddi_devid_t devid; 488 char *minor = NULL, *devid_str = NULL; 489 490 if ((fd = open(path, O_RDONLY)) < 0) { 491 (void) fprintf(stderr, gettext("cannot open '%s': " 492 "%s\n"), path, strerror(errno)); 493 nvlist_free(vdev); 494 return (NULL); 495 } 496 497 if (devid_get(fd, &devid) == 0) { 498 if (devid_get_minor_name(fd, &minor) == 0 && 499 (devid_str = devid_str_encode(devid, minor)) != 500 NULL) { 501 verify(nvlist_add_string(vdev, 502 ZPOOL_CONFIG_DEVID, devid_str) == 0); 503 } 504 if (devid_str != NULL) 505 devid_str_free(devid_str); 506 if (minor != NULL) 507 devid_str_free(minor); 508 devid_free(devid); 509 } 510 511 (void) close(fd); 512 } 513 514 return (vdev); 515 } 516 517 /* 518 * Go through and verify the replication level of the pool is consistent. 519 * Performs the following checks: 520 * 521 * For the new spec, verifies that devices in mirrors and raidz are the 522 * same size. 523 * 524 * If the current configuration already has inconsistent replication 525 * levels, ignore any other potential problems in the new spec. 526 * 527 * Otherwise, make sure that the current spec (if there is one) and the new 528 * spec have consistent replication levels. 529 */ 530 typedef struct replication_level { 531 char *zprl_type; 532 uint64_t zprl_children; 533 uint64_t zprl_parity; 534 } replication_level_t; 535 536 #define ZPOOL_FUZZ (16 * 1024 * 1024) 537 538 /* 539 * Given a list of toplevel vdevs, return the current replication level. If 540 * the config is inconsistent, then NULL is returned. If 'fatal' is set, then 541 * an error message will be displayed for each self-inconsistent vdev. 542 */ 543 static replication_level_t * 544 get_replication(nvlist_t *nvroot, boolean_t fatal) 545 { 546 nvlist_t **top; 547 uint_t t, toplevels; 548 nvlist_t **child; 549 uint_t c, children; 550 nvlist_t *nv; 551 char *type; 552 replication_level_t lastrep, rep, *ret; 553 boolean_t dontreport; 554 555 ret = safe_malloc(sizeof (replication_level_t)); 556 557 verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 558 &top, &toplevels) == 0); 559 560 lastrep.zprl_type = NULL; 561 for (t = 0; t < toplevels; t++) { 562 uint64_t is_log = B_FALSE; 563 564 nv = top[t]; 565 566 /* 567 * For separate logs we ignore the top level vdev replication 568 * constraints. 569 */ 570 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &is_log); 571 if (is_log) 572 continue; 573 574 verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, 575 &type) == 0); 576 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 577 &child, &children) != 0) { 578 /* 579 * This is a 'file' or 'disk' vdev. 580 */ 581 rep.zprl_type = type; 582 rep.zprl_children = 1; 583 rep.zprl_parity = 0; 584 } else { 585 uint64_t vdev_size; 586 587 /* 588 * This is a mirror or RAID-Z vdev. Go through and make 589 * sure the contents are all the same (files vs. disks), 590 * keeping track of the number of elements in the 591 * process. 592 * 593 * We also check that the size of each vdev (if it can 594 * be determined) is the same. 595 */ 596 rep.zprl_type = type; 597 rep.zprl_children = 0; 598 599 if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) { 600 verify(nvlist_lookup_uint64(nv, 601 ZPOOL_CONFIG_NPARITY, 602 &rep.zprl_parity) == 0); 603 assert(rep.zprl_parity != 0); 604 } else { 605 rep.zprl_parity = 0; 606 } 607 608 /* 609 * The 'dontreport' variable indicates that we've 610 * already reported an error for this spec, so don't 611 * bother doing it again. 612 */ 613 type = NULL; 614 dontreport = 0; 615 vdev_size = -1ULL; 616 for (c = 0; c < children; c++) { 617 nvlist_t *cnv = child[c]; 618 char *path; 619 struct stat64 statbuf; 620 uint64_t size = -1ULL; 621 char *childtype; 622 int fd, err; 623 624 rep.zprl_children++; 625 626 verify(nvlist_lookup_string(cnv, 627 ZPOOL_CONFIG_TYPE, &childtype) == 0); 628 629 /* 630 * If this is a replacing or spare vdev, then 631 * get the real first child of the vdev. 632 */ 633 if (strcmp(childtype, 634 VDEV_TYPE_REPLACING) == 0 || 635 strcmp(childtype, VDEV_TYPE_SPARE) == 0) { 636 nvlist_t **rchild; 637 uint_t rchildren; 638 639 verify(nvlist_lookup_nvlist_array(cnv, 640 ZPOOL_CONFIG_CHILDREN, &rchild, 641 &rchildren) == 0); 642 assert(rchildren == 2); 643 cnv = rchild[0]; 644 645 verify(nvlist_lookup_string(cnv, 646 ZPOOL_CONFIG_TYPE, 647 &childtype) == 0); 648 } 649 650 verify(nvlist_lookup_string(cnv, 651 ZPOOL_CONFIG_PATH, &path) == 0); 652 653 /* 654 * If we have a raidz/mirror that combines disks 655 * with files, report it as an error. 656 */ 657 if (!dontreport && type != NULL && 658 strcmp(type, childtype) != 0) { 659 if (ret != NULL) 660 free(ret); 661 ret = NULL; 662 if (fatal) 663 vdev_error(gettext( 664 "mismatched replication " 665 "level: %s contains both " 666 "files and devices\n"), 667 rep.zprl_type); 668 else 669 return (NULL); 670 dontreport = B_TRUE; 671 } 672 673 /* 674 * According to stat(2), the value of 'st_size' 675 * is undefined for block devices and character 676 * devices. But there is no effective way to 677 * determine the real size in userland. 678 * 679 * Instead, we'll take advantage of an 680 * implementation detail of spec_size(). If the 681 * device is currently open, then we (should) 682 * return a valid size. 683 * 684 * If we still don't get a valid size (indicated 685 * by a size of 0 or MAXOFFSET_T), then ignore 686 * this device altogether. 687 */ 688 if ((fd = open(path, O_RDONLY)) >= 0) { 689 err = fstat64(fd, &statbuf); 690 (void) close(fd); 691 } else { 692 err = stat64(path, &statbuf); 693 } 694 695 if (err != 0 || 696 statbuf.st_size == 0 || 697 statbuf.st_size == MAXOFFSET_T) 698 continue; 699 700 size = statbuf.st_size; 701 702 /* 703 * Also make sure that devices and 704 * slices have a consistent size. If 705 * they differ by a significant amount 706 * (~16MB) then report an error. 707 */ 708 if (!dontreport && 709 (vdev_size != -1ULL && 710 (labs(size - vdev_size) > 711 ZPOOL_FUZZ))) { 712 if (ret != NULL) 713 free(ret); 714 ret = NULL; 715 if (fatal) 716 vdev_error(gettext( 717 "%s contains devices of " 718 "different sizes\n"), 719 rep.zprl_type); 720 else 721 return (NULL); 722 dontreport = B_TRUE; 723 } 724 725 type = childtype; 726 vdev_size = size; 727 } 728 } 729 730 /* 731 * At this point, we have the replication of the last toplevel 732 * vdev in 'rep'. Compare it to 'lastrep' to see if its 733 * different. 734 */ 735 if (lastrep.zprl_type != NULL) { 736 if (strcmp(lastrep.zprl_type, rep.zprl_type) != 0) { 737 if (ret != NULL) 738 free(ret); 739 ret = NULL; 740 if (fatal) 741 vdev_error(gettext( 742 "mismatched replication level: " 743 "both %s and %s vdevs are " 744 "present\n"), 745 lastrep.zprl_type, rep.zprl_type); 746 else 747 return (NULL); 748 } else if (lastrep.zprl_parity != rep.zprl_parity) { 749 if (ret) 750 free(ret); 751 ret = NULL; 752 if (fatal) 753 vdev_error(gettext( 754 "mismatched replication level: " 755 "both %llu and %llu device parity " 756 "%s vdevs are present\n"), 757 lastrep.zprl_parity, 758 rep.zprl_parity, 759 rep.zprl_type); 760 else 761 return (NULL); 762 } else if (lastrep.zprl_children != rep.zprl_children) { 763 if (ret) 764 free(ret); 765 ret = NULL; 766 if (fatal) 767 vdev_error(gettext( 768 "mismatched replication level: " 769 "both %llu-way and %llu-way %s " 770 "vdevs are present\n"), 771 lastrep.zprl_children, 772 rep.zprl_children, 773 rep.zprl_type); 774 else 775 return (NULL); 776 } 777 } 778 lastrep = rep; 779 } 780 781 if (ret != NULL) 782 *ret = rep; 783 784 return (ret); 785 } 786 787 /* 788 * Check the replication level of the vdev spec against the current pool. Calls 789 * get_replication() to make sure the new spec is self-consistent. If the pool 790 * has a consistent replication level, then we ignore any errors. Otherwise, 791 * report any difference between the two. 792 */ 793 static int 794 check_replication(nvlist_t *config, nvlist_t *newroot) 795 { 796 nvlist_t **child; 797 uint_t children; 798 replication_level_t *current = NULL, *new; 799 int ret; 800 801 /* 802 * If we have a current pool configuration, check to see if it's 803 * self-consistent. If not, simply return success. 804 */ 805 if (config != NULL) { 806 nvlist_t *nvroot; 807 808 verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 809 &nvroot) == 0); 810 if ((current = get_replication(nvroot, B_FALSE)) == NULL) 811 return (0); 812 } 813 /* 814 * for spares there may be no children, and therefore no 815 * replication level to check 816 */ 817 if ((nvlist_lookup_nvlist_array(newroot, ZPOOL_CONFIG_CHILDREN, 818 &child, &children) != 0) || (children == 0)) { 819 free(current); 820 return (0); 821 } 822 823 /* 824 * If all we have is logs then there's no replication level to check. 825 */ 826 if (num_logs(newroot) == children) { 827 free(current); 828 return (0); 829 } 830 831 /* 832 * Get the replication level of the new vdev spec, reporting any 833 * inconsistencies found. 834 */ 835 if ((new = get_replication(newroot, B_TRUE)) == NULL) { 836 free(current); 837 return (-1); 838 } 839 840 /* 841 * Check to see if the new vdev spec matches the replication level of 842 * the current pool. 843 */ 844 ret = 0; 845 if (current != NULL) { 846 if (strcmp(current->zprl_type, new->zprl_type) != 0) { 847 vdev_error(gettext( 848 "mismatched replication level: pool uses %s " 849 "and new vdev is %s\n"), 850 current->zprl_type, new->zprl_type); 851 ret = -1; 852 } else if (current->zprl_parity != new->zprl_parity) { 853 vdev_error(gettext( 854 "mismatched replication level: pool uses %llu " 855 "device parity and new vdev uses %llu\n"), 856 current->zprl_parity, new->zprl_parity); 857 ret = -1; 858 } else if (current->zprl_children != new->zprl_children) { 859 vdev_error(gettext( 860 "mismatched replication level: pool uses %llu-way " 861 "%s and new vdev uses %llu-way %s\n"), 862 current->zprl_children, current->zprl_type, 863 new->zprl_children, new->zprl_type); 864 ret = -1; 865 } 866 } 867 868 free(new); 869 if (current != NULL) 870 free(current); 871 872 return (ret); 873 } 874 875 /* 876 * Go through and find any whole disks in the vdev specification, labelling them 877 * as appropriate. When constructing the vdev spec, we were unable to open this 878 * device in order to provide a devid. Now that we have labelled the disk and 879 * know that slice 0 is valid, we can construct the devid now. 880 * 881 * If the disk was already labeled with an EFI label, we will have gotten the 882 * devid already (because we were able to open the whole disk). Otherwise, we 883 * need to get the devid after we label the disk. 884 */ 885 static int 886 make_disks(zpool_handle_t *zhp, nvlist_t *nv) 887 { 888 nvlist_t **child; 889 uint_t c, children; 890 char *type, *path, *diskname; 891 char buf[MAXPATHLEN]; 892 uint64_t wholedisk; 893 int fd; 894 int ret; 895 ddi_devid_t devid; 896 char *minor = NULL, *devid_str = NULL; 897 898 verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0); 899 900 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 901 &child, &children) != 0) { 902 903 if (strcmp(type, VDEV_TYPE_DISK) != 0) 904 return (0); 905 906 /* 907 * We have a disk device. Get the path to the device 908 * and see if it's a whole disk by appending the backup 909 * slice and stat()ing the device. 910 */ 911 verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0); 912 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, 913 &wholedisk) != 0 || !wholedisk) 914 return (0); 915 916 diskname = strrchr(path, '/'); 917 assert(diskname != NULL); 918 diskname++; 919 if (zpool_label_disk(g_zfs, zhp, diskname) == -1) 920 return (-1); 921 922 /* 923 * Fill in the devid, now that we've labeled the disk. 924 */ 925 (void) snprintf(buf, sizeof (buf), "%ss0", path); 926 if ((fd = open(buf, O_RDONLY)) < 0) { 927 (void) fprintf(stderr, 928 gettext("cannot open '%s': %s\n"), 929 buf, strerror(errno)); 930 return (-1); 931 } 932 933 if (devid_get(fd, &devid) == 0) { 934 if (devid_get_minor_name(fd, &minor) == 0 && 935 (devid_str = devid_str_encode(devid, minor)) != 936 NULL) { 937 verify(nvlist_add_string(nv, 938 ZPOOL_CONFIG_DEVID, devid_str) == 0); 939 } 940 if (devid_str != NULL) 941 devid_str_free(devid_str); 942 if (minor != NULL) 943 devid_str_free(minor); 944 devid_free(devid); 945 } 946 947 /* 948 * Update the path to refer to the 's0' slice. The presence of 949 * the 'whole_disk' field indicates to the CLI that we should 950 * chop off the slice number when displaying the device in 951 * future output. 952 */ 953 verify(nvlist_add_string(nv, ZPOOL_CONFIG_PATH, buf) == 0); 954 955 (void) close(fd); 956 957 return (0); 958 } 959 960 for (c = 0; c < children; c++) 961 if ((ret = make_disks(zhp, child[c])) != 0) 962 return (ret); 963 964 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, 965 &child, &children) == 0) 966 for (c = 0; c < children; c++) 967 if ((ret = make_disks(zhp, child[c])) != 0) 968 return (ret); 969 970 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, 971 &child, &children) == 0) 972 for (c = 0; c < children; c++) 973 if ((ret = make_disks(zhp, child[c])) != 0) 974 return (ret); 975 976 return (0); 977 } 978 979 /* 980 * Determine if the given path is a hot spare within the given configuration. 981 */ 982 static boolean_t 983 is_spare(nvlist_t *config, const char *path) 984 { 985 int fd; 986 pool_state_t state; 987 char *name = NULL; 988 nvlist_t *label; 989 uint64_t guid, spareguid; 990 nvlist_t *nvroot; 991 nvlist_t **spares; 992 uint_t i, nspares; 993 boolean_t inuse; 994 995 if ((fd = open(path, O_RDONLY)) < 0) 996 return (B_FALSE); 997 998 if (zpool_in_use(g_zfs, fd, &state, &name, &inuse) != 0 || 999 !inuse || 1000 state != POOL_STATE_SPARE || 1001 zpool_read_label(fd, &label) != 0) { 1002 free(name); 1003 (void) close(fd); 1004 return (B_FALSE); 1005 } 1006 free(name); 1007 1008 (void) close(fd); 1009 verify(nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) == 0); 1010 nvlist_free(label); 1011 1012 verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 1013 &nvroot) == 0); 1014 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 1015 &spares, &nspares) == 0) { 1016 for (i = 0; i < nspares; i++) { 1017 verify(nvlist_lookup_uint64(spares[i], 1018 ZPOOL_CONFIG_GUID, &spareguid) == 0); 1019 if (spareguid == guid) 1020 return (B_TRUE); 1021 } 1022 } 1023 1024 return (B_FALSE); 1025 } 1026 1027 /* 1028 * Go through and find any devices that are in use. We rely on libdiskmgt for 1029 * the majority of this task. 1030 */ 1031 static int 1032 check_in_use(nvlist_t *config, nvlist_t *nv, int force, int isreplacing, 1033 int isspare) 1034 { 1035 nvlist_t **child; 1036 uint_t c, children; 1037 char *type, *path; 1038 int ret; 1039 char buf[MAXPATHLEN]; 1040 uint64_t wholedisk; 1041 1042 verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0); 1043 1044 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 1045 &child, &children) != 0) { 1046 1047 verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0); 1048 1049 /* 1050 * As a generic check, we look to see if this is a replace of a 1051 * hot spare within the same pool. If so, we allow it 1052 * regardless of what libdiskmgt or zpool_in_use() says. 1053 */ 1054 if (isreplacing) { 1055 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, 1056 &wholedisk) == 0 && wholedisk) 1057 (void) snprintf(buf, sizeof (buf), "%ss0", 1058 path); 1059 else 1060 (void) strlcpy(buf, path, sizeof (buf)); 1061 if (is_spare(config, buf)) 1062 return (0); 1063 } 1064 1065 if (strcmp(type, VDEV_TYPE_DISK) == 0) 1066 ret = check_device(path, force, isspare); 1067 1068 if (strcmp(type, VDEV_TYPE_FILE) == 0) 1069 ret = check_file(path, force, isspare); 1070 1071 return (ret); 1072 } 1073 1074 for (c = 0; c < children; c++) 1075 if ((ret = check_in_use(config, child[c], force, 1076 isreplacing, B_FALSE)) != 0) 1077 return (ret); 1078 1079 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, 1080 &child, &children) == 0) 1081 for (c = 0; c < children; c++) 1082 if ((ret = check_in_use(config, child[c], force, 1083 isreplacing, B_TRUE)) != 0) 1084 return (ret); 1085 1086 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, 1087 &child, &children) == 0) 1088 for (c = 0; c < children; c++) 1089 if ((ret = check_in_use(config, child[c], force, 1090 isreplacing, B_FALSE)) != 0) 1091 return (ret); 1092 1093 return (0); 1094 } 1095 1096 static const char * 1097 is_grouping(const char *type, int *mindev, int *maxdev) 1098 { 1099 if (strncmp(type, "raidz", 5) == 0) { 1100 const char *p = type + 5; 1101 char *end; 1102 long nparity; 1103 1104 if (*p == '\0') { 1105 nparity = 1; 1106 } else if (*p == '0') { 1107 return (NULL); /* no zero prefixes allowed */ 1108 } else { 1109 errno = 0; 1110 nparity = strtol(p, &end, 10); 1111 if (errno != 0 || nparity < 1 || nparity >= 255 || 1112 *end != '\0') 1113 return (NULL); 1114 } 1115 1116 if (mindev != NULL) 1117 *mindev = nparity + 1; 1118 if (maxdev != NULL) 1119 *maxdev = 255; 1120 return (VDEV_TYPE_RAIDZ); 1121 } 1122 1123 if (maxdev != NULL) 1124 *maxdev = INT_MAX; 1125 1126 if (strcmp(type, "mirror") == 0) { 1127 if (mindev != NULL) 1128 *mindev = 2; 1129 return (VDEV_TYPE_MIRROR); 1130 } 1131 1132 if (strcmp(type, "spare") == 0) { 1133 if (mindev != NULL) 1134 *mindev = 1; 1135 return (VDEV_TYPE_SPARE); 1136 } 1137 1138 if (strcmp(type, "log") == 0) { 1139 if (mindev != NULL) 1140 *mindev = 1; 1141 return (VDEV_TYPE_LOG); 1142 } 1143 1144 if (strcmp(type, "cache") == 0) { 1145 if (mindev != NULL) 1146 *mindev = 1; 1147 return (VDEV_TYPE_L2CACHE); 1148 } 1149 1150 return (NULL); 1151 } 1152 1153 /* 1154 * Construct a syntactically valid vdev specification, 1155 * and ensure that all devices and files exist and can be opened. 1156 * Note: we don't bother freeing anything in the error paths 1157 * because the program is just going to exit anyway. 1158 */ 1159 nvlist_t * 1160 construct_spec(int argc, char **argv) 1161 { 1162 nvlist_t *nvroot, *nv, **top, **spares, **l2cache; 1163 int t, toplevels, mindev, maxdev, nspares, nlogs, nl2cache; 1164 const char *type; 1165 uint64_t is_log; 1166 boolean_t seen_logs; 1167 1168 top = NULL; 1169 toplevels = 0; 1170 spares = NULL; 1171 l2cache = NULL; 1172 nspares = 0; 1173 nlogs = 0; 1174 nl2cache = 0; 1175 is_log = B_FALSE; 1176 seen_logs = B_FALSE; 1177 1178 while (argc > 0) { 1179 nv = NULL; 1180 1181 /* 1182 * If it's a mirror or raidz, the subsequent arguments are 1183 * its leaves -- until we encounter the next mirror or raidz. 1184 */ 1185 if ((type = is_grouping(argv[0], &mindev, &maxdev)) != NULL) { 1186 nvlist_t **child = NULL; 1187 int c, children = 0; 1188 1189 if (strcmp(type, VDEV_TYPE_SPARE) == 0) { 1190 if (spares != NULL) { 1191 (void) fprintf(stderr, 1192 gettext("invalid vdev " 1193 "specification: 'spare' can be " 1194 "specified only once\n")); 1195 return (NULL); 1196 } 1197 is_log = B_FALSE; 1198 } 1199 1200 if (strcmp(type, VDEV_TYPE_LOG) == 0) { 1201 if (seen_logs) { 1202 (void) fprintf(stderr, 1203 gettext("invalid vdev " 1204 "specification: 'log' can be " 1205 "specified only once\n")); 1206 return (NULL); 1207 } 1208 seen_logs = B_TRUE; 1209 is_log = B_TRUE; 1210 argc--; 1211 argv++; 1212 /* 1213 * A log is not a real grouping device. 1214 * We just set is_log and continue. 1215 */ 1216 continue; 1217 } 1218 1219 if (strcmp(type, VDEV_TYPE_L2CACHE) == 0) { 1220 if (l2cache != NULL) { 1221 (void) fprintf(stderr, 1222 gettext("invalid vdev " 1223 "specification: 'cache' can be " 1224 "specified only once\n")); 1225 return (NULL); 1226 } 1227 is_log = B_FALSE; 1228 } 1229 1230 if (is_log) { 1231 if (strcmp(type, VDEV_TYPE_MIRROR) != 0) { 1232 (void) fprintf(stderr, 1233 gettext("invalid vdev " 1234 "specification: unsupported 'log' " 1235 "device: %s\n"), type); 1236 return (NULL); 1237 } 1238 nlogs++; 1239 } 1240 1241 for (c = 1; c < argc; c++) { 1242 if (is_grouping(argv[c], NULL, NULL) != NULL) 1243 break; 1244 children++; 1245 child = realloc(child, 1246 children * sizeof (nvlist_t *)); 1247 if (child == NULL) 1248 zpool_no_memory(); 1249 if ((nv = make_leaf_vdev(argv[c], B_FALSE)) 1250 == NULL) 1251 return (NULL); 1252 child[children - 1] = nv; 1253 } 1254 1255 if (children < mindev) { 1256 (void) fprintf(stderr, gettext("invalid vdev " 1257 "specification: %s requires at least %d " 1258 "devices\n"), argv[0], mindev); 1259 return (NULL); 1260 } 1261 1262 if (children > maxdev) { 1263 (void) fprintf(stderr, gettext("invalid vdev " 1264 "specification: %s supports no more than " 1265 "%d devices\n"), argv[0], maxdev); 1266 return (NULL); 1267 } 1268 1269 argc -= c; 1270 argv += c; 1271 1272 if (strcmp(type, VDEV_TYPE_SPARE) == 0) { 1273 spares = child; 1274 nspares = children; 1275 continue; 1276 } else if (strcmp(type, VDEV_TYPE_L2CACHE) == 0) { 1277 l2cache = child; 1278 nl2cache = children; 1279 continue; 1280 } else { 1281 verify(nvlist_alloc(&nv, NV_UNIQUE_NAME, 1282 0) == 0); 1283 verify(nvlist_add_string(nv, ZPOOL_CONFIG_TYPE, 1284 type) == 0); 1285 verify(nvlist_add_uint64(nv, 1286 ZPOOL_CONFIG_IS_LOG, is_log) == 0); 1287 if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) { 1288 verify(nvlist_add_uint64(nv, 1289 ZPOOL_CONFIG_NPARITY, 1290 mindev - 1) == 0); 1291 } 1292 verify(nvlist_add_nvlist_array(nv, 1293 ZPOOL_CONFIG_CHILDREN, child, 1294 children) == 0); 1295 1296 for (c = 0; c < children; c++) 1297 nvlist_free(child[c]); 1298 free(child); 1299 } 1300 } else { 1301 /* 1302 * We have a device. Pass off to make_leaf_vdev() to 1303 * construct the appropriate nvlist describing the vdev. 1304 */ 1305 if ((nv = make_leaf_vdev(argv[0], is_log)) == NULL) 1306 return (NULL); 1307 if (is_log) 1308 nlogs++; 1309 argc--; 1310 argv++; 1311 } 1312 1313 toplevels++; 1314 top = realloc(top, toplevels * sizeof (nvlist_t *)); 1315 if (top == NULL) 1316 zpool_no_memory(); 1317 top[toplevels - 1] = nv; 1318 } 1319 1320 if (toplevels == 0 && nspares == 0 && nl2cache == 0) { 1321 (void) fprintf(stderr, gettext("invalid vdev " 1322 "specification: at least one toplevel vdev must be " 1323 "specified\n")); 1324 return (NULL); 1325 } 1326 1327 if (seen_logs && nlogs == 0) { 1328 (void) fprintf(stderr, gettext("invalid vdev specification: " 1329 "log requires at least 1 device\n")); 1330 return (NULL); 1331 } 1332 1333 /* 1334 * Finally, create nvroot and add all top-level vdevs to it. 1335 */ 1336 verify(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) == 0); 1337 verify(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, 1338 VDEV_TYPE_ROOT) == 0); 1339 verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 1340 top, toplevels) == 0); 1341 if (nspares != 0) 1342 verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 1343 spares, nspares) == 0); 1344 if (nl2cache != 0) 1345 verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 1346 l2cache, nl2cache) == 0); 1347 1348 for (t = 0; t < toplevels; t++) 1349 nvlist_free(top[t]); 1350 for (t = 0; t < nspares; t++) 1351 nvlist_free(spares[t]); 1352 for (t = 0; t < nl2cache; t++) 1353 nvlist_free(l2cache[t]); 1354 if (spares) 1355 free(spares); 1356 if (l2cache) 1357 free(l2cache); 1358 free(top); 1359 1360 return (nvroot); 1361 } 1362 1363 1364 /* 1365 * Get and validate the contents of the given vdev specification. This ensures 1366 * that the nvlist returned is well-formed, that all the devices exist, and that 1367 * they are not currently in use by any other known consumer. The 'poolconfig' 1368 * parameter is the current configuration of the pool when adding devices 1369 * existing pool, and is used to perform additional checks, such as changing the 1370 * replication level of the pool. It can be 'NULL' to indicate that this is a 1371 * new pool. The 'force' flag controls whether devices should be forcefully 1372 * added, even if they appear in use. 1373 */ 1374 nvlist_t * 1375 make_root_vdev(zpool_handle_t *zhp, int force, int check_rep, 1376 boolean_t isreplacing, boolean_t dryrun, int argc, char **argv) 1377 { 1378 nvlist_t *newroot; 1379 nvlist_t *poolconfig = NULL; 1380 is_force = force; 1381 1382 /* 1383 * Construct the vdev specification. If this is successful, we know 1384 * that we have a valid specification, and that all devices can be 1385 * opened. 1386 */ 1387 if ((newroot = construct_spec(argc, argv)) == NULL) 1388 return (NULL); 1389 1390 if (zhp && ((poolconfig = zpool_get_config(zhp, NULL)) == NULL)) 1391 return (NULL); 1392 1393 /* 1394 * Validate each device to make sure that its not shared with another 1395 * subsystem. We do this even if 'force' is set, because there are some 1396 * uses (such as a dedicated dump device) that even '-f' cannot 1397 * override. 1398 */ 1399 if (check_in_use(poolconfig, newroot, force, isreplacing, 1400 B_FALSE) != 0) { 1401 nvlist_free(newroot); 1402 return (NULL); 1403 } 1404 1405 /* 1406 * Check the replication level of the given vdevs and report any errors 1407 * found. We include the existing pool spec, if any, as we need to 1408 * catch changes against the existing replication level. 1409 */ 1410 if (check_rep && check_replication(poolconfig, newroot) != 0) { 1411 nvlist_free(newroot); 1412 return (NULL); 1413 } 1414 1415 /* 1416 * Run through the vdev specification and label any whole disks found. 1417 */ 1418 if (!dryrun && make_disks(zhp, newroot) != 0) { 1419 nvlist_free(newroot); 1420 return (NULL); 1421 } 1422 1423 return (newroot); 1424 } 1425