/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Functions to convert between a list of vdevs and an nvlist representing the
 * configuration. Each entry in the list can be one of:
 *
 *	Device vdevs
 *		disk=(path=..., devid=...)
 *		file=(path=...)
 *
 *	Group vdevs
 *		raidz[1|2]=(...)
 *		mirror=(...)
 *
 *	Hot spares
 *
 * While the underlying implementation supports it, group vdevs cannot contain
 * other group vdevs. All userland verification of devices is contained within
 * this file. If successful, the nvlist returned can be passed directly to the
 * kernel; we've done as much verification as possible in userland.
 *
 * Hot spares are a special case, and passed down as an array of disk vdevs, at
 * the same level as the root of the vdev tree.
 *
 * The only function exported by this file is 'make_root_vdev'. The
 * function performs several passes:
 *
 *	1. Construct the vdev specification. Performs syntax validation and
 *	   makes sure each device is valid.
 *	2. Check for devices in use.
Using libdiskmgt, makes sure that no 55 * devices are also in use. Some can be overridden using the 'force' 56 * flag, others cannot. 57 * 3. Check for replication errors if the 'force' flag is not specified. 58 * validates that the replication level is consistent across the 59 * entire pool. 60 * 4. Call libzfs to label any whole disks with an EFI label. 61 */ 62 63 #include <assert.h> 64 #include <devid.h> 65 #include <errno.h> 66 #include <fcntl.h> 67 #include <libdiskmgt.h> 68 #include <libintl.h> 69 #include <libnvpair.h> 70 #include <stdio.h> 71 #include <string.h> 72 #include <unistd.h> 73 #include <sys/efi_partition.h> 74 #include <sys/stat.h> 75 #include <sys/vtoc.h> 76 #include <sys/mntent.h> 77 78 #include "zpool_util.h" 79 80 #define DISK_ROOT "/dev/dsk" 81 #define RDISK_ROOT "/dev/rdsk" 82 #define BACKUP_SLICE "s2" 83 84 /* 85 * For any given vdev specification, we can have multiple errors. The 86 * vdev_error() function keeps track of whether we have seen an error yet, and 87 * prints out a header if its the first error we've seen. 88 */ 89 boolean_t error_seen; 90 boolean_t is_force; 91 92 /*PRINTFLIKE1*/ 93 static void 94 vdev_error(const char *fmt, ...) 95 { 96 va_list ap; 97 98 if (!error_seen) { 99 (void) fprintf(stderr, gettext("invalid vdev specification\n")); 100 if (!is_force) 101 (void) fprintf(stderr, gettext("use '-f' to override " 102 "the following errors:\n")); 103 else 104 (void) fprintf(stderr, gettext("the following errors " 105 "must be manually repaired:\n")); 106 error_seen = B_TRUE; 107 } 108 109 va_start(ap, fmt); 110 (void) vfprintf(stderr, fmt, ap); 111 va_end(ap); 112 } 113 114 static void 115 libdiskmgt_error(int error) 116 { 117 /* 118 * ENXIO/ENODEV is a valid error message if the device doesn't live in 119 * /dev/dsk. Don't bother printing an error message in this case. 
120 */ 121 if (error == ENXIO || error == ENODEV) 122 return; 123 124 (void) fprintf(stderr, gettext("warning: device in use checking " 125 "failed: %s\n"), strerror(error)); 126 } 127 128 /* 129 * Validate a device, passing the bulk of the work off to libdiskmgt. 130 */ 131 static int 132 check_slice(const char *path, int force, boolean_t wholedisk, boolean_t isspare) 133 { 134 char *msg; 135 int error = 0; 136 dm_who_type_t who; 137 138 if (force) 139 who = DM_WHO_ZPOOL_FORCE; 140 else if (isspare) 141 who = DM_WHO_ZPOOL_SPARE; 142 else 143 who = DM_WHO_ZPOOL; 144 145 if (dm_inuse((char *)path, &msg, who, &error) || error) { 146 if (error != 0) { 147 libdiskmgt_error(error); 148 return (0); 149 } else { 150 vdev_error("%s", msg); 151 free(msg); 152 return (-1); 153 } 154 } 155 156 /* 157 * If we're given a whole disk, ignore overlapping slices since we're 158 * about to label it anyway. 159 */ 160 error = 0; 161 if (!wholedisk && !force && 162 (dm_isoverlapping((char *)path, &msg, &error) || error)) { 163 if (error == 0) { 164 /* dm_isoverlapping returned -1 */ 165 vdev_error(gettext("%s overlaps with %s\n"), path, msg); 166 free(msg); 167 return (-1); 168 } else if (error != ENODEV) { 169 /* libdiskmgt's devcache only handles physical drives */ 170 libdiskmgt_error(error); 171 return (0); 172 } 173 } 174 175 return (0); 176 } 177 178 179 /* 180 * Validate a whole disk. Iterate over all slices on the disk and make sure 181 * that none is in use by calling check_slice(). 182 */ 183 static int 184 check_disk(const char *name, dm_descriptor_t disk, int force, int isspare) 185 { 186 dm_descriptor_t *drive, *media, *slice; 187 int err = 0; 188 int i; 189 int ret; 190 191 /* 192 * Get the drive associated with this disk. This should never fail, 193 * because we already have an alias handle open for the device. 
194 */ 195 if ((drive = dm_get_associated_descriptors(disk, DM_DRIVE, 196 &err)) == NULL || *drive == NULL) { 197 if (err) 198 libdiskmgt_error(err); 199 return (0); 200 } 201 202 if ((media = dm_get_associated_descriptors(*drive, DM_MEDIA, 203 &err)) == NULL) { 204 dm_free_descriptors(drive); 205 if (err) 206 libdiskmgt_error(err); 207 return (0); 208 } 209 210 dm_free_descriptors(drive); 211 212 /* 213 * It is possible that the user has specified a removable media drive, 214 * and the media is not present. 215 */ 216 if (*media == NULL) { 217 dm_free_descriptors(media); 218 vdev_error(gettext("'%s' has no media in drive\n"), name); 219 return (-1); 220 } 221 222 if ((slice = dm_get_associated_descriptors(*media, DM_SLICE, 223 &err)) == NULL) { 224 dm_free_descriptors(media); 225 if (err) 226 libdiskmgt_error(err); 227 return (0); 228 } 229 230 dm_free_descriptors(media); 231 232 ret = 0; 233 234 /* 235 * Iterate over all slices and report any errors. We don't care about 236 * overlapping slices because we are using the whole disk. 237 */ 238 for (i = 0; slice[i] != NULL; i++) { 239 char *name = dm_get_name(slice[i], &err); 240 241 if (check_slice(name, force, B_TRUE, isspare) != 0) 242 ret = -1; 243 244 dm_free_name(name); 245 } 246 247 dm_free_descriptors(slice); 248 return (ret); 249 } 250 251 /* 252 * Validate a device. 253 */ 254 static int 255 check_device(const char *path, boolean_t force, boolean_t isspare) 256 { 257 dm_descriptor_t desc; 258 int err; 259 char *dev; 260 261 /* 262 * For whole disks, libdiskmgt does not include the leading dev path. 263 */ 264 dev = strrchr(path, '/'); 265 assert(dev != NULL); 266 dev++; 267 if ((desc = dm_get_descriptor_by_name(DM_ALIAS, dev, &err)) != NULL) { 268 err = check_disk(path, desc, force, isspare); 269 dm_free_descriptor(desc); 270 return (err); 271 } 272 273 return (check_slice(path, force, B_FALSE, isspare)); 274 } 275 276 /* 277 * Check that a file is valid. 
All we can do in this case is check that it's 278 * not in use by another pool, and not in use by swap. 279 */ 280 static int 281 check_file(const char *file, boolean_t force, boolean_t isspare) 282 { 283 char *name; 284 int fd; 285 int ret = 0; 286 int err; 287 pool_state_t state; 288 boolean_t inuse; 289 290 if (dm_inuse_swap(file, &err)) { 291 if (err) 292 libdiskmgt_error(err); 293 else 294 vdev_error(gettext("%s is currently used by swap. " 295 "Please see swap(1M).\n"), file); 296 return (-1); 297 } 298 299 if ((fd = open(file, O_RDONLY)) < 0) 300 return (0); 301 302 if (zpool_in_use(g_zfs, fd, &state, &name, &inuse) == 0 && inuse) { 303 const char *desc; 304 305 switch (state) { 306 case POOL_STATE_ACTIVE: 307 desc = gettext("active"); 308 break; 309 310 case POOL_STATE_EXPORTED: 311 desc = gettext("exported"); 312 break; 313 314 case POOL_STATE_POTENTIALLY_ACTIVE: 315 desc = gettext("potentially active"); 316 break; 317 318 default: 319 desc = gettext("unknown"); 320 break; 321 } 322 323 /* 324 * Allow hot spares to be shared between pools. 325 */ 326 if (state == POOL_STATE_SPARE && isspare) 327 return (0); 328 329 if (state == POOL_STATE_ACTIVE || 330 state == POOL_STATE_SPARE || !force) { 331 switch (state) { 332 case POOL_STATE_SPARE: 333 vdev_error(gettext("%s is reserved as a hot " 334 "spare for pool %s\n"), file, name); 335 break; 336 default: 337 vdev_error(gettext("%s is part of %s pool " 338 "'%s'\n"), file, desc, name); 339 break; 340 } 341 ret = -1; 342 } 343 344 free(name); 345 } 346 347 (void) close(fd); 348 return (ret); 349 } 350 351 352 /* 353 * By "whole disk" we mean an entire physical disk (something we can 354 * label, toggle the write cache on, etc.) as opposed to the full 355 * capacity of a pseudo-device such as lofi or did. We act as if we 356 * are labeling the disk, which should be a pretty good test of whether 357 * it's a viable device or not. Returns B_TRUE if it is and B_FALSE if 358 * it isn't. 
359 */ 360 static boolean_t 361 is_whole_disk(const char *arg) 362 { 363 struct dk_gpt *label; 364 int fd; 365 char path[MAXPATHLEN]; 366 367 (void) snprintf(path, sizeof (path), "%s%s%s", 368 RDISK_ROOT, strrchr(arg, '/'), BACKUP_SLICE); 369 if ((fd = open(path, O_RDWR | O_NDELAY)) < 0) 370 return (B_FALSE); 371 if (efi_alloc_and_init(fd, EFI_NUMPAR, &label) != 0) { 372 (void) close(fd); 373 return (B_FALSE); 374 } 375 efi_free(label); 376 (void) close(fd); 377 return (B_TRUE); 378 } 379 380 /* 381 * Create a leaf vdev. Determine if this is a file or a device. If it's a 382 * device, fill in the device id to make a complete nvlist. Valid forms for a 383 * leaf vdev are: 384 * 385 * /dev/dsk/xxx Complete disk path 386 * /xxx Full path to file 387 * xxx Shorthand for /dev/dsk/xxx 388 */ 389 static nvlist_t * 390 make_leaf_vdev(const char *arg, uint64_t is_log) 391 { 392 char path[MAXPATHLEN]; 393 struct stat64 statbuf; 394 nvlist_t *vdev = NULL; 395 char *type = NULL; 396 boolean_t wholedisk = B_FALSE; 397 398 /* 399 * Determine what type of vdev this is, and put the full path into 400 * 'path'. We detect whether this is a device of file afterwards by 401 * checking the st_mode of the file. 402 */ 403 if (arg[0] == '/') { 404 /* 405 * Complete device or file path. Exact type is determined by 406 * examining the file descriptor afterwards. 407 */ 408 wholedisk = is_whole_disk(arg); 409 if (!wholedisk && (stat64(arg, &statbuf) != 0)) { 410 (void) fprintf(stderr, 411 gettext("cannot open '%s': %s\n"), 412 arg, strerror(errno)); 413 return (NULL); 414 } 415 416 (void) strlcpy(path, arg, sizeof (path)); 417 } else { 418 /* 419 * This may be a short path for a device, or it could be total 420 * gibberish. Check to see if it's a known device in 421 * /dev/dsk/. As part of this check, see if we've been given a 422 * an entire disk (minus the slice number). 
423 */ 424 (void) snprintf(path, sizeof (path), "%s/%s", DISK_ROOT, 425 arg); 426 wholedisk = is_whole_disk(path); 427 if (!wholedisk && (stat64(path, &statbuf) != 0)) { 428 /* 429 * If we got ENOENT, then the user gave us 430 * gibberish, so try to direct them with a 431 * reasonable error message. Otherwise, 432 * regurgitate strerror() since it's the best we 433 * can do. 434 */ 435 if (errno == ENOENT) { 436 (void) fprintf(stderr, 437 gettext("cannot open '%s': no such " 438 "device in %s\n"), arg, DISK_ROOT); 439 (void) fprintf(stderr, 440 gettext("must be a full path or " 441 "shorthand device name\n")); 442 return (NULL); 443 } else { 444 (void) fprintf(stderr, 445 gettext("cannot open '%s': %s\n"), 446 path, strerror(errno)); 447 return (NULL); 448 } 449 } 450 } 451 452 /* 453 * Determine whether this is a device or a file. 454 */ 455 if (wholedisk || S_ISBLK(statbuf.st_mode)) { 456 type = VDEV_TYPE_DISK; 457 } else if (S_ISREG(statbuf.st_mode)) { 458 type = VDEV_TYPE_FILE; 459 } else { 460 (void) fprintf(stderr, gettext("cannot use '%s': must be a " 461 "block device or regular file\n"), path); 462 return (NULL); 463 } 464 465 /* 466 * Finally, we have the complete device or file, and we know that it is 467 * acceptable to use. Construct the nvlist to describe this vdev. All 468 * vdevs have a 'path' element, and devices also have a 'devid' element. 469 */ 470 verify(nvlist_alloc(&vdev, NV_UNIQUE_NAME, 0) == 0); 471 verify(nvlist_add_string(vdev, ZPOOL_CONFIG_PATH, path) == 0); 472 verify(nvlist_add_string(vdev, ZPOOL_CONFIG_TYPE, type) == 0); 473 verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_IS_LOG, is_log) == 0); 474 if (strcmp(type, VDEV_TYPE_DISK) == 0) 475 verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK, 476 (uint64_t)wholedisk) == 0); 477 478 /* 479 * For a whole disk, defer getting its devid until after labeling it. 480 */ 481 if (S_ISBLK(statbuf.st_mode) && !wholedisk) { 482 /* 483 * Get the devid for the device. 
484 */ 485 int fd; 486 ddi_devid_t devid; 487 char *minor = NULL, *devid_str = NULL; 488 489 if ((fd = open(path, O_RDONLY)) < 0) { 490 (void) fprintf(stderr, gettext("cannot open '%s': " 491 "%s\n"), path, strerror(errno)); 492 nvlist_free(vdev); 493 return (NULL); 494 } 495 496 if (devid_get(fd, &devid) == 0) { 497 if (devid_get_minor_name(fd, &minor) == 0 && 498 (devid_str = devid_str_encode(devid, minor)) != 499 NULL) { 500 verify(nvlist_add_string(vdev, 501 ZPOOL_CONFIG_DEVID, devid_str) == 0); 502 } 503 if (devid_str != NULL) 504 devid_str_free(devid_str); 505 if (minor != NULL) 506 devid_str_free(minor); 507 devid_free(devid); 508 } 509 510 (void) close(fd); 511 } 512 513 return (vdev); 514 } 515 516 /* 517 * Go through and verify the replication level of the pool is consistent. 518 * Performs the following checks: 519 * 520 * For the new spec, verifies that devices in mirrors and raidz are the 521 * same size. 522 * 523 * If the current configuration already has inconsistent replication 524 * levels, ignore any other potential problems in the new spec. 525 * 526 * Otherwise, make sure that the current spec (if there is one) and the new 527 * spec have consistent replication levels. 528 */ 529 typedef struct replication_level { 530 char *zprl_type; 531 uint64_t zprl_children; 532 uint64_t zprl_parity; 533 } replication_level_t; 534 535 #define ZPOOL_FUZZ (16 * 1024 * 1024) 536 537 /* 538 * Given a list of toplevel vdevs, return the current replication level. If 539 * the config is inconsistent, then NULL is returned. If 'fatal' is set, then 540 * an error message will be displayed for each self-inconsistent vdev. 
541 */ 542 static replication_level_t * 543 get_replication(nvlist_t *nvroot, boolean_t fatal) 544 { 545 nvlist_t **top; 546 uint_t t, toplevels; 547 nvlist_t **child; 548 uint_t c, children; 549 nvlist_t *nv; 550 char *type; 551 replication_level_t lastrep, rep, *ret; 552 boolean_t dontreport; 553 554 ret = safe_malloc(sizeof (replication_level_t)); 555 556 verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 557 &top, &toplevels) == 0); 558 559 lastrep.zprl_type = NULL; 560 for (t = 0; t < toplevels; t++) { 561 uint64_t is_log = B_FALSE; 562 563 nv = top[t]; 564 565 /* 566 * For separate logs we ignore the top level vdev replication 567 * constraints. 568 */ 569 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &is_log); 570 if (is_log) 571 continue; 572 573 verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, 574 &type) == 0); 575 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 576 &child, &children) != 0) { 577 /* 578 * This is a 'file' or 'disk' vdev. 579 */ 580 rep.zprl_type = type; 581 rep.zprl_children = 1; 582 rep.zprl_parity = 0; 583 } else { 584 uint64_t vdev_size; 585 586 /* 587 * This is a mirror or RAID-Z vdev. Go through and make 588 * sure the contents are all the same (files vs. disks), 589 * keeping track of the number of elements in the 590 * process. 591 * 592 * We also check that the size of each vdev (if it can 593 * be determined) is the same. 594 */ 595 rep.zprl_type = type; 596 rep.zprl_children = 0; 597 598 if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) { 599 verify(nvlist_lookup_uint64(nv, 600 ZPOOL_CONFIG_NPARITY, 601 &rep.zprl_parity) == 0); 602 assert(rep.zprl_parity != 0); 603 } else { 604 rep.zprl_parity = 0; 605 } 606 607 /* 608 * The 'dontreport' variable indicates that we've 609 * already reported an error for this spec, so don't 610 * bother doing it again. 
611 */ 612 type = NULL; 613 dontreport = 0; 614 vdev_size = -1ULL; 615 for (c = 0; c < children; c++) { 616 nvlist_t *cnv = child[c]; 617 char *path; 618 struct stat64 statbuf; 619 uint64_t size = -1ULL; 620 char *childtype; 621 int fd, err; 622 623 rep.zprl_children++; 624 625 verify(nvlist_lookup_string(cnv, 626 ZPOOL_CONFIG_TYPE, &childtype) == 0); 627 628 /* 629 * If this is a replacing or spare vdev, then 630 * get the real first child of the vdev. 631 */ 632 if (strcmp(childtype, 633 VDEV_TYPE_REPLACING) == 0 || 634 strcmp(childtype, VDEV_TYPE_SPARE) == 0) { 635 nvlist_t **rchild; 636 uint_t rchildren; 637 638 verify(nvlist_lookup_nvlist_array(cnv, 639 ZPOOL_CONFIG_CHILDREN, &rchild, 640 &rchildren) == 0); 641 assert(rchildren == 2); 642 cnv = rchild[0]; 643 644 verify(nvlist_lookup_string(cnv, 645 ZPOOL_CONFIG_TYPE, 646 &childtype) == 0); 647 } 648 649 verify(nvlist_lookup_string(cnv, 650 ZPOOL_CONFIG_PATH, &path) == 0); 651 652 /* 653 * If we have a raidz/mirror that combines disks 654 * with files, report it as an error. 655 */ 656 if (!dontreport && type != NULL && 657 strcmp(type, childtype) != 0) { 658 if (ret != NULL) 659 free(ret); 660 ret = NULL; 661 if (fatal) 662 vdev_error(gettext( 663 "mismatched replication " 664 "level: %s contains both " 665 "files and devices\n"), 666 rep.zprl_type); 667 else 668 return (NULL); 669 dontreport = B_TRUE; 670 } 671 672 /* 673 * According to stat(2), the value of 'st_size' 674 * is undefined for block devices and character 675 * devices. But there is no effective way to 676 * determine the real size in userland. 677 * 678 * Instead, we'll take advantage of an 679 * implementation detail of spec_size(). If the 680 * device is currently open, then we (should) 681 * return a valid size. 682 * 683 * If we still don't get a valid size (indicated 684 * by a size of 0 or MAXOFFSET_T), then ignore 685 * this device altogether. 
686 */ 687 if ((fd = open(path, O_RDONLY)) >= 0) { 688 err = fstat64(fd, &statbuf); 689 (void) close(fd); 690 } else { 691 err = stat64(path, &statbuf); 692 } 693 694 if (err != 0 || 695 statbuf.st_size == 0 || 696 statbuf.st_size == MAXOFFSET_T) 697 continue; 698 699 size = statbuf.st_size; 700 701 /* 702 * Also make sure that devices and 703 * slices have a consistent size. If 704 * they differ by a significant amount 705 * (~16MB) then report an error. 706 */ 707 if (!dontreport && 708 (vdev_size != -1ULL && 709 (labs(size - vdev_size) > 710 ZPOOL_FUZZ))) { 711 if (ret != NULL) 712 free(ret); 713 ret = NULL; 714 if (fatal) 715 vdev_error(gettext( 716 "%s contains devices of " 717 "different sizes\n"), 718 rep.zprl_type); 719 else 720 return (NULL); 721 dontreport = B_TRUE; 722 } 723 724 type = childtype; 725 vdev_size = size; 726 } 727 } 728 729 /* 730 * At this point, we have the replication of the last toplevel 731 * vdev in 'rep'. Compare it to 'lastrep' to see if its 732 * different. 
733 */ 734 if (lastrep.zprl_type != NULL) { 735 if (strcmp(lastrep.zprl_type, rep.zprl_type) != 0) { 736 if (ret != NULL) 737 free(ret); 738 ret = NULL; 739 if (fatal) 740 vdev_error(gettext( 741 "mismatched replication level: " 742 "both %s and %s vdevs are " 743 "present\n"), 744 lastrep.zprl_type, rep.zprl_type); 745 else 746 return (NULL); 747 } else if (lastrep.zprl_parity != rep.zprl_parity) { 748 if (ret) 749 free(ret); 750 ret = NULL; 751 if (fatal) 752 vdev_error(gettext( 753 "mismatched replication level: " 754 "both %llu and %llu device parity " 755 "%s vdevs are present\n"), 756 lastrep.zprl_parity, 757 rep.zprl_parity, 758 rep.zprl_type); 759 else 760 return (NULL); 761 } else if (lastrep.zprl_children != rep.zprl_children) { 762 if (ret) 763 free(ret); 764 ret = NULL; 765 if (fatal) 766 vdev_error(gettext( 767 "mismatched replication level: " 768 "both %llu-way and %llu-way %s " 769 "vdevs are present\n"), 770 lastrep.zprl_children, 771 rep.zprl_children, 772 rep.zprl_type); 773 else 774 return (NULL); 775 } 776 } 777 lastrep = rep; 778 } 779 780 if (ret != NULL) 781 *ret = rep; 782 783 return (ret); 784 } 785 786 /* 787 * Check the replication level of the vdev spec against the current pool. Calls 788 * get_replication() to make sure the new spec is self-consistent. If the pool 789 * has a consistent replication level, then we ignore any errors. Otherwise, 790 * report any difference between the two. 791 */ 792 static int 793 check_replication(nvlist_t *config, nvlist_t *newroot) 794 { 795 nvlist_t **child; 796 uint_t children; 797 replication_level_t *current = NULL, *new; 798 int ret; 799 800 /* 801 * If we have a current pool configuration, check to see if it's 802 * self-consistent. If not, simply return success. 
803 */ 804 if (config != NULL) { 805 nvlist_t *nvroot; 806 807 verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 808 &nvroot) == 0); 809 if ((current = get_replication(nvroot, B_FALSE)) == NULL) 810 return (0); 811 } 812 /* 813 * for spares there may be no children, and therefore no 814 * replication level to check 815 */ 816 if ((nvlist_lookup_nvlist_array(newroot, ZPOOL_CONFIG_CHILDREN, 817 &child, &children) != 0) || (children == 0)) { 818 free(current); 819 return (0); 820 } 821 822 /* 823 * If all we have is logs then there's no replication level to check. 824 */ 825 if (num_logs(newroot) == children) { 826 free(current); 827 return (0); 828 } 829 830 /* 831 * Get the replication level of the new vdev spec, reporting any 832 * inconsistencies found. 833 */ 834 if ((new = get_replication(newroot, B_TRUE)) == NULL) { 835 free(current); 836 return (-1); 837 } 838 839 /* 840 * Check to see if the new vdev spec matches the replication level of 841 * the current pool. 842 */ 843 ret = 0; 844 if (current != NULL) { 845 if (strcmp(current->zprl_type, new->zprl_type) != 0) { 846 vdev_error(gettext( 847 "mismatched replication level: pool uses %s " 848 "and new vdev is %s\n"), 849 current->zprl_type, new->zprl_type); 850 ret = -1; 851 } else if (current->zprl_parity != new->zprl_parity) { 852 vdev_error(gettext( 853 "mismatched replication level: pool uses %llu " 854 "device parity and new vdev uses %llu\n"), 855 current->zprl_parity, new->zprl_parity); 856 ret = -1; 857 } else if (current->zprl_children != new->zprl_children) { 858 vdev_error(gettext( 859 "mismatched replication level: pool uses %llu-way " 860 "%s and new vdev uses %llu-way %s\n"), 861 current->zprl_children, current->zprl_type, 862 new->zprl_children, new->zprl_type); 863 ret = -1; 864 } 865 } 866 867 free(new); 868 if (current != NULL) 869 free(current); 870 871 return (ret); 872 } 873 874 /* 875 * Go through and find any whole disks in the vdev specification, labelling them 876 * as 
* appropriate. When constructing the vdev spec, we were unable to open this
 * device in order to provide a devid. Now that we have labelled the disk and
 * know that slice 0 is valid, we can construct the devid now.
 *
 * If the disk was already labeled with an EFI label, we will have gotten the
 * devid already (because we were able to open the whole disk). Otherwise, we
 * need to get the devid after we label the disk.
 *
 * Interior vdevs recurse into their children, then their spares, then their
 * l2cache devices, stopping at the first non-zero result. Returns 0 on
 * success and -1 if labeling or re-opening a disk fails.
 */
static int
make_disks(zpool_handle_t *zhp, nvlist_t *nv)
{
	const char *aux_lists[] = { ZPOOL_CONFIG_SPARES, ZPOOL_CONFIG_L2CACHE };
	nvlist_t **kids;
	uint_t i, a, nkids;
	char *vtype, *devpath, *leafname;
	char slicepath[MAXPATHLEN];
	uint64_t whole;
	int dfd;
	int rc;
	ddi_devid_t id;
	char *minor_name = NULL, *encoded = NULL;

	verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &vtype) == 0);

	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
	    &kids, &nkids) == 0) {
		/*
		 * Interior vdev: descend into the regular children first,
		 * then into each auxiliary list that is present.
		 */
		for (i = 0; i < nkids; i++) {
			rc = make_disks(zhp, kids[i]);
			if (rc != 0)
				return (rc);
		}

		for (a = 0; a < sizeof (aux_lists) / sizeof (aux_lists[0]);
		    a++) {
			if (nvlist_lookup_nvlist_array(nv, aux_lists[a],
			    &kids, &nkids) != 0)
				continue;
			for (i = 0; i < nkids; i++) {
				rc = make_disks(zhp, kids[i]);
				if (rc != 0)
					return (rc);
			}
		}

		return (0);
	}

	/* Leaf vdev: only disks flagged as whole disks need any work. */
	if (strcmp(vtype, VDEV_TYPE_DISK) != 0)
		return (0);

	/*
	 * We have a disk device. Get the path to the device and consult the
	 * 'whole_disk' property recorded when the leaf was built.
	 */
	verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &devpath) == 0);
	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, &whole) != 0)
		return (0);
	if (!whole)
		return (0);

	leafname = strrchr(devpath, '/');
	assert(leafname != NULL);
	leafname++;
	if (zpool_label_disk(g_zfs, zhp, leafname) == -1)
		return (-1);

	/*
	 * Fill in the devid, now that we've labeled the disk.
	 */
	(void) snprintf(slicepath, sizeof (slicepath), "%ss0", devpath);
	dfd = open(slicepath, O_RDONLY);
	if (dfd < 0) {
		(void) fprintf(stderr,
		    gettext("cannot open '%s': %s\n"),
		    slicepath, strerror(errno));
		return (-1);
	}

	if (devid_get(dfd, &id) == 0) {
		if (devid_get_minor_name(dfd, &minor_name) == 0) {
			encoded = devid_str_encode(id, minor_name);
			if (encoded != NULL) {
				verify(nvlist_add_string(nv,
				    ZPOOL_CONFIG_DEVID, encoded) == 0);
			}
		}
		if (encoded != NULL)
			devid_str_free(encoded);
		if (minor_name != NULL)
			devid_str_free(minor_name);
		devid_free(id);
	}

	/*
	 * Update the path to refer to the 's0' slice. The presence of
	 * the 'whole_disk' field indicates to the CLI that we should
	 * chop off the slice number when displaying the device in
	 * future output.
	 */
	verify(nvlist_add_string(nv, ZPOOL_CONFIG_PATH, slicepath) == 0);

	(void) close(dfd);

	return (0);
}

/*
 * Determine if the given path is a hot spare within the given configuration.
980 */ 981 static boolean_t 982 is_spare(nvlist_t *config, const char *path) 983 { 984 int fd; 985 pool_state_t state; 986 char *name = NULL; 987 nvlist_t *label; 988 uint64_t guid, spareguid; 989 nvlist_t *nvroot; 990 nvlist_t **spares; 991 uint_t i, nspares; 992 boolean_t inuse; 993 994 if ((fd = open(path, O_RDONLY)) < 0) 995 return (B_FALSE); 996 997 if (zpool_in_use(g_zfs, fd, &state, &name, &inuse) != 0 || 998 !inuse || 999 state != POOL_STATE_SPARE || 1000 zpool_read_label(fd, &label) != 0) { 1001 free(name); 1002 (void) close(fd); 1003 return (B_FALSE); 1004 } 1005 free(name); 1006 1007 (void) close(fd); 1008 verify(nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) == 0); 1009 nvlist_free(label); 1010 1011 verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 1012 &nvroot) == 0); 1013 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 1014 &spares, &nspares) == 0) { 1015 for (i = 0; i < nspares; i++) { 1016 verify(nvlist_lookup_uint64(spares[i], 1017 ZPOOL_CONFIG_GUID, &spareguid) == 0); 1018 if (spareguid == guid) 1019 return (B_TRUE); 1020 } 1021 } 1022 1023 return (B_FALSE); 1024 } 1025 1026 /* 1027 * Go through and find any devices that are in use. We rely on libdiskmgt for 1028 * the majority of this task. 1029 */ 1030 static int 1031 check_in_use(nvlist_t *config, nvlist_t *nv, int force, int isreplacing, 1032 int isspare) 1033 { 1034 nvlist_t **child; 1035 uint_t c, children; 1036 char *type, *path; 1037 int ret; 1038 char buf[MAXPATHLEN]; 1039 uint64_t wholedisk; 1040 1041 verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0); 1042 1043 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 1044 &child, &children) != 0) { 1045 1046 verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0); 1047 1048 /* 1049 * As a generic check, we look to see if this is a replace of a 1050 * hot spare within the same pool. If so, we allow it 1051 * regardless of what libdiskmgt or zpool_in_use() says. 
1052 */ 1053 if (isreplacing) { 1054 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, 1055 &wholedisk) == 0 && wholedisk) 1056 (void) snprintf(buf, sizeof (buf), "%ss0", 1057 path); 1058 else 1059 (void) strlcpy(buf, path, sizeof (buf)); 1060 if (is_spare(config, buf)) 1061 return (0); 1062 } 1063 1064 if (strcmp(type, VDEV_TYPE_DISK) == 0) 1065 ret = check_device(path, force, isspare); 1066 1067 if (strcmp(type, VDEV_TYPE_FILE) == 0) 1068 ret = check_file(path, force, isspare); 1069 1070 return (ret); 1071 } 1072 1073 for (c = 0; c < children; c++) 1074 if ((ret = check_in_use(config, child[c], force, 1075 isreplacing, B_FALSE)) != 0) 1076 return (ret); 1077 1078 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, 1079 &child, &children) == 0) 1080 for (c = 0; c < children; c++) 1081 if ((ret = check_in_use(config, child[c], force, 1082 isreplacing, B_TRUE)) != 0) 1083 return (ret); 1084 1085 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, 1086 &child, &children) == 0) 1087 for (c = 0; c < children; c++) 1088 if ((ret = check_in_use(config, child[c], force, 1089 isreplacing, B_FALSE)) != 0) 1090 return (ret); 1091 1092 return (0); 1093 } 1094 1095 static const char * 1096 is_grouping(const char *type, int *mindev) 1097 { 1098 if (strcmp(type, "raidz") == 0 || strcmp(type, "raidz1") == 0) { 1099 if (mindev != NULL) 1100 *mindev = 2; 1101 return (VDEV_TYPE_RAIDZ); 1102 } 1103 1104 if (strcmp(type, "raidz2") == 0) { 1105 if (mindev != NULL) 1106 *mindev = 3; 1107 return (VDEV_TYPE_RAIDZ); 1108 } 1109 1110 if (strcmp(type, "mirror") == 0) { 1111 if (mindev != NULL) 1112 *mindev = 2; 1113 return (VDEV_TYPE_MIRROR); 1114 } 1115 1116 if (strcmp(type, "spare") == 0) { 1117 if (mindev != NULL) 1118 *mindev = 1; 1119 return (VDEV_TYPE_SPARE); 1120 } 1121 1122 if (strcmp(type, "log") == 0) { 1123 if (mindev != NULL) 1124 *mindev = 1; 1125 return (VDEV_TYPE_LOG); 1126 } 1127 1128 if (strcmp(type, "cache") == 0) { 1129 if (mindev != NULL) 1130 *mindev = 1; 
1131 return (VDEV_TYPE_L2CACHE); 1132 } 1133 1134 return (NULL); 1135 } 1136 1137 /* 1138 * Construct a syntactically valid vdev specification, 1139 * and ensure that all devices and files exist and can be opened. 1140 * Note: we don't bother freeing anything in the error paths 1141 * because the program is just going to exit anyway. 1142 */ 1143 nvlist_t * 1144 construct_spec(int argc, char **argv) 1145 { 1146 nvlist_t *nvroot, *nv, **top, **spares, **l2cache; 1147 int t, toplevels, mindev, nspares, nlogs, nl2cache; 1148 const char *type; 1149 uint64_t is_log; 1150 boolean_t seen_logs; 1151 1152 top = NULL; 1153 toplevels = 0; 1154 spares = NULL; 1155 l2cache = NULL; 1156 nspares = 0; 1157 nlogs = 0; 1158 nl2cache = 0; 1159 is_log = B_FALSE; 1160 seen_logs = B_FALSE; 1161 1162 while (argc > 0) { 1163 nv = NULL; 1164 1165 /* 1166 * If it's a mirror or raidz, the subsequent arguments are 1167 * its leaves -- until we encounter the next mirror or raidz. 1168 */ 1169 if ((type = is_grouping(argv[0], &mindev)) != NULL) { 1170 nvlist_t **child = NULL; 1171 int c, children = 0; 1172 1173 if (strcmp(type, VDEV_TYPE_SPARE) == 0) { 1174 if (spares != NULL) { 1175 (void) fprintf(stderr, 1176 gettext("invalid vdev " 1177 "specification: 'spare' can be " 1178 "specified only once\n")); 1179 return (NULL); 1180 } 1181 is_log = B_FALSE; 1182 } 1183 1184 if (strcmp(type, VDEV_TYPE_LOG) == 0) { 1185 if (seen_logs) { 1186 (void) fprintf(stderr, 1187 gettext("invalid vdev " 1188 "specification: 'log' can be " 1189 "specified only once\n")); 1190 return (NULL); 1191 } 1192 seen_logs = B_TRUE; 1193 is_log = B_TRUE; 1194 argc--; 1195 argv++; 1196 /* 1197 * A log is not a real grouping device. 1198 * We just set is_log and continue. 
1199 */ 1200 continue; 1201 } 1202 1203 if (strcmp(type, VDEV_TYPE_L2CACHE) == 0) { 1204 if (l2cache != NULL) { 1205 (void) fprintf(stderr, 1206 gettext("invalid vdev " 1207 "specification: 'cache' can be " 1208 "specified only once\n")); 1209 return (NULL); 1210 } 1211 is_log = B_FALSE; 1212 } 1213 1214 if (is_log) { 1215 if (strcmp(type, VDEV_TYPE_MIRROR) != 0) { 1216 (void) fprintf(stderr, 1217 gettext("invalid vdev " 1218 "specification: unsupported 'log' " 1219 "device: %s\n"), type); 1220 return (NULL); 1221 } 1222 nlogs++; 1223 } 1224 1225 for (c = 1; c < argc; c++) { 1226 if (is_grouping(argv[c], NULL) != NULL) 1227 break; 1228 children++; 1229 child = realloc(child, 1230 children * sizeof (nvlist_t *)); 1231 if (child == NULL) 1232 zpool_no_memory(); 1233 if ((nv = make_leaf_vdev(argv[c], B_FALSE)) 1234 == NULL) 1235 return (NULL); 1236 child[children - 1] = nv; 1237 } 1238 1239 if (children < mindev) { 1240 (void) fprintf(stderr, gettext("invalid vdev " 1241 "specification: %s requires at least %d " 1242 "devices\n"), argv[0], mindev); 1243 return (NULL); 1244 } 1245 1246 argc -= c; 1247 argv += c; 1248 1249 if (strcmp(type, VDEV_TYPE_SPARE) == 0) { 1250 spares = child; 1251 nspares = children; 1252 continue; 1253 } else if (strcmp(type, VDEV_TYPE_L2CACHE) == 0) { 1254 l2cache = child; 1255 nl2cache = children; 1256 continue; 1257 } else { 1258 verify(nvlist_alloc(&nv, NV_UNIQUE_NAME, 1259 0) == 0); 1260 verify(nvlist_add_string(nv, ZPOOL_CONFIG_TYPE, 1261 type) == 0); 1262 verify(nvlist_add_uint64(nv, 1263 ZPOOL_CONFIG_IS_LOG, is_log) == 0); 1264 if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) { 1265 verify(nvlist_add_uint64(nv, 1266 ZPOOL_CONFIG_NPARITY, 1267 mindev - 1) == 0); 1268 } 1269 verify(nvlist_add_nvlist_array(nv, 1270 ZPOOL_CONFIG_CHILDREN, child, 1271 children) == 0); 1272 1273 for (c = 0; c < children; c++) 1274 nvlist_free(child[c]); 1275 free(child); 1276 } 1277 } else { 1278 /* 1279 * We have a device. 
Pass off to make_leaf_vdev() to 1280 * construct the appropriate nvlist describing the vdev. 1281 */ 1282 if ((nv = make_leaf_vdev(argv[0], is_log)) == NULL) 1283 return (NULL); 1284 if (is_log) 1285 nlogs++; 1286 argc--; 1287 argv++; 1288 } 1289 1290 toplevels++; 1291 top = realloc(top, toplevels * sizeof (nvlist_t *)); 1292 if (top == NULL) 1293 zpool_no_memory(); 1294 top[toplevels - 1] = nv; 1295 } 1296 1297 if (toplevels == 0 && nspares == 0 && nl2cache == 0) { 1298 (void) fprintf(stderr, gettext("invalid vdev " 1299 "specification: at least one toplevel vdev must be " 1300 "specified\n")); 1301 return (NULL); 1302 } 1303 1304 if (seen_logs && nlogs == 0) { 1305 (void) fprintf(stderr, gettext("invalid vdev specification: " 1306 "log requires at least 1 device\n")); 1307 return (NULL); 1308 } 1309 1310 /* 1311 * Finally, create nvroot and add all top-level vdevs to it. 1312 */ 1313 verify(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) == 0); 1314 verify(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, 1315 VDEV_TYPE_ROOT) == 0); 1316 verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 1317 top, toplevels) == 0); 1318 if (nspares != 0) 1319 verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 1320 spares, nspares) == 0); 1321 if (nl2cache != 0) 1322 verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 1323 l2cache, nl2cache) == 0); 1324 1325 for (t = 0; t < toplevels; t++) 1326 nvlist_free(top[t]); 1327 for (t = 0; t < nspares; t++) 1328 nvlist_free(spares[t]); 1329 for (t = 0; t < nl2cache; t++) 1330 nvlist_free(l2cache[t]); 1331 if (spares) 1332 free(spares); 1333 if (l2cache) 1334 free(l2cache); 1335 free(top); 1336 1337 return (nvroot); 1338 } 1339 1340 1341 /* 1342 * Get and validate the contents of the given vdev specification. This ensures 1343 * that the nvlist returned is well-formed, that all the devices exist, and that 1344 * they are not currently in use by any other known consumer. 
The 'poolconfig' 1345 * parameter is the current configuration of the pool when adding devices 1346 * existing pool, and is used to perform additional checks, such as changing the 1347 * replication level of the pool. It can be 'NULL' to indicate that this is a 1348 * new pool. The 'force' flag controls whether devices should be forcefully 1349 * added, even if they appear in use. 1350 */ 1351 nvlist_t * 1352 make_root_vdev(zpool_handle_t *zhp, int force, int check_rep, 1353 boolean_t isreplacing, boolean_t dryrun, int argc, char **argv) 1354 { 1355 nvlist_t *newroot; 1356 nvlist_t *poolconfig = NULL; 1357 is_force = force; 1358 1359 /* 1360 * Construct the vdev specification. If this is successful, we know 1361 * that we have a valid specification, and that all devices can be 1362 * opened. 1363 */ 1364 if ((newroot = construct_spec(argc, argv)) == NULL) 1365 return (NULL); 1366 1367 if (zhp && ((poolconfig = zpool_get_config(zhp, NULL)) == NULL)) 1368 return (NULL); 1369 1370 /* 1371 * Validate each device to make sure that its not shared with another 1372 * subsystem. We do this even if 'force' is set, because there are some 1373 * uses (such as a dedicated dump device) that even '-f' cannot 1374 * override. 1375 */ 1376 if (check_in_use(poolconfig, newroot, force, isreplacing, 1377 B_FALSE) != 0) { 1378 nvlist_free(newroot); 1379 return (NULL); 1380 } 1381 1382 /* 1383 * Check the replication level of the given vdevs and report any errors 1384 * found. We include the existing pool spec, if any, as we need to 1385 * catch changes against the existing replication level. 1386 */ 1387 if (check_rep && check_replication(poolconfig, newroot) != 0) { 1388 nvlist_free(newroot); 1389 return (NULL); 1390 } 1391 1392 /* 1393 * Run through the vdev specification and label any whole disks found. 1394 */ 1395 if (!dryrun && make_disks(zhp, newroot) != 0) { 1396 nvlist_free(newroot); 1397 return (NULL); 1398 } 1399 1400 return (newroot); 1401 } 1402