1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 /* 30 * Functions to convert between a list of vdevs and an nvlist representing the 31 * configuration. Each entry in the list can be one of: 32 * 33 * Device vdevs 34 * disk=(path=..., devid=...) 35 * file=(path=...) 36 * 37 * Group vdevs 38 * raidz[1|2]=(...) 39 * mirror=(...) 40 * 41 * Hot spares 42 * 43 * While the underlying implementation supports it, group vdevs cannot contain 44 * other group vdevs. All userland verification of devices is contained within 45 * this file. If successful, the nvlist returned can be passed directly to the 46 * kernel; we've done as much verification as possible in userland. 47 * 48 * Hot spares are a special case, and passed down as an array of disk vdevs, at 49 * the same level as the root of the vdev tree. 50 * 51 * The only function exported by this file is 'get_vdev_spec'. The function 52 * performs several passes: 53 * 54 * 1. Construct the vdev specification. 
Performs syntax validation and 55 * makes sure each device is valid. 56 * 2. Check for devices in use. Using libdiskmgt, makes sure that no 57 * devices are also in use. Some can be overridden using the 'force' 58 * flag, others cannot. 59 * 3. Check for replication errors if the 'force' flag is not specified. 60 * validates that the replication level is consistent across the 61 * entire pool. 62 * 4. Label any whole disks with an EFI label. 63 */ 64 65 #include <assert.h> 66 #include <devid.h> 67 #include <errno.h> 68 #include <fcntl.h> 69 #include <libdiskmgt.h> 70 #include <libintl.h> 71 #include <libnvpair.h> 72 #include <stdio.h> 73 #include <string.h> 74 #include <unistd.h> 75 #include <sys/efi_partition.h> 76 #include <sys/stat.h> 77 #include <sys/vtoc.h> 78 #include <sys/mntent.h> 79 80 #include <libzfs.h> 81 82 #include "zpool_util.h" 83 84 #define DISK_ROOT "/dev/dsk" 85 #define RDISK_ROOT "/dev/rdsk" 86 #define BACKUP_SLICE "s2" 87 88 /* 89 * For any given vdev specification, we can have multiple errors. The 90 * vdev_error() function keeps track of whether we have seen an error yet, and 91 * prints out a header if its the first error we've seen. 92 */ 93 boolean_t error_seen; 94 boolean_t is_force; 95 96 /*PRINTFLIKE1*/ 97 static void 98 vdev_error(const char *fmt, ...) 99 { 100 va_list ap; 101 102 if (!error_seen) { 103 (void) fprintf(stderr, gettext("invalid vdev specification\n")); 104 if (!is_force) 105 (void) fprintf(stderr, gettext("use '-f' to override " 106 "the following errors:\n")); 107 else 108 (void) fprintf(stderr, gettext("the following errors " 109 "must be manually repaired:\n")); 110 error_seen = B_TRUE; 111 } 112 113 va_start(ap, fmt); 114 (void) vfprintf(stderr, fmt, ap); 115 va_end(ap); 116 } 117 118 static void 119 libdiskmgt_error(int error) 120 { 121 /* 122 * ENXIO/ENODEV is a valid error message if the device doesn't live in 123 * /dev/dsk. Don't bother printing an error message in this case. 
124 */ 125 if (error == ENXIO || error == ENODEV) 126 return; 127 128 (void) fprintf(stderr, gettext("warning: device in use checking " 129 "failed: %s\n"), strerror(error)); 130 } 131 132 /* 133 * Validate a device, passing the bulk of the work off to libdiskmgt. 134 */ 135 int 136 check_slice(const char *path, int force, boolean_t wholedisk, boolean_t isspare) 137 { 138 char *msg; 139 int error = 0; 140 141 if (dm_inuse((char *)path, &msg, isspare ? DM_WHO_ZPOOL_SPARE : 142 (force ? DM_WHO_ZPOOL_FORCE : DM_WHO_ZPOOL), &error) || error) { 143 if (error != 0) { 144 libdiskmgt_error(error); 145 return (0); 146 } else { 147 vdev_error("%s", msg); 148 free(msg); 149 return (-1); 150 } 151 } 152 153 /* 154 * If we're given a whole disk, ignore overlapping slices since we're 155 * about to label it anyway. 156 */ 157 error = 0; 158 if (!wholedisk && !force && 159 (dm_isoverlapping((char *)path, &msg, &error) || error)) { 160 if (error == 0) { 161 /* dm_isoverlapping returned -1 */ 162 vdev_error(gettext("%s overlaps with %s\n"), path, msg); 163 free(msg); 164 return (-1); 165 } else if (error != ENODEV) { 166 /* libdiskmgt's devcache only handles physical drives */ 167 libdiskmgt_error(error); 168 return (0); 169 } 170 } 171 172 return (0); 173 } 174 175 /* 176 * Validate a whole disk. Iterate over all slices on the disk and make sure 177 * that none is in use by calling check_slice(). 178 */ 179 /* ARGSUSED */ 180 int 181 check_disk(const char *name, dm_descriptor_t disk, int force, int isspare) 182 { 183 dm_descriptor_t *drive, *media, *slice; 184 int err = 0; 185 int i; 186 int ret; 187 188 /* 189 * Get the drive associated with this disk. This should never fail, 190 * because we already have an alias handle open for the device. 
191 */ 192 if ((drive = dm_get_associated_descriptors(disk, DM_DRIVE, 193 &err)) == NULL || *drive == NULL) { 194 if (err) 195 libdiskmgt_error(err); 196 return (0); 197 } 198 199 if ((media = dm_get_associated_descriptors(*drive, DM_MEDIA, 200 &err)) == NULL) { 201 dm_free_descriptors(drive); 202 if (err) 203 libdiskmgt_error(err); 204 return (0); 205 } 206 207 dm_free_descriptors(drive); 208 209 /* 210 * It is possible that the user has specified a removable media drive, 211 * and the media is not present. 212 */ 213 if (*media == NULL) { 214 dm_free_descriptors(media); 215 vdev_error(gettext("'%s' has no media in drive\n"), name); 216 return (-1); 217 } 218 219 if ((slice = dm_get_associated_descriptors(*media, DM_SLICE, 220 &err)) == NULL) { 221 dm_free_descriptors(media); 222 if (err) 223 libdiskmgt_error(err); 224 return (0); 225 } 226 227 dm_free_descriptors(media); 228 229 ret = 0; 230 231 /* 232 * Iterate over all slices and report any errors. We don't care about 233 * overlapping slices because we are using the whole disk. 234 */ 235 for (i = 0; slice[i] != NULL; i++) { 236 char *name = dm_get_name(slice[i], &err); 237 238 if (check_slice(name, force, B_TRUE, isspare) != 0) 239 ret = -1; 240 241 dm_free_name(name); 242 } 243 244 dm_free_descriptors(slice); 245 return (ret); 246 } 247 248 /* 249 * Validate a device. 250 */ 251 int 252 check_device(const char *path, boolean_t force, boolean_t isspare) 253 { 254 dm_descriptor_t desc; 255 int err; 256 char *dev; 257 258 /* 259 * For whole disks, libdiskmgt does not include the leading dev path. 260 */ 261 dev = strrchr(path, '/'); 262 assert(dev != NULL); 263 dev++; 264 if ((desc = dm_get_descriptor_by_name(DM_ALIAS, dev, &err)) != NULL) { 265 err = check_disk(path, desc, force, isspare); 266 dm_free_descriptor(desc); 267 return (err); 268 } 269 270 return (check_slice(path, force, B_FALSE, isspare)); 271 } 272 273 /* 274 * Check that a file is valid. 
All we can do in this case is check that it's 275 * not in use by another pool, and not in use by swap. 276 */ 277 int 278 check_file(const char *file, boolean_t force, boolean_t isspare) 279 { 280 char *name; 281 int fd; 282 int ret = 0; 283 int err; 284 pool_state_t state; 285 boolean_t inuse; 286 287 if (dm_inuse_swap(file, &err)) { 288 if (err) 289 libdiskmgt_error(err); 290 else 291 vdev_error(gettext("%s is currently used by swap. " 292 "Please see swap(1M).\n"), file); 293 return (-1); 294 } 295 296 if ((fd = open(file, O_RDONLY)) < 0) 297 return (0); 298 299 if (zpool_in_use(g_zfs, fd, &state, &name, &inuse) == 0 && inuse) { 300 const char *desc; 301 302 switch (state) { 303 case POOL_STATE_ACTIVE: 304 desc = gettext("active"); 305 break; 306 307 case POOL_STATE_EXPORTED: 308 desc = gettext("exported"); 309 break; 310 311 case POOL_STATE_POTENTIALLY_ACTIVE: 312 desc = gettext("potentially active"); 313 break; 314 315 default: 316 desc = gettext("unknown"); 317 break; 318 } 319 320 /* 321 * Allow hot spares to be shared between pools. 322 */ 323 if (state == POOL_STATE_SPARE && isspare) 324 return (0); 325 326 if (state == POOL_STATE_ACTIVE || 327 state == POOL_STATE_SPARE || !force) { 328 switch (state) { 329 case POOL_STATE_SPARE: 330 vdev_error(gettext("%s is reserved as a hot " 331 "spare for pool %s\n"), file, name); 332 break; 333 default: 334 vdev_error(gettext("%s is part of %s pool " 335 "'%s'\n"), file, desc, name); 336 break; 337 } 338 ret = -1; 339 } 340 341 free(name); 342 } 343 344 (void) close(fd); 345 return (ret); 346 } 347 348 static boolean_t 349 is_whole_disk(const char *arg, struct stat64 *statbuf) 350 { 351 char path[MAXPATHLEN]; 352 353 (void) snprintf(path, sizeof (path), "%s%s", arg, BACKUP_SLICE); 354 if (stat64(path, statbuf) == 0) 355 return (B_TRUE); 356 357 return (B_FALSE); 358 } 359 360 /* 361 * Create a leaf vdev. Determine if this is a file or a device. If it's a 362 * device, fill in the device id to make a complete nvlist. 
Valid forms for a 363 * leaf vdev are: 364 * 365 * /dev/dsk/xxx Complete disk path 366 * /xxx Full path to file 367 * xxx Shorthand for /dev/dsk/xxx 368 */ 369 nvlist_t * 370 make_leaf_vdev(const char *arg) 371 { 372 char path[MAXPATHLEN]; 373 struct stat64 statbuf; 374 nvlist_t *vdev = NULL; 375 char *type = NULL; 376 boolean_t wholedisk = B_FALSE; 377 378 /* 379 * Determine what type of vdev this is, and put the full path into 380 * 'path'. We detect whether this is a device of file afterwards by 381 * checking the st_mode of the file. 382 */ 383 if (arg[0] == '/') { 384 /* 385 * Complete device or file path. Exact type is determined by 386 * examining the file descriptor afterwards. 387 */ 388 if (is_whole_disk(arg, &statbuf)) { 389 wholedisk = B_TRUE; 390 } else if (stat64(arg, &statbuf) != 0) { 391 (void) fprintf(stderr, 392 gettext("cannot open '%s': %s\n"), 393 arg, strerror(errno)); 394 return (NULL); 395 } 396 397 (void) strlcpy(path, arg, sizeof (path)); 398 } else { 399 /* 400 * This may be a short path for a device, or it could be total 401 * gibberish. Check to see if it's a known device in 402 * /dev/dsk/. As part of this check, see if we've been given a 403 * an entire disk (minus the slice number). 404 */ 405 (void) snprintf(path, sizeof (path), "%s/%s", DISK_ROOT, 406 arg); 407 if (is_whole_disk(path, &statbuf)) { 408 wholedisk = B_TRUE; 409 } else if (stat64(path, &statbuf) != 0) { 410 /* 411 * If we got ENOENT, then the user gave us 412 * gibberish, so try to direct them with a 413 * reasonable error message. Otherwise, 414 * regurgitate strerror() since it's the best we 415 * can do. 
416 */ 417 if (errno == ENOENT) { 418 (void) fprintf(stderr, 419 gettext("cannot open '%s': no such " 420 "device in %s\n"), arg, DISK_ROOT); 421 (void) fprintf(stderr, 422 gettext("must be a full path or " 423 "shorthand device name\n")); 424 return (NULL); 425 } else { 426 (void) fprintf(stderr, 427 gettext("cannot open '%s': %s\n"), 428 path, strerror(errno)); 429 return (NULL); 430 } 431 } 432 } 433 434 /* 435 * Determine whether this is a device or a file. 436 */ 437 if (S_ISBLK(statbuf.st_mode)) { 438 type = VDEV_TYPE_DISK; 439 } else if (S_ISREG(statbuf.st_mode)) { 440 type = VDEV_TYPE_FILE; 441 } else { 442 (void) fprintf(stderr, gettext("cannot use '%s': must be a " 443 "block device or regular file\n"), path); 444 return (NULL); 445 } 446 447 /* 448 * Finally, we have the complete device or file, and we know that it is 449 * acceptable to use. Construct the nvlist to describe this vdev. All 450 * vdevs have a 'path' element, and devices also have a 'devid' element. 451 */ 452 verify(nvlist_alloc(&vdev, NV_UNIQUE_NAME, 0) == 0); 453 verify(nvlist_add_string(vdev, ZPOOL_CONFIG_PATH, path) == 0); 454 verify(nvlist_add_string(vdev, ZPOOL_CONFIG_TYPE, type) == 0); 455 if (strcmp(type, VDEV_TYPE_DISK) == 0) 456 verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK, 457 (uint64_t)wholedisk) == 0); 458 459 /* 460 * For a whole disk, defer getting its devid until after labeling it. 461 */ 462 if (S_ISBLK(statbuf.st_mode) && !wholedisk) { 463 /* 464 * Get the devid for the device. 
465 */ 466 int fd; 467 ddi_devid_t devid; 468 char *minor = NULL, *devid_str = NULL; 469 470 if ((fd = open(path, O_RDONLY)) < 0) { 471 (void) fprintf(stderr, gettext("cannot open '%s': " 472 "%s\n"), path, strerror(errno)); 473 nvlist_free(vdev); 474 return (NULL); 475 } 476 477 if (devid_get(fd, &devid) == 0) { 478 if (devid_get_minor_name(fd, &minor) == 0 && 479 (devid_str = devid_str_encode(devid, minor)) != 480 NULL) { 481 verify(nvlist_add_string(vdev, 482 ZPOOL_CONFIG_DEVID, devid_str) == 0); 483 } 484 if (devid_str != NULL) 485 devid_str_free(devid_str); 486 if (minor != NULL) 487 devid_str_free(minor); 488 devid_free(devid); 489 } 490 491 (void) close(fd); 492 } 493 494 return (vdev); 495 } 496 497 /* 498 * Go through and verify the replication level of the pool is consistent. 499 * Performs the following checks: 500 * 501 * For the new spec, verifies that devices in mirrors and raidz are the 502 * same size. 503 * 504 * If the current configuration already has inconsistent replication 505 * levels, ignore any other potential problems in the new spec. 506 * 507 * Otherwise, make sure that the current spec (if there is one) and the new 508 * spec have consistent replication levels. 509 */ 510 typedef struct replication_level { 511 char *zprl_type; 512 uint64_t zprl_children; 513 uint64_t zprl_parity; 514 } replication_level_t; 515 516 /* 517 * Given a list of toplevel vdevs, return the current replication level. If 518 * the config is inconsistent, then NULL is returned. If 'fatal' is set, then 519 * an error message will be displayed for each self-inconsistent vdev. 
520 */ 521 replication_level_t * 522 get_replication(nvlist_t *nvroot, boolean_t fatal) 523 { 524 nvlist_t **top; 525 uint_t t, toplevels; 526 nvlist_t **child; 527 uint_t c, children; 528 nvlist_t *nv; 529 char *type; 530 replication_level_t lastrep, rep, *ret; 531 boolean_t dontreport; 532 533 ret = safe_malloc(sizeof (replication_level_t)); 534 535 verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 536 &top, &toplevels) == 0); 537 538 lastrep.zprl_type = NULL; 539 for (t = 0; t < toplevels; t++) { 540 nv = top[t]; 541 542 verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0); 543 544 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 545 &child, &children) != 0) { 546 /* 547 * This is a 'file' or 'disk' vdev. 548 */ 549 rep.zprl_type = type; 550 rep.zprl_children = 1; 551 rep.zprl_parity = 0; 552 } else { 553 uint64_t vdev_size; 554 555 /* 556 * This is a mirror or RAID-Z vdev. Go through and make 557 * sure the contents are all the same (files vs. disks), 558 * keeping track of the number of elements in the 559 * process. 560 * 561 * We also check that the size of each vdev (if it can 562 * be determined) is the same. 563 */ 564 rep.zprl_type = type; 565 rep.zprl_children = 0; 566 567 if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) { 568 verify(nvlist_lookup_uint64(nv, 569 ZPOOL_CONFIG_NPARITY, 570 &rep.zprl_parity) == 0); 571 assert(rep.zprl_parity != 0); 572 } else { 573 rep.zprl_parity = 0; 574 } 575 576 /* 577 * The 'dontreport' variable indicatest that we've 578 * already reported an error for this spec, so don't 579 * bother doing it again. 
580 */ 581 type = NULL; 582 dontreport = 0; 583 vdev_size = -1ULL; 584 for (c = 0; c < children; c++) { 585 nvlist_t *cnv = child[c]; 586 char *path; 587 struct stat64 statbuf; 588 uint64_t size = -1ULL; 589 char *childtype; 590 int fd, err; 591 592 rep.zprl_children++; 593 594 verify(nvlist_lookup_string(cnv, 595 ZPOOL_CONFIG_TYPE, &childtype) == 0); 596 597 /* 598 * If this is a a replacing or spare vdev, then 599 * get the real first child of the vdev. 600 */ 601 if (strcmp(childtype, 602 VDEV_TYPE_REPLACING) == 0 || 603 strcmp(childtype, VDEV_TYPE_SPARE) == 0) { 604 nvlist_t **rchild; 605 uint_t rchildren; 606 607 verify(nvlist_lookup_nvlist_array(cnv, 608 ZPOOL_CONFIG_CHILDREN, &rchild, 609 &rchildren) == 0); 610 assert(rchildren == 2); 611 cnv = rchild[0]; 612 613 verify(nvlist_lookup_string(cnv, 614 ZPOOL_CONFIG_TYPE, 615 &childtype) == 0); 616 } 617 618 verify(nvlist_lookup_string(cnv, 619 ZPOOL_CONFIG_PATH, &path) == 0); 620 621 /* 622 * If we have a raidz/mirror that combines disks 623 * with files, report it as an error. 624 */ 625 if (!dontreport && type != NULL && 626 strcmp(type, childtype) != 0) { 627 if (ret != NULL) 628 free(ret); 629 ret = NULL; 630 if (fatal) 631 vdev_error(gettext( 632 "mismatched replication " 633 "level: %s contains both " 634 "files and devices\n"), 635 rep.zprl_type); 636 else 637 return (NULL); 638 dontreport = B_TRUE; 639 } 640 641 /* 642 * According to stat(2), the value of 'st_size' 643 * is undefined for block devices and character 644 * devices. But there is no effective way to 645 * determine the real size in userland. 646 * 647 * Instead, we'll take advantage of an 648 * implementation detail of spec_size(). If the 649 * device is currently open, then we (should) 650 * return a valid size. 651 * 652 * If we still don't get a valid size (indicated 653 * by a size of 0 or MAXOFFSET_T), then ignore 654 * this device altogether. 
655 */ 656 if ((fd = open(path, O_RDONLY)) >= 0) { 657 err = fstat64(fd, &statbuf); 658 (void) close(fd); 659 } else { 660 err = stat64(path, &statbuf); 661 } 662 663 if (err != 0 || 664 statbuf.st_size == 0 || 665 statbuf.st_size == MAXOFFSET_T) 666 continue; 667 668 size = statbuf.st_size; 669 670 /* 671 * Also check the size of each device. If they 672 * differ, then report an error. 673 */ 674 if (!dontreport && vdev_size != -1ULL && 675 size != vdev_size) { 676 if (ret != NULL) 677 free(ret); 678 ret = NULL; 679 if (fatal) 680 vdev_error(gettext( 681 "%s contains devices of " 682 "different sizes\n"), 683 rep.zprl_type); 684 else 685 return (NULL); 686 dontreport = B_TRUE; 687 } 688 689 type = childtype; 690 vdev_size = size; 691 } 692 } 693 694 /* 695 * At this point, we have the replication of the last toplevel 696 * vdev in 'rep'. Compare it to 'lastrep' to see if its 697 * different. 698 */ 699 if (lastrep.zprl_type != NULL) { 700 if (strcmp(lastrep.zprl_type, rep.zprl_type) != 0) { 701 if (ret != NULL) 702 free(ret); 703 ret = NULL; 704 if (fatal) 705 vdev_error(gettext( 706 "mismatched replication level: " 707 "both %s and %s vdevs are " 708 "present\n"), 709 lastrep.zprl_type, rep.zprl_type); 710 else 711 return (NULL); 712 } else if (lastrep.zprl_parity != rep.zprl_parity) { 713 if (ret) 714 free(ret); 715 ret = NULL; 716 if (fatal) 717 vdev_error(gettext( 718 "mismatched replication level: " 719 "both %llu and %llu device parity " 720 "%s vdevs are present\n"), 721 lastrep.zprl_parity, 722 rep.zprl_parity, 723 rep.zprl_type); 724 else 725 return (NULL); 726 } else if (lastrep.zprl_children != rep.zprl_children) { 727 if (ret) 728 free(ret); 729 ret = NULL; 730 if (fatal) 731 vdev_error(gettext( 732 "mismatched replication level: " 733 "both %llu-way and %llu-way %s " 734 "vdevs are present\n"), 735 lastrep.zprl_children, 736 rep.zprl_children, 737 rep.zprl_type); 738 else 739 return (NULL); 740 } 741 } 742 lastrep = rep; 743 } 744 745 if (ret != NULL) 
746 *ret = rep; 747 748 return (ret); 749 } 750 751 /* 752 * Check the replication level of the vdev spec against the current pool. Calls 753 * get_replication() to make sure the new spec is self-consistent. If the pool 754 * has a consistent replication level, then we ignore any errors. Otherwise, 755 * report any difference between the two. 756 */ 757 int 758 check_replication(nvlist_t *config, nvlist_t *newroot) 759 { 760 replication_level_t *current = NULL, *new; 761 int ret; 762 763 /* 764 * If we have a current pool configuration, check to see if it's 765 * self-consistent. If not, simply return success. 766 */ 767 if (config != NULL) { 768 nvlist_t *nvroot; 769 770 verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 771 &nvroot) == 0); 772 if ((current = get_replication(nvroot, B_FALSE)) == NULL) 773 return (0); 774 } 775 776 /* 777 * Get the replication level of the new vdev spec, reporting any 778 * inconsistencies found. 779 */ 780 if ((new = get_replication(newroot, B_TRUE)) == NULL) { 781 free(current); 782 return (-1); 783 } 784 785 /* 786 * Check to see if the new vdev spec matches the replication level of 787 * the current pool. 
788 */ 789 ret = 0; 790 if (current != NULL) { 791 if (strcmp(current->zprl_type, new->zprl_type) != 0) { 792 vdev_error(gettext( 793 "mismatched replication level: pool uses %s " 794 "and new vdev is %s\n"), 795 current->zprl_type, new->zprl_type); 796 ret = -1; 797 } else if (current->zprl_parity != new->zprl_parity) { 798 vdev_error(gettext( 799 "mismatched replication level: pool uses %llu " 800 "device parity and new vdev uses %llu\n"), 801 current->zprl_parity, new->zprl_parity); 802 ret = -1; 803 } else if (current->zprl_children != new->zprl_children) { 804 vdev_error(gettext( 805 "mismatched replication level: pool uses %llu-way " 806 "%s and new vdev uses %llu-way %s\n"), 807 current->zprl_children, current->zprl_type, 808 new->zprl_children, new->zprl_type); 809 ret = -1; 810 } 811 } 812 813 free(new); 814 if (current != NULL) 815 free(current); 816 817 return (ret); 818 } 819 820 /* 821 * Label an individual disk. The name provided is the short name, stripped of 822 * any leading /dev path. 823 */ 824 int 825 label_disk(char *name) 826 { 827 char path[MAXPATHLEN]; 828 struct dk_gpt *vtoc; 829 int fd; 830 size_t resv = 16384; 831 832 (void) snprintf(path, sizeof (path), "%s/%s%s", RDISK_ROOT, name, 833 BACKUP_SLICE); 834 835 if ((fd = open(path, O_RDWR | O_NDELAY)) < 0) { 836 /* 837 * This shouldn't happen. We've long since verified that this 838 * is a valid device. 839 */ 840 (void) fprintf(stderr, gettext("cannot open '%s': %s\n"), 841 path, strerror(errno)); 842 return (-1); 843 } 844 845 846 if (efi_alloc_and_init(fd, 9, &vtoc) != 0) { 847 /* 848 * The only way this can fail is if we run out of memory, or we 849 * were unable to read the disk geometry. 
850 */ 851 if (errno == ENOMEM) 852 zpool_no_memory(); 853 854 (void) fprintf(stderr, gettext("cannot label '%s': unable to " 855 "read disk geometry\n"), name); 856 (void) close(fd); 857 return (-1); 858 } 859 860 vtoc->efi_parts[0].p_start = vtoc->efi_first_u_lba; 861 vtoc->efi_parts[0].p_size = vtoc->efi_last_u_lba + 1 - 862 vtoc->efi_first_u_lba - resv; 863 864 /* 865 * Why we use V_USR: V_BACKUP confuses users, and is considered 866 * disposable by some EFI utilities (since EFI doesn't have a backup 867 * slice). V_UNASSIGNED is supposed to be used only for zero size 868 * partitions, and efi_write() will fail if we use it. V_ROOT, V_BOOT, 869 * etc. were all pretty specific. V_USR is as close to reality as we 870 * can get, in the absence of V_OTHER. 871 */ 872 vtoc->efi_parts[0].p_tag = V_USR; 873 (void) strcpy(vtoc->efi_parts[0].p_name, "zfs"); 874 875 vtoc->efi_parts[8].p_start = vtoc->efi_last_u_lba + 1 - resv; 876 vtoc->efi_parts[8].p_size = resv; 877 vtoc->efi_parts[8].p_tag = V_RESERVED; 878 879 if (efi_write(fd, vtoc) != 0) { 880 /* 881 * Currently, EFI labels are not supported for IDE disks, and it 882 * is likely that they will not be supported on other drives for 883 * some time. Print out a helpful error message directing the 884 * user to manually label the disk and give a specific slice. 885 */ 886 (void) fprintf(stderr, gettext("cannot label '%s': failed to " 887 "write EFI label\n"), name); 888 (void) fprintf(stderr, gettext("use fdisk(1M) to partition " 889 "the disk, and provide a specific slice\n")); 890 (void) close(fd); 891 efi_free(vtoc); 892 return (-1); 893 } 894 895 (void) close(fd); 896 efi_free(vtoc); 897 return (0); 898 } 899 900 /* 901 * Go through and find any whole disks in the vdev specification, labelling them 902 * as appropriate. When constructing the vdev spec, we were unable to open this 903 * device in order to provide a devid. 
Now that we have labelled the disk and
 * know that slice 0 is valid, we can construct the devid now.
 *
 * If the disk was already labelled with an EFI label, we will have gotten the
 * devid already (because we were able to open the whole disk).  Otherwise, we
 * need to get the devid after we label the disk.
 *
 * Returns 0 on success, or the first non-zero result from labelling or
 * opening a disk anywhere below 'nv'.
 */
int
make_disks(nvlist_t *nv)
{
	nvlist_t **kids;
	uint_t i, nkids;
	char *vtype, *devpath, *shortname;
	char slicepath[MAXPATHLEN];
	uint64_t whole;
	ddi_devid_t devid;
	char *minor = NULL;
	int fd;
	int rc;

	verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &vtype) == 0);

	/*
	 * Interior vdev: visit every child, then every hot spare (if any),
	 * stopping at the first failure.
	 */
	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
	    &kids, &nkids) == 0) {
		for (i = 0; i < nkids; i++) {
			rc = make_disks(kids[i]);
			if (rc != 0)
				return (rc);
		}

		if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES,
		    &kids, &nkids) == 0) {
			for (i = 0; i < nkids; i++) {
				rc = make_disks(kids[i]);
				if (rc != 0)
					return (rc);
			}
		}

		return (0);
	}

	/*
	 * Leaf vdev.  Only disks flagged as whole disks need any work.
	 */
	if (strcmp(vtype, VDEV_TYPE_DISK) != 0)
		return (0);

	verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &devpath) == 0);

	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, &whole) != 0)
		return (0);
	if (!whole)
		return (0);

	/* label_disk() wants the name without its leading directory. */
	shortname = strrchr(devpath, '/');
	assert(shortname != NULL);
	shortname++;
	if (label_disk(shortname) != 0)
		return (-1);

	/*
	 * Fill in the devid, now that we've labeled the disk.
	 */
	(void) snprintf(slicepath, sizeof (slicepath), "%ss0", devpath);
	fd = open(slicepath, O_RDONLY);
	if (fd < 0) {
		(void) fprintf(stderr,
		    gettext("cannot open '%s': %s\n"),
		    slicepath, strerror(errno));
		return (-1);
	}

	if (devid_get(fd, &devid) == 0) {
		if (devid_get_minor_name(fd, &minor) == 0) {
			char *encoded = devid_str_encode(devid, minor);

			if (encoded != NULL) {
				verify(nvlist_add_string(nv,
				    ZPOOL_CONFIG_DEVID, encoded) == 0);
				devid_str_free(encoded);
			}
		}
		if (minor != NULL)
			devid_str_free(minor);
		devid_free(devid);
	}

	(void) close(fd);

	/*
	 * Update the path to refer to the 's0' slice.  The presence of
	 * the 'whole_disk' field indicates to the CLI that we should
	 * chop off the slice number when displaying the device in
	 * future output.
	 */
	verify(nvlist_add_string(nv, ZPOOL_CONFIG_PATH, slicepath) == 0);

	return (0);
}

/*
 * Determine if the given path is a hot spare within the given configuration.
1001 */ 1002 static boolean_t 1003 is_spare(nvlist_t *config, const char *path) 1004 { 1005 int fd; 1006 pool_state_t state; 1007 char *name = NULL; 1008 nvlist_t *label; 1009 uint64_t guid, spareguid; 1010 nvlist_t *nvroot; 1011 nvlist_t **spares; 1012 uint_t i, nspares; 1013 boolean_t inuse; 1014 1015 if ((fd = open(path, O_RDONLY)) < 0) 1016 return (B_FALSE); 1017 1018 if (zpool_in_use(g_zfs, fd, &state, &name, &inuse) != 0 || 1019 !inuse || 1020 state != POOL_STATE_SPARE || 1021 zpool_read_label(fd, &label) != 0) { 1022 free(name); 1023 (void) close(fd); 1024 return (B_FALSE); 1025 } 1026 free(name); 1027 1028 (void) close(fd); 1029 verify(nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) == 0); 1030 nvlist_free(label); 1031 1032 verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 1033 &nvroot) == 0); 1034 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 1035 &spares, &nspares) == 0) { 1036 for (i = 0; i < nspares; i++) { 1037 verify(nvlist_lookup_uint64(spares[i], 1038 ZPOOL_CONFIG_GUID, &spareguid) == 0); 1039 if (spareguid == guid) 1040 return (B_TRUE); 1041 } 1042 } 1043 1044 return (B_FALSE); 1045 } 1046 1047 /* 1048 * Go through and find any devices that are in use. We rely on libdiskmgt for 1049 * the majority of this task. 1050 */ 1051 int 1052 check_in_use(nvlist_t *config, nvlist_t *nv, int force, int isreplacing, 1053 int isspare) 1054 { 1055 nvlist_t **child; 1056 uint_t c, children; 1057 char *type, *path; 1058 int ret; 1059 char buf[MAXPATHLEN]; 1060 uint64_t wholedisk; 1061 1062 verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0); 1063 1064 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 1065 &child, &children) != 0) { 1066 1067 verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0); 1068 1069 /* 1070 * As a generic check, we look to see if this is a replace of a 1071 * hot spare within the same pool. If so, we allow it 1072 * regardless of what libdiskmgt or zpool_in_use() says. 
1073 */ 1074 if (isreplacing) { 1075 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, 1076 &wholedisk) == 0 && wholedisk) 1077 (void) snprintf(buf, sizeof (buf), "%ss0", 1078 path); 1079 else 1080 (void) strlcpy(buf, path, sizeof (buf)); 1081 if (is_spare(config, buf)) 1082 return (0); 1083 } 1084 1085 if (strcmp(type, VDEV_TYPE_DISK) == 0) 1086 ret = check_device(path, force, isspare); 1087 1088 if (strcmp(type, VDEV_TYPE_FILE) == 0) 1089 ret = check_file(path, force, isspare); 1090 1091 return (ret); 1092 } 1093 1094 for (c = 0; c < children; c++) 1095 if ((ret = check_in_use(config, child[c], force, 1096 isreplacing, B_FALSE)) != 0) 1097 return (ret); 1098 1099 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, 1100 &child, &children) == 0) 1101 for (c = 0; c < children; c++) 1102 if ((ret = check_in_use(config, child[c], force, 1103 isreplacing, B_TRUE)) != 0) 1104 return (ret); 1105 1106 return (0); 1107 } 1108 1109 const char * 1110 is_grouping(const char *type, int *mindev) 1111 { 1112 if (strcmp(type, "raidz") == 0 || strcmp(type, "raidz1") == 0) { 1113 if (mindev != NULL) 1114 *mindev = 2; 1115 return (VDEV_TYPE_RAIDZ); 1116 } 1117 1118 if (strcmp(type, "raidz2") == 0) { 1119 if (mindev != NULL) 1120 *mindev = 3; 1121 return (VDEV_TYPE_RAIDZ); 1122 } 1123 1124 if (strcmp(type, "mirror") == 0) { 1125 if (mindev != NULL) 1126 *mindev = 2; 1127 return (VDEV_TYPE_MIRROR); 1128 } 1129 1130 if (strcmp(type, "spare") == 0) { 1131 if (mindev != NULL) 1132 *mindev = 1; 1133 return (VDEV_TYPE_SPARE); 1134 } 1135 1136 return (NULL); 1137 } 1138 1139 /* 1140 * Construct a syntactically valid vdev specification, 1141 * and ensure that all devices and files exist and can be opened. 1142 * Note: we don't bother freeing anything in the error paths 1143 * because the program is just going to exit anyway. 
1144 */ 1145 nvlist_t * 1146 construct_spec(int argc, char **argv) 1147 { 1148 nvlist_t *nvroot, *nv, **top, **spares; 1149 int t, toplevels, mindev, nspares; 1150 const char *type; 1151 1152 top = NULL; 1153 toplevels = 0; 1154 spares = NULL; 1155 nspares = 0; 1156 1157 while (argc > 0) { 1158 nv = NULL; 1159 1160 /* 1161 * If it's a mirror or raidz, the subsequent arguments are 1162 * its leaves -- until we encounter the next mirror or raidz. 1163 */ 1164 if ((type = is_grouping(argv[0], &mindev)) != NULL) { 1165 nvlist_t **child = NULL; 1166 int c, children = 0; 1167 1168 if (strcmp(type, VDEV_TYPE_SPARE) == 0 && 1169 spares != NULL) { 1170 (void) fprintf(stderr, gettext("invalid vdev " 1171 "specification: 'spare' can be " 1172 "specified only once\n")); 1173 return (NULL); 1174 } 1175 1176 for (c = 1; c < argc; c++) { 1177 if (is_grouping(argv[c], NULL) != NULL) 1178 break; 1179 children++; 1180 child = realloc(child, 1181 children * sizeof (nvlist_t *)); 1182 if (child == NULL) 1183 zpool_no_memory(); 1184 if ((nv = make_leaf_vdev(argv[c])) == NULL) 1185 return (NULL); 1186 child[children - 1] = nv; 1187 } 1188 1189 if (children < mindev) { 1190 (void) fprintf(stderr, gettext("invalid vdev " 1191 "specification: %s requires at least %d " 1192 "devices\n"), argv[0], mindev); 1193 return (NULL); 1194 } 1195 1196 argc -= c; 1197 argv += c; 1198 1199 if (strcmp(type, VDEV_TYPE_SPARE) == 0) { 1200 spares = child; 1201 nspares = children; 1202 continue; 1203 } else { 1204 verify(nvlist_alloc(&nv, NV_UNIQUE_NAME, 1205 0) == 0); 1206 verify(nvlist_add_string(nv, ZPOOL_CONFIG_TYPE, 1207 type) == 0); 1208 if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) { 1209 verify(nvlist_add_uint64(nv, 1210 ZPOOL_CONFIG_NPARITY, 1211 mindev - 1) == 0); 1212 } 1213 verify(nvlist_add_nvlist_array(nv, 1214 ZPOOL_CONFIG_CHILDREN, child, 1215 children) == 0); 1216 1217 for (c = 0; c < children; c++) 1218 nvlist_free(child[c]); 1219 free(child); 1220 } 1221 } else { 1222 /* 1223 * We have a 
device. Pass off to make_leaf_vdev() to 1224 * construct the appropriate nvlist describing the vdev. 1225 */ 1226 if ((nv = make_leaf_vdev(argv[0])) == NULL) 1227 return (NULL); 1228 argc--; 1229 argv++; 1230 } 1231 1232 toplevels++; 1233 top = realloc(top, toplevels * sizeof (nvlist_t *)); 1234 if (top == NULL) 1235 zpool_no_memory(); 1236 top[toplevels - 1] = nv; 1237 } 1238 1239 if (toplevels == 0 && nspares == 0) { 1240 (void) fprintf(stderr, gettext("invalid vdev " 1241 "specification: at least one toplevel vdev must be " 1242 "specified\n")); 1243 return (NULL); 1244 } 1245 1246 /* 1247 * Finally, create nvroot and add all top-level vdevs to it. 1248 */ 1249 verify(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) == 0); 1250 verify(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, 1251 VDEV_TYPE_ROOT) == 0); 1252 verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 1253 top, toplevels) == 0); 1254 if (nspares != 0) 1255 verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 1256 spares, nspares) == 0); 1257 1258 for (t = 0; t < toplevels; t++) 1259 nvlist_free(top[t]); 1260 for (t = 0; t < nspares; t++) 1261 nvlist_free(spares[t]); 1262 if (spares) 1263 free(spares); 1264 free(top); 1265 1266 return (nvroot); 1267 } 1268 1269 /* 1270 * Get and validate the contents of the given vdev specification. This ensures 1271 * that the nvlist returned is well-formed, that all the devices exist, and that 1272 * they are not currently in use by any other known consumer. The 'poolconfig' 1273 * parameter is the current configuration of the pool when adding devices 1274 * existing pool, and is used to perform additional checks, such as changing the 1275 * replication level of the pool. It can be 'NULL' to indicate that this is a 1276 * new pool. The 'force' flag controls whether devices should be forcefully 1277 * added, even if they appear in use. 
1278 */ 1279 nvlist_t * 1280 make_root_vdev(nvlist_t *poolconfig, int force, int check_rep, 1281 boolean_t isreplacing, int argc, char **argv) 1282 { 1283 nvlist_t *newroot; 1284 1285 is_force = force; 1286 1287 /* 1288 * Construct the vdev specification. If this is successful, we know 1289 * that we have a valid specification, and that all devices can be 1290 * opened. 1291 */ 1292 if ((newroot = construct_spec(argc, argv)) == NULL) 1293 return (NULL); 1294 1295 /* 1296 * Validate each device to make sure that its not shared with another 1297 * subsystem. We do this even if 'force' is set, because there are some 1298 * uses (such as a dedicated dump device) that even '-f' cannot 1299 * override. 1300 */ 1301 if (check_in_use(poolconfig, newroot, force, isreplacing, 1302 B_FALSE) != 0) { 1303 nvlist_free(newroot); 1304 return (NULL); 1305 } 1306 1307 /* 1308 * Check the replication level of the given vdevs and report any errors 1309 * found. We include the existing pool spec, if any, as we need to 1310 * catch changes against the existing replication level. 1311 */ 1312 if (check_rep && check_replication(poolconfig, newroot) != 0) { 1313 nvlist_free(newroot); 1314 return (NULL); 1315 } 1316 1317 /* 1318 * Run through the vdev specification and label any whole disks found. 1319 */ 1320 if (make_disks(newroot) != 0) { 1321 nvlist_free(newroot); 1322 return (NULL); 1323 } 1324 1325 return (newroot); 1326 } 1327